xref: /titanic_50/usr/src/uts/intel/ia32/os/sysi86.c (revision 1f041b1785d05ef9863b007d3807833c3609391d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1990, 1991 UNIX System Laboratories, Inc.	*/
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*	Copyright (c) 1987, 1988 Microsoft Corporation	*/
31 /*	  All Rights Reserved	*/
32 
33 #pragma ident	"%Z%%M%	%I%	%E% SMI"
34 
35 #include <sys/param.h>
36 #include <sys/types.h>
37 #include <sys/sysmacros.h>
38 #include <sys/systm.h>
39 #include <sys/signal.h>
40 #include <sys/errno.h>
41 #include <sys/fault.h>
42 #include <sys/syscall.h>
43 #include <sys/cpuvar.h>
44 #include <sys/sysi86.h>
45 #include <sys/psw.h>
46 #include <sys/cred.h>
47 #include <sys/policy.h>
48 #include <sys/thread.h>
49 #include <sys/debug.h>
50 #include <sys/ontrap.h>
51 #include <sys/privregs.h>
52 #include <sys/x86_archext.h>
53 #include <sys/vmem.h>
54 #include <sys/kmem.h>
55 #include <sys/mman.h>
56 #include <sys/archsystm.h>
57 #include <vm/hat.h>
58 #include <vm/as.h>
59 #include <vm/seg.h>
60 #include <vm/seg_kmem.h>
61 #include <vm/faultcode.h>
62 #include <sys/fp.h>
63 #include <sys/cmn_err.h>
64 #include <sys/segments.h>
65 #include <sys/clock.h>
66 #if defined(__xpv)
67 #include <sys/hypervisor.h>
68 #include <sys/note.h>
69 #endif
70 
71 static void ldt_alloc(proc_t *, uint_t);
72 static void ldt_free(proc_t *);
73 static void ldt_dup(proc_t *, proc_t *);
74 static void ldt_grow(proc_t *, uint_t);
75 
76 /*
77  * sysi86 System Call
78  */
79 
80 /* ARGSUSED */
81 int
82 sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
83 {
84 	struct ssd ssd;
85 	int error = 0;
86 	int c;
87 	proc_t *pp = curproc;
88 
89 	switch (cmd) {
90 
91 	/*
92 	 * The SI86V86 subsystem call of the SYSI86 system call
93 	 * supports only one subcode -- V86SC_IOPL.
94 	 */
95 	case SI86V86:
96 		if (arg1 == V86SC_IOPL) {
97 			struct regs *rp = lwptoregs(ttolwp(curthread));
98 			greg_t oldpl = rp->r_ps & PS_IOPL;
99 			greg_t newpl = arg2 & PS_IOPL;
100 
101 			/*
102 			 * Must be privileged to run this system call
103 			 * if giving more io privilege.
104 			 */
105 			if (newpl > oldpl && (error =
106 			    secpolicy_sys_config(CRED(), B_FALSE)) != 0)
107 				return (set_errno(error));
108 #if defined(__xpv)
109 			kpreempt_disable();
110 			installctx(curthread, NULL, xen_disable_user_iopl,
111 			    xen_enable_user_iopl, NULL, NULL,
112 			    xen_disable_user_iopl, NULL);
113 			xen_enable_user_iopl();
114 			kpreempt_enable();
115 #else
116 			rp->r_ps ^= oldpl ^ newpl;
117 #endif
118 		} else
119 			error = EINVAL;
120 		break;
121 
122 	/*
123 	 * Set a segment descriptor
124 	 */
125 	case SI86DSCR:
126 		/*
127 		 * There are considerable problems here manipulating
128 		 * resources shared by many running lwps.  Get everyone
129 		 * into a safe state before changing the LDT.
130 		 */
131 		if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) {
132 			error = EINTR;
133 			break;
134 		}
135 
136 		if (get_udatamodel() == DATAMODEL_LP64) {
137 			error = EINVAL;
138 			break;
139 		}
140 
141 		if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) {
142 			error = EFAULT;
143 			break;
144 		}
145 
146 		error = setdscr(&ssd);
147 
148 		mutex_enter(&pp->p_lock);
149 		if (curthread != pp->p_agenttp)
150 			continuelwps(pp);
151 		mutex_exit(&pp->p_lock);
152 		break;
153 
154 	case SI86FPHW:
155 		c = fp_kind & 0xff;
156 		if (suword32((void *)arg1, c) == -1)
157 			error = EFAULT;
158 		break;
159 
160 	case SI86FPSTART:
161 		/*
162 		 * arg1 is the address of _fp_hw
163 		 * arg2 is the desired x87 FCW value
164 		 * arg3 is the desired SSE MXCSR value
165 		 * a return value of one means SSE hardware, else none.
166 		 */
167 		c = fp_kind & 0xff;
168 		if (suword32((void *)arg1, c) == -1) {
169 			error = EFAULT;
170 			break;
171 		}
172 		fpsetcw((uint16_t)arg2, (uint32_t)arg3);
173 		return (fp_kind == __FP_SSE ? 1 : 0);
174 
175 	/* real time clock management commands */
176 
177 	case WTODC:
178 		if ((error = secpolicy_settime(CRED())) == 0) {
179 			timestruc_t ts;
180 			mutex_enter(&tod_lock);
181 			gethrestime(&ts);
182 			tod_set(ts);
183 			mutex_exit(&tod_lock);
184 		}
185 		break;
186 
187 /* Give some timezone playing room */
188 #define	ONEWEEK	(7 * 24 * 60 * 60)
189 
190 	case SGMTL:
191 		/*
192 		 * Called from 32 bit land, negative values
193 		 * are not sign extended, so we do that here
194 		 * by casting it to an int and back.  We also
195 		 * clamp the value to within reason and detect
196 		 * when a 64 bit call overflows an int.
197 		 */
198 		if ((error = secpolicy_settime(CRED())) == 0) {
199 			int newlag = (int)arg1;
200 
201 #ifdef _SYSCALL32_IMPL
202 			if (get_udatamodel() == DATAMODEL_NATIVE &&
203 			    (long)newlag != (long)arg1) {
204 				error = EOVERFLOW;
205 			} else
206 #endif
207 			if (newlag >= -ONEWEEK && newlag <= ONEWEEK)
208 				sgmtl(newlag);
209 			else
210 				error = EOVERFLOW;
211 		}
212 		break;
213 
214 	case GGMTL:
215 		if (get_udatamodel() == DATAMODEL_NATIVE) {
216 			if (sulword((void *)arg1, ggmtl()) == -1)
217 				error = EFAULT;
218 #ifdef _SYSCALL32_IMPL
219 		} else {
220 			time_t gmtl;
221 
222 			if ((gmtl = ggmtl()) > INT32_MAX) {
223 				/*
224 				 * Since gmt_lag can at most be
225 				 * +/- 12 hours, something is
226 				 * *seriously* messed up here.
227 				 */
228 				error = EOVERFLOW;
229 			} else if (suword32((void *)arg1, (int32_t)gmtl) == -1)
230 				error = EFAULT;
231 #endif
232 		}
233 		break;
234 
235 	case RTCSYNC:
236 		if ((error = secpolicy_settime(CRED())) == 0)
237 			rtcsync();
238 		break;
239 
240 	/* END OF real time clock management commands */
241 
242 	default:
243 		error = EINVAL;
244 		break;
245 	}
246 	return (error == 0 ? 0 : set_errno(error));
247 }
248 
249 void
250 usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel)
251 {
252 	ssd->bo = USEGD_GETBASE(usd);
253 	ssd->ls = USEGD_GETLIMIT(usd);
254 	ssd->sel = sel;
255 
256 	/*
257 	 * set type, dpl and present bits.
258 	 */
259 	ssd->acc1 = usd->usd_type;
260 	ssd->acc1 |= usd->usd_dpl << 5;
261 	ssd->acc1 |= usd->usd_p << (5 + 2);
262 
263 	/*
264 	 * set avl, DB and granularity bits.
265 	 */
266 	ssd->acc2 = usd->usd_avl;
267 
268 #if defined(__amd64)
269 	ssd->acc2 |= usd->usd_long << 1;
270 #else
271 	ssd->acc2 |= usd->usd_reserved << 1;
272 #endif
273 
274 	ssd->acc2 |= usd->usd_def32 << (1 + 1);
275 	ssd->acc2 |= usd->usd_gran << (1 + 1 + 1);
276 }
277 
278 static void
279 ssd_to_usd(struct ssd *ssd, user_desc_t *usd)
280 {
281 
282 	ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0);
283 
284 	USEGD_SETBASE(usd, ssd->bo);
285 	USEGD_SETLIMIT(usd, ssd->ls);
286 
287 	/*
288 	 * set type, dpl and present bits.
289 	 */
290 	usd->usd_type = ssd->acc1;
291 	usd->usd_dpl = ssd->acc1 >> 5;
292 	usd->usd_p = ssd->acc1 >> (5 + 2);
293 
294 	ASSERT(usd->usd_type >= SDT_MEMRO);
295 	ASSERT(usd->usd_dpl == SEL_UPL);
296 
297 	/*
298 	 * 64-bit code selectors are never allowed in the LDT.
299 	 * Reserved bit is always 0 on 32-bit sytems.
300 	 */
301 #if defined(__amd64)
302 	usd->usd_long = 0;
303 #else
304 	usd->usd_reserved = 0;
305 #endif
306 
307 	/*
308 	 * set avl, DB and granularity bits.
309 	 */
310 	usd->usd_avl = ssd->acc2;
311 	usd->usd_def32 = ssd->acc2 >> (1 + 1);
312 	usd->usd_gran = ssd->acc2 >> (1 + 1 + 1);
313 }
314 
315 
316 #if defined(__i386)
317 
318 static void
319 ssd_to_sgd(struct ssd *ssd, gate_desc_t *sgd)
320 {
321 
322 	ASSERT(bcmp(sgd, &null_sdesc, sizeof (*sgd)) == 0);
323 
324 	sgd->sgd_looffset = ssd->bo;
325 	sgd->sgd_hioffset = ssd->bo >> 16;
326 
327 	sgd->sgd_selector = ssd->ls;
328 
329 	/*
330 	 * set type, dpl and present bits.
331 	 */
332 	sgd->sgd_type = ssd->acc1;
333 	sgd->sgd_dpl = ssd->acc1 >> 5;
334 	sgd->sgd_p = ssd->acc1 >> 7;
335 	ASSERT(sgd->sgd_type == SDT_SYSCGT);
336 	ASSERT(sgd->sgd_dpl == SEL_UPL);
337 	sgd->sgd_stkcpy = 0;
338 }
339 
340 #endif	/* __i386 */
341 
342 /*
343  * Load LDT register with the current process's LDT.
344  */
345 static void
346 ldt_load(void)
347 {
348 #if defined(__xpv)
349 	xen_set_ldt(get_ssd_base(&curproc->p_ldt_desc),
350 	    curproc->p_ldtlimit + 1);
351 #else
352 	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = curproc->p_ldt_desc;
353 	wr_ldtr(ULDT_SEL);
354 #endif
355 }
356 
357 /*
358  * Store a NULL selector in the LDTR. All subsequent illegal references to
359  * the LDT will result in a #gp.
360  */
361 void
362 ldt_unload(void)
363 {
364 #if defined(__xpv)
365 	xen_set_ldt(NULL, 0);
366 #else
367 	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc;
368 	wr_ldtr(0);
369 #endif
370 }
371 
372 /*ARGSUSED*/
373 static void
374 ldt_savectx(proc_t *p)
375 {
376 	ASSERT(p->p_ldt != NULL);
377 	ASSERT(p == curproc);
378 
379 #if defined(__amd64)
380 	/*
381 	 * The 64-bit kernel must be sure to clear any stale ldt
382 	 * selectors when context switching away from a process that
383 	 * has a private ldt. Consider the following example:
384 	 *
385 	 * 	Wine creats a ldt descriptor and points a segment register
386 	 * 	to it.
387 	 *
388 	 *	We then context switch away from wine lwp to kernel
389 	 *	thread and hit breakpoint in kernel with kmdb
390 	 *
391 	 *	When we continue and resume from kmdb we will #gp
392 	 * 	fault since kmdb will have saved the stale ldt selector
393 	 *	from wine and will try to restore it but we are no longer in
394 	 *	the context of the wine process and do not have our
395 	 *	ldtr register pointing to the private ldt.
396 	 */
397 	reset_sregs();
398 #endif
399 
400 	ldt_unload();
401 	cpu_fast_syscall_enable(NULL);
402 }
403 
404 static void
405 ldt_restorectx(proc_t *p)
406 {
407 	ASSERT(p->p_ldt != NULL);
408 	ASSERT(p == curproc);
409 
410 	ldt_load();
411 	cpu_fast_syscall_disable(NULL);
412 }
413 
414 /*
415  * When a process with a private LDT execs, fast syscalls must be enabled for
416  * the new process image.
417  */
418 /* ARGSUSED */
419 static void
420 ldt_freectx(proc_t *p, int isexec)
421 {
422 	ASSERT(p->p_ldt);
423 
424 	if (isexec) {
425 		kpreempt_disable();
426 		cpu_fast_syscall_enable(NULL);
427 		kpreempt_enable();
428 	}
429 
430 	/*
431 	 * ldt_free() will free the memory used by the private LDT, reset the
432 	 * process's descriptor, and re-program the LDTR.
433 	 */
434 	ldt_free(p);
435 }
436 
437 /*
438  * Install ctx op that ensures syscall/sysenter are disabled.
439  * See comments below.
440  *
441  * When a thread with a private LDT forks, the new process
442  * must have the LDT context ops installed.
443  */
444 /* ARGSUSED */
445 static void
446 ldt_installctx(proc_t *p, proc_t *cp)
447 {
448 	proc_t		*targ = p;
449 	kthread_t	*t;
450 
451 	/*
452 	 * If this is a fork, operate on the child process.
453 	 */
454 	if (cp != NULL) {
455 		targ = cp;
456 		ldt_dup(p, cp);
457 	}
458 
459 	/*
460 	 * The process context ops expect the target process as their argument.
461 	 */
462 	ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx,
463 	    ldt_installctx, ldt_savectx, ldt_freectx) == 0);
464 
465 	installpctx(targ, targ, ldt_savectx, ldt_restorectx,
466 	    ldt_installctx, ldt_savectx, ldt_freectx);
467 
468 	/*
469 	 * We've just disabled fast system call and return instructions; take
470 	 * the slow path out to make sure we don't try to use one to return
471 	 * back to user. We must set t_post_sys for every thread in the
472 	 * process to make sure none of them escape out via fast return.
473 	 */
474 
475 	mutex_enter(&targ->p_lock);
476 	t = targ->p_tlist;
477 	do {
478 		t->t_post_sys = 1;
479 	} while ((t = t->t_forw) != targ->p_tlist);
480 	mutex_exit(&targ->p_lock);
481 }
482 
483 int
484 setdscr(struct ssd *ssd)
485 {
486 	ushort_t seli; 		/* selector index */
487 	user_desc_t *ldp;	/* descriptor pointer */
488 	user_desc_t ndesc;	/* new descriptor */
489 	proc_t	*pp = ttoproc(curthread);
490 	int	rc = 0;
491 
492 	/*
493 	 * LDT segments: executable and data at DPL 3 only.
494 	 */
495 	if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel))
496 		return (EINVAL);
497 
498 	/*
499 	 * check the selector index.
500 	 */
501 	seli = SELTOIDX(ssd->sel);
502 	if (seli >= MAXNLDT || seli < LDT_UDBASE)
503 		return (EINVAL);
504 
505 	ndesc = null_udesc;
506 	mutex_enter(&pp->p_ldtlock);
507 
508 	/*
509 	 * If this is the first time for this process then setup a
510 	 * private LDT for it.
511 	 */
512 	if (pp->p_ldt == NULL) {
513 		ldt_alloc(pp, seli);
514 
515 		/*
516 		 * Now that this process has a private LDT, the use of
517 		 * the syscall/sysret and sysenter/sysexit instructions
518 		 * is forbidden for this processes because they destroy
519 		 * the contents of %cs and %ss segment registers.
520 		 *
521 		 * Explicity disable them here and add a context handler
522 		 * to the process. Note that disabling
523 		 * them here means we can't use sysret or sysexit on
524 		 * the way out of this system call - so we force this
525 		 * thread to take the slow path (which doesn't make use
526 		 * of sysenter or sysexit) back out.
527 		 */
528 		kpreempt_disable();
529 		ldt_installctx(pp, NULL);
530 		cpu_fast_syscall_disable(NULL);
531 		ASSERT(curthread->t_post_sys != 0);
532 		kpreempt_enable();
533 
534 	} else if (seli > pp->p_ldtlimit) {
535 
536 		/*
537 		 * Increase size of ldt to include seli.
538 		 */
539 		ldt_grow(pp, seli);
540 	}
541 
542 	ASSERT(seli <= pp->p_ldtlimit);
543 	ldp = &pp->p_ldt[seli];
544 
545 	/*
546 	 * On the 64-bit kernel, this is where things get more subtle.
547 	 * Recall that in the 64-bit kernel, when we enter the kernel we
548 	 * deliberately -don't- reload the segment selectors we came in on
549 	 * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
550 	 * and the underlying descriptors are essentially ignored by the
551 	 * hardware in long mode - except for the base that we override with
552 	 * the gsbase MSRs.
553 	 *
554 	 * However, there's one unfortunate issue with this rosy picture --
555 	 * a descriptor that's not marked as 'present' will still generate
556 	 * an #np when loading a segment register.
557 	 *
558 	 * Consider this case.  An lwp creates a harmless LDT entry, points
559 	 * one of it's segment registers at it, then tells the kernel (here)
560 	 * to delete it.  In the 32-bit kernel, the #np will happen on the
561 	 * way back to userland where we reload the segment registers, and be
562 	 * handled in kern_gpfault().  In the 64-bit kernel, the same thing
563 	 * will happen in the normal case too.  However, if we're trying to
564 	 * use a debugger that wants to save and restore the segment registers,
565 	 * and the debugger things that we have valid segment registers, we
566 	 * have the problem that the debugger will try and restore the
567 	 * segment register that points at the now 'not present' descriptor
568 	 * and will take a #np right there.
569 	 *
570 	 * We should obviously fix the debugger to be paranoid about
571 	 * -not- restoring segment registers that point to bad descriptors;
572 	 * however we can prevent the problem here if we check to see if any
573 	 * of the segment registers are still pointing at the thing we're
574 	 * destroying; if they are, return an error instead. (That also seems
575 	 * a lot better failure mode than SIGKILL and a core file
576 	 * from kern_gpfault() too.)
577 	 */
578 	if (SI86SSD_PRES(ssd) == 0) {
579 		kthread_t *t;
580 		int bad = 0;
581 
582 		/*
583 		 * Look carefully at the segment registers of every lwp
584 		 * in the process (they're all stopped by our caller).
585 		 * If we're about to invalidate a descriptor that's still
586 		 * being referenced by *any* of them, return an error,
587 		 * rather than having them #gp on their way out of the kernel.
588 		 */
589 		ASSERT(pp->p_lwprcnt == 1);
590 
591 		mutex_enter(&pp->p_lock);
592 		t = pp->p_tlist;
593 		do {
594 			klwp_t *lwp = ttolwp(t);
595 			struct regs *rp = lwp->lwp_regs;
596 #if defined(__amd64)
597 			pcb_t *pcb = &lwp->lwp_pcb;
598 #endif
599 
600 			if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) {
601 				bad = 1;
602 				break;
603 			}
604 
605 #if defined(__amd64)
606 			if (pcb->pcb_rupdate == 1) {
607 				if (ssd->sel == pcb->pcb_ds ||
608 				    ssd->sel == pcb->pcb_es ||
609 				    ssd->sel == pcb->pcb_fs ||
610 				    ssd->sel == pcb->pcb_gs) {
611 					bad = 1;
612 					break;
613 				}
614 			} else
615 #endif
616 			{
617 				if (ssd->sel == rp->r_ds ||
618 				    ssd->sel == rp->r_es ||
619 				    ssd->sel == rp->r_fs ||
620 				    ssd->sel == rp->r_gs) {
621 					bad = 1;
622 					break;
623 				}
624 			}
625 
626 		} while ((t = t->t_forw) != pp->p_tlist);
627 		mutex_exit(&pp->p_lock);
628 
629 		if (bad) {
630 			mutex_exit(&pp->p_ldtlock);
631 			return (EBUSY);
632 		}
633 	}
634 
635 	/*
636 	 * If acc1 is zero, clear the descriptor (including the 'present' bit)
637 	 */
638 	if (ssd->acc1 == 0) {
639 		rc  = ldt_update_segd(ldp, &null_udesc);
640 		mutex_exit(&pp->p_ldtlock);
641 		return (rc);
642 	}
643 
644 	/*
645 	 * Check segment type, allow segment not present and
646 	 * only user DPL (3).
647 	 */
648 	if (SI86SSD_DPL(ssd) != SEL_UPL) {
649 		mutex_exit(&pp->p_ldtlock);
650 		return (EINVAL);
651 	}
652 
653 #if defined(__amd64)
654 	/*
655 	 * Do not allow 32-bit applications to create 64-bit mode code
656 	 * segments.
657 	 */
658 	if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 &&
659 	    SI86SSD_ISLONG(ssd)) {
660 		mutex_exit(&pp->p_ldtlock);
661 		return (EINVAL);
662 	}
663 #endif /* __amd64 */
664 
665 	/*
666 	 * Set up a code or data user segment descriptor.
667 	 */
668 	if (SI86SSD_ISUSEG(ssd)) {
669 		ssd_to_usd(ssd, &ndesc);
670 		rc = ldt_update_segd(ldp, &ndesc);
671 		mutex_exit(&pp->p_ldtlock);
672 		return (rc);
673 	}
674 
675 #if defined(__i386)
676 	/*
677 	 * Allow a call gate only if the destination is in the LDT
678 	 * and the system is running in 32-bit legacy mode.
679 	 *
680 	 * In long mode 32-bit call gates are redefined as 64-bit call
681 	 * gates and the hw enforces that the target code selector
682 	 * of the call gate must be 64-bit selector. A #gp fault is
683 	 * generated if otherwise. Since we do not allow 32-bit processes
684 	 * to switch themselves to 64-bits we never allow call gates
685 	 * on 64-bit system system.
686 	 */
687 	if (SI86SSD_TYPE(ssd) == SDT_SYSCGT && SELISLDT(ssd->ls)) {
688 
689 
690 		ssd_to_sgd(ssd, (gate_desc_t *)&ndesc);
691 		rc = ldt_update_segd(ldp, &ndesc);
692 		mutex_exit(&pp->p_ldtlock);
693 		return (rc);
694 	}
695 #endif	/* __i386 */
696 
697 	mutex_exit(&pp->p_ldtlock);
698 	return (EINVAL);
699 }
700 
701 /*
702  * Allocate new LDT for process just large enough to contain seli.
703  * Note we allocate and grow LDT in PAGESIZE chunks. We do this
704  * to simplify the implementation and because on the hypervisor it's
705  * required, since the LDT must live on pages that have PROT_WRITE
706  * removed and which are given to the hypervisor.
707  */
708 static void
709 ldt_alloc(proc_t *pp, uint_t seli)
710 {
711 	user_desc_t	*ldt;
712 	size_t		ldtsz;
713 	uint_t		nsels;
714 
715 	ASSERT(MUTEX_HELD(&pp->p_ldtlock));
716 	ASSERT(pp->p_ldt == NULL);
717 	ASSERT(pp->p_ldtlimit == 0);
718 
719 	/*
720 	 * Allocate new LDT just large enough to contain seli.
721 	 */
722 	ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
723 	nsels = ldtsz / sizeof (user_desc_t);
724 	ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
725 
726 	ldt = kmem_zalloc(ldtsz, KM_SLEEP);
727 	ASSERT(IS_P2ALIGNED(ldt, PAGESIZE));
728 
729 #if defined(__xpv)
730 	if (xen_ldt_setprot(ldt, ldtsz, PROT_READ))
731 		panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed");
732 #endif
733 
734 	pp->p_ldt = ldt;
735 	pp->p_ldtlimit = nsels - 1;
736 	set_syssegd(&pp->p_ldt_desc, ldt, ldtsz - 1, SDT_SYSLDT, SEL_KPL);
737 
738 	if (pp == curproc) {
739 		kpreempt_disable();
740 		ldt_load();
741 		kpreempt_enable();
742 	}
743 }
744 
745 static void
746 ldt_free(proc_t *pp)
747 {
748 	user_desc_t	*ldt;
749 	size_t		ldtsz;
750 
751 	ASSERT(pp->p_ldt != NULL);
752 
753 	mutex_enter(&pp->p_ldtlock);
754 	ldt = pp->p_ldt;
755 	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
756 
757 	ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE));
758 
759 	pp->p_ldt = NULL;
760 	pp->p_ldtlimit = 0;
761 	pp->p_ldt_desc = null_sdesc;
762 	mutex_exit(&pp->p_ldtlock);
763 
764 	if (pp == curproc) {
765 		kpreempt_disable();
766 		ldt_unload();
767 		kpreempt_enable();
768 	}
769 
770 #if defined(__xpv)
771 	/*
772 	 * We are not allowed to make the ldt writable until after
773 	 * we tell the hypervisor to unload it.
774 	 */
775 	if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE))
776 		panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
777 #endif
778 
779 	kmem_free(ldt, ldtsz);
780 }
781 
782 /*
783  * On fork copy new ldt for child.
784  */
785 static void
786 ldt_dup(proc_t *pp, proc_t *cp)
787 {
788 	size_t	ldtsz;
789 
790 	ASSERT(pp->p_ldt != NULL);
791 	ASSERT(cp != curproc);
792 
793 	/*
794 	 * I assume the parent's ldt can't increase since we're in a fork.
795 	 */
796 	mutex_enter(&pp->p_ldtlock);
797 	mutex_enter(&cp->p_ldtlock);
798 
799 	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
800 
801 	ldt_alloc(cp, pp->p_ldtlimit);
802 
803 #if defined(__xpv)
804 	/*
805 	 * Make child's ldt writable so it can be copied into from
806 	 * parent's ldt. This works since ldt_alloc above did not load
807 	 * the ldt since its for the child process. If we tried to make
808 	 * an LDT writable that is loaded in hw the setprot operation
809 	 * would fail.
810 	 */
811 	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE))
812 		panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
813 #endif
814 
815 	bcopy(pp->p_ldt, cp->p_ldt, ldtsz);
816 
817 #if defined(__xpv)
818 	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ))
819 		panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed");
820 #endif
821 	mutex_exit(&cp->p_ldtlock);
822 	mutex_exit(&pp->p_ldtlock);
823 
824 }
825 
826 static void
827 ldt_grow(proc_t *pp, uint_t seli)
828 {
829 	user_desc_t	*oldt, *nldt;
830 	uint_t		nsels;
831 	size_t		oldtsz, nldtsz;
832 
833 	ASSERT(MUTEX_HELD(&pp->p_ldtlock));
834 	ASSERT(pp->p_ldt != NULL);
835 	ASSERT(pp->p_ldtlimit != 0);
836 
837 	/*
838 	 * Allocate larger LDT just large enough to contain seli.
839 	 */
840 	nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
841 	nsels = nldtsz / sizeof (user_desc_t);
842 	ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
843 	ASSERT(nsels > pp->p_ldtlimit);
844 
845 	oldt = pp->p_ldt;
846 	oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
847 
848 	nldt = kmem_zalloc(nldtsz, KM_SLEEP);
849 	ASSERT(IS_P2ALIGNED(nldt, PAGESIZE));
850 
851 	bcopy(oldt, nldt, oldtsz);
852 
853 	/*
854 	 * unload old ldt.
855 	 */
856 	kpreempt_disable();
857 	ldt_unload();
858 	kpreempt_enable();
859 
860 #if defined(__xpv)
861 
862 	/*
863 	 * Make old ldt writable and new ldt read only.
864 	 */
865 	if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
866 		panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
867 
868 	if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
869 		panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
870 #endif
871 
872 	pp->p_ldt = nldt;
873 	pp->p_ldtlimit = nsels - 1;
874 
875 	/*
876 	 * write new ldt segment descriptor.
877 	 */
878 	set_syssegd(&pp->p_ldt_desc, nldt, nldtsz - 1, SDT_SYSLDT, SEL_KPL);
879 
880 	/*
881 	 * load the new ldt.
882 	 */
883 	kpreempt_disable();
884 	ldt_load();
885 	kpreempt_enable();
886 
887 	kmem_free(oldt, oldtsz);
888 }
889