xref: /illumos-gate/usr/src/cmd/bhyve/task_switch.c (revision b210e77709da8e42dfe621e10ccf4be504206058)
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2020 Oxide Computer Company
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>

#include <x86/psl.h>
#include <x86/segments.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#include <vmmapi.h>

#include "bhyverun.h"
#include "debug.h"

/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
struct tss32 {
	uint16_t	tss_link;
	uint16_t	rsvd1;
	uint32_t	tss_esp0;
	uint16_t	tss_ss0;
	uint16_t	rsvd2;
	uint32_t	tss_esp1;
	uint16_t	tss_ss1;
	uint16_t	rsvd3;
	uint32_t	tss_esp2;
	uint16_t	tss_ss2;
	uint16_t	rsvd4;
	uint32_t	tss_cr3;
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	uint32_t	tss_eax;
	uint32_t	tss_ecx;
	uint32_t	tss_edx;
	uint32_t	tss_ebx;
	uint32_t	tss_esp;
	uint32_t	tss_ebp;
	uint32_t	tss_esi;
	uint32_t	tss_edi;
	uint16_t	tss_es;
	uint16_t	rsvd5;
	uint16_t	tss_cs;
	uint16_t	rsvd6;
	uint16_t	tss_ss;
	uint16_t	rsvd7;
	uint16_t	tss_ds;
	uint16_t	rsvd8;
	uint16_t	tss_fs;
	uint16_t	rsvd9;
	uint16_t	tss_gs;
	uint16_t	rsvd10;
	uint16_t	tss_ldt;
	uint16_t	rsvd11;
	uint16_t	tss_trap;
	uint16_t	tss_iomap;
};
static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed");

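/*
 * Helpers for working with segment selectors: SEL_START masks off the RPL
 * and TI bits to yield the byte offset of the descriptor within its table,
 * SEL_LIMIT is the offset of the last byte of that 8-byte descriptor, and
 * TSS_BUSY tests the busy bit in a TSS descriptor type.
 */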
#define	SEL_START(sel)	(((sel) & ~0x7))
#define	SEL_LIMIT(sel)	(((sel) | 0x7))
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)

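/*
 * Convenience wrappers around vm_get_register()/vm_set_register() that
 * abort on failure; register accesses made during task switch emulation
 * are not expected to fail.
 */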
static uint64_t
GETREG(struct vmctx *ctx, int vcpu, int reg)
{
	uint64_t val;
	int error;

	error = vm_get_register(ctx, vcpu, reg, &val);
	assert(error == 0);
	return (val);
}

static void
SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
	int error;

	error = vm_set_register(ctx, vcpu, reg, val);
	assert(error == 0);
}

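/*
 * Convert an in-memory segment descriptor into the 'struct seg_desc' layout
 * used by the vmm interface, expanding the limit to byte granularity when
 * the granularity bit is set.
 */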
static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
	struct seg_desc seg_desc;

	seg_desc.base = (u_int)USD_GETBASE(usd);
	if (usd->sd_gran)
		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
	else
		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
	seg_desc.access |= usd->sd_xx << 12;
	seg_desc.access |= usd->sd_def32 << 14;
	seg_desc.access |= usd->sd_gran << 15;

	return (seg_desc);
}

/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
static void
sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
{
	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	sel &= ~0x3;
	if (ext)
		sel |= 0x1;
	vm_inject_fault(ctx, vcpu, vector, 1, sel);
}

/*
 * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
 * and non-zero otherwise.
 */
static int
desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
{
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);

	if (reg == VM_REG_GUEST_LDTR) {
		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
			return (-1);
	}

	if (limit < SEL_LIMIT(sel))
		return (-1);
	else
		return (0);
}

/*
 * Read or write the segment descriptor 'desc' from/to the GDT/LDT slot
 * referenced by the selector 'sel'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, bool doread,
    int *faultptr)
{
	struct iovec iov[2];
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);
	assert(limit >= SEL_LIMIT(sel));

	error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel),
	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
	    faultptr);
	if (error || *faultptr)
		return (error);

	if (doread)
		vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
	else
		vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
	return (0);
}

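/* Convenience wrappers around desc_table_rw() for the read and write cases. */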
static int
desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr));
}

static int
desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr));
}

/*
 * Read the TSS descriptor referenced by 'sel' into 'desc'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	int error;

	assert(!ISLDT(sel));
	assert(IDXSEL(sel) != 0);

	/* Fetch the new TSS descriptor */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		if (ts->reason == TSR_IRET)
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		else
			sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext);
		return (1);
	}

	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr);
	return (error);
}

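/*
 * Predicates on the 5-bit descriptor type field, which combines the S bit
 * (0x10, set for code/data descriptors) with the 4-bit segment type: 0x08
 * distinguishes code from data, 0x04 is conforming (code) or expand-down
 * (data), and 0x02 is readable (code) or writable (data).
 */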
static bool
code_desc(int sd_type)
{
	/* code descriptor */
	return ((sd_type & 0x18) == 0x18);
}

static bool
stack_desc(int sd_type)
{
	/* writable data descriptor */
	return ((sd_type & 0x1A) == 0x12);
}

static bool
data_desc(int sd_type)
{
	/* data descriptor or a readable code descriptor */
	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}

static bool
ldt_desc(int sd_type)
{

	return (sd_type == SDT_SYSLDT);
}

/*
 * Validate the descriptor 'seg_desc' associated with 'segment'.
 */
static int
validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	struct user_segment_descriptor usd;
	int error, idtvec;
	int cpl, dpl, rpl;
	uint16_t sel, cs;
	bool ldtseg, codeseg, stackseg, dataseg, conforming;

	ldtseg = codeseg = stackseg = dataseg = false;
	switch (segment) {
	case VM_REG_GUEST_LDTR:
		ldtseg = true;
		break;
	case VM_REG_GUEST_CS:
		codeseg = true;
		break;
	case VM_REG_GUEST_SS:
		stackseg = true;
		break;
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
		dataseg = true;
		break;
	default:
		assert(0);
	}

	/* Get the segment selector */
	sel = GETREG(ctx, vcpu, segment);

	/* LDT selector must point into the GDT */
	if (ldtseg && ISLDT(sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Descriptor table limit check */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* NULL selector */
	if (IDXSEL(sel) == 0) {
		/* Code and stack segment selectors cannot be NULL */
		if (codeseg || stackseg) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
		seg_desc->base = 0;
		seg_desc->limit = 0;
		seg_desc->access = 0x10000;	/* unusable */
		return (0);
	}

	/* Read the descriptor from the GDT/LDT */
	sup_paging = ts->paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr);
	if (error || *faultptr)
		return (error);

	/* Verify that the descriptor type is compatible with the segment */
	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
	    (codeseg && !code_desc(usd.sd_type)) ||
	    (dataseg && !data_desc(usd.sd_type)) ||
	    (stackseg && !stack_desc(usd.sd_type))) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Segment must be marked present */
	if (!usd.sd_p) {
		if (ldtseg)
			idtvec = IDT_TS;
		else if (stackseg)
			idtvec = IDT_SS;
		else
			idtvec = IDT_NP;
		sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
		return (1);
	}

	cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	cpl = cs & SEL_RPL_MASK;
	rpl = sel & SEL_RPL_MASK;
	dpl = usd.sd_dpl;

	if (stackseg && (rpl != cpl || dpl != cpl)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	if (codeseg) {
		conforming = (usd.sd_type & 0x4) ? true : false;
		if ((conforming && (cpl < dpl)) ||
		    (!conforming && (cpl != dpl))) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}

	if (dataseg) {
		/*
		 * A data segment is always non-conforming except when its
		 * descriptor is a readable, conforming code segment.
		 */
		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
			conforming = true;
		else
			conforming = false;

		if (!conforming && (rpl > dpl || cpl > dpl)) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}
	*seg_desc = usd_to_seg_desc(&usd);
	return (0);
}

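/*
 * Save the vcpu's current register state into the 32-bit TSS of the
 * outgoing task and copy it back out to guest memory.
 */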
static void
tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
    uint32_t eip, struct tss32 *tss, struct iovec *iov)
{

	/* General purpose registers */
	tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
	tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
	tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
	tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
	tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
	tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
	tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);

	/* Segment selectors */
	tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
	tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
	tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
	tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
	tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);

	/* eflags and eip */
	tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	if (task_switch->reason == TSR_IRET)
		tss->tss_eflags &= ~PSL_NT;
	tss->tss_eip = eip;

	/* Copy updated old TSS into guest memory */
	vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
}

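/* Load the hidden descriptor state of a segment register on the vcpu. */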
static void
update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
{
	int error;

	error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
	assert(error == 0);
}

/*
 * Update the vcpu registers to reflect the state of the new task.
 */
static int
tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
{
	struct seg_desc seg_desc, seg_desc2;
	uint64_t *pdpte, maxphyaddr, reserved;
	uint32_t eflags;
	int error, i;
	bool nested;

	nested = false;
	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
		tss->tss_link = ot_sel;
		nested = true;
	}

	eflags = tss->tss_eflags;
	if (nested)
		eflags |= PSL_NT;

	/* LDTR */
	SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);

	/* PDBR (CR3) */
	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
			/*
			 * XXX Assuming 36-bit MAXPHYADDR.
			 */
			maxphyaddr = (1UL << 36) - 1;
			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
			for (i = 0; i < 4; i++) {
				/* Check reserved bits if the PDPTE is valid */
				if (!(pdpte[i] & 0x1))
					continue;
				/*
				 * Bits 2:1, 8:5 and bits above the processor's
				 * maximum physical address are reserved.
				 */
				reserved = ~maxphyaddr | 0x1E6;
				if (pdpte[i] & reserved) {
					vm_inject_gp(ctx, vcpu);
					return (1);
				}
			}
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
		}
		SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
		ts->paging.cr3 = tss->tss_cr3;
	}

	/* eflags and eip */
	SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
	SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

	/* General purpose registers */
	SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
	SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

	/* Segment selectors */
	SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
	SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
	SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
	SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
	SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
	SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);

	/*
	 * If this is a nested task then write out the new TSS to update
	 * the previous link field.
	 */
	if (nested)
		vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));

	/* Validate segment descriptors */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);

	/*
	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
	 *
	 * The SS and CS attribute checks on VM-entry are inter-dependent so
	 * we need to make sure that both segments are valid before updating
	 * either of them. This ensures that the VMCS state can pass the
	 * VM-entry checks so the guest can handle any exception injected
	 * during task switch emulation.
	 */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);

	return (0);
}


/*
 * Copy of vie_alignment_check() from vmm_instruction_emul.c
 */
static int
alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
{
	assert(size == 1 || size == 2 || size == 4 || size == 8);
	assert(cpl >= 0 && cpl <= 3);

	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
		return (0);

	return ((gla & (size - 1)) ? 1 : 0);
}

/*
 * Copy of vie_size2mask() from vmm_instruction_emul.c
 */
static uint64_t
size2mask(int size)
{
	switch (size) {
	case 1:
		return (0xff);
	case 2:
		return (0xffff);
	case 4:
		return (0xffffffff);
	case 8:
		return (0xffffffffffffffff);
	default:
		assert(0);
		/* not reached */
		return (0);
	}
}

/*
 * Copy of vie_calculate_gla() from vmm_instruction_emul.c
 */
static int
calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	assert(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS);
	assert((length == 1 || length == 2 || length == 4 || length == 8));
	assert((prot & ~(PROT_READ | PROT_WRITE)) == 0);

	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		assert(addrsize == 4 || addrsize == 8);
		glasize = 8;
	} else {
		assert(addrsize == 2 || addrsize == 4);
		glasize = 4;
		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		assert(SEG_DESC_PRESENT(desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		assert(type >= 16 && type <= 31);

		if (prot & PROT_READ) {
			/* #GP on a read access to an exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)			/* code segment */
				return (-1);

			if ((type & 0xA) == 0)		/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

		while (length > 0) {
			offset &= size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}

	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= size2mask(addrsize);
	*gla = (segbase + firstoff) & size2mask(glasize);
	return (0);
}

/*
 * Push an error code on the stack of the new task. This is needed if the
 * task switch was triggered by a hardware exception that causes an error
 * code to be saved (e.g. #PF).
 */
static int
push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode, int *faultptr)
{
	struct iovec iov[2];
	struct seg_desc seg_desc;
	int stacksize, bytes, error;
	uint64_t gla, cr0, rflags;
	uint32_t esp;
	uint16_t stacksel;

	*faultptr = 0;

	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);

	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base,
	    &seg_desc.limit, &seg_desc.access);
	assert(error == 0);

	/*
	 * Section "Error Code" in the Intel SDM vol 3: the error code is
	 * pushed on the stack as a doubleword or word (depending on the
	 * default interrupt, trap or task gate size).
	 */
	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
		bytes = 4;
	else
		bytes = 2;

	/*
	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
	 * stack-segment descriptor determines the size of the stack
	 * pointer outside of 64-bit mode.
	 */
	if (SEG_DESC_DEF32(seg_desc.access))
		stacksize = 4;
	else
		stacksize = 2;

	esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	esp -= bytes;

	if (calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
		sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
		*faultptr = 1;
		return (0);
	}

	if (alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
		vm_inject_ac(ctx, vcpu, 1);
		*faultptr = 1;
		return (0);
	}

	error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
	    iov, nitems(iov), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyout(ctx, vcpu, &errcode, iov, bytes);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp);
	return (0);
}

/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 */
#define	CHKERR(error, fault)						\
	do {								\
		assert((error == 0) || (error == EFAULT));		\
		if (error)						\
			return (VMEXIT_ABORT);				\
		else if (fault)						\
			return (VMEXIT_CONTINUE);			\
	} while (0)

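/*
 * Emulate a task switch that caused a VM-exit, following the checks
 * described in the "Task Management" chapter of the Intel SDM, Vol 3.
 */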
int
vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
	struct seg_desc nt;
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, fault, minlimit, nt_type, ot_type, vcpu;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;
	vcpu = *pvcpu;

	assert(paging->cpu_mode == CPU_MODE_PROTECTED);

	/*
	 * Calculate the instruction pointer to store in the old TSS.
	 */
	eip = vmexit->rip + vmexit->inst_length;

	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */

	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc,
	    &fault);
	CHKERR(error, fault);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
		minlimit = 104 - 1;
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
		minlimit = 44 - 1;
	else
		minlimit = 0;

	assert(minlimit > 0);
	if (nt.limit < (unsigned int)minlimit) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
		goto done;
	}

	/* Fetch the new TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);

	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR. In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
		goto done;
	}

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	    &access);
	assert(error == 0);
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc,
	    &fault);
	CHKERR(error, fault);

	/* Get the old TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);

	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * is due to an IRET or JMP instruction.
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
		    &ot_desc, &fault);
		CHKERR(error, fault);
	}

	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		EPRINTLN("Task switch to 16-bit TSS not supported");
		return (VMEXIT_ABORT);
	}

	/* Save processor state in old TSS */
	tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);

	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
		    &nt_desc, &fault);
		CHKERR(error, fault);
	}

	/* Update task register to point at the new TSS */
	SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);

	/* Update the hidden descriptor state of the task register */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);

	/* Set CR0.TS */
	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);

	/*
	 * We are now committed to the task switch. Any exceptions encountered
	 * after this point will be handled in the context of the new task and
	 * the saved instruction pointer will belong to the new task.
	 */
	error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
	assert(error == 0);

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
	    &fault);
	CHKERR(error, fault);

	/*
	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
	 * caused an error code to be generated, this error code is copied
	 * to the stack of the new task.
	 */
	if (task_switch->errcode_valid) {
		assert(task_switch->ext);
		assert(task_switch->reason == TSR_IDT_GATE);
		error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
		    task_switch->errcode, &fault);
		CHKERR(error, fault);
	}

	/*
	 * Treatment of virtual-NMI blocking if NMI is delivered through
	 * a task gate.
	 *
	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol 3:
	 * If the virtual NMIs VM-execution control is 1, VM entry injects
	 * an NMI, and delivery of the NMI causes a task switch that causes
	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
	 * commences.
	 *
	 * Thus, virtual-NMI blocking is in effect at the time of the task
	 * switch VM exit.
	 */

	/*
	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
	 *
	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation":
	 * If the "virtual NMIs" control is 1, IRET removes any virtual-NMI
	 * blocking. This unblocking of virtual-NMI occurs even if IRET causes
	 * a fault.
	 *
	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
	 * VM exit.
	 */

	/*
	 * If the task switch was triggered by an event delivered through
	 * the IDT then extinguish the pending event from the vcpu's
	 * exitintinfo.
	 */
	if (task_switch->reason == TSR_IDT_GATE) {
		error = vm_set_intinfo(ctx, vcpu, 0);
		assert(error == 0);
	}

	/*
	 * XXX should inject debug exception if 'T' bit is 1
	 */
done:
	return (VMEXIT_CONTINUE);
}