xref: /freebsd/usr.sbin/bhyve/amd64/task_switch.c (revision 1ca63a8219b88b752b064d19bd3428c61dbcf1f9)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
31 #include <sys/_iovec.h>
32 #include <sys/mman.h>
33 
34 #include <x86/psl.h>
35 #include <x86/specialreg.h>
36 #include <machine/vmm.h>
37 #include <machine/vmm_instruction_emul.h>
38 
39 #include <assert.h>
40 #include <errno.h>
41 #include <stdbool.h>
42 #include <stdio.h>
43 #include <stdlib.h>
44 
45 #include <vmmapi.h>
46 
47 #include "bhyverun.h"
48 #include "debug.h"
49 
/*
 * Layout of a 32-bit task state segment as it is copied to and from guest
 * memory.
 *
 * 'struct i386tss' is intentionally avoided: every one of its fields is a
 * signed integer, which causes a myriad of sign extension issues.
 */
struct tss32 {
	uint16_t	tss_link, rsvd1;	/* link to the previous task */
	uint32_t	tss_esp0;
	uint16_t	tss_ss0, rsvd2;
	uint32_t	tss_esp1;
	uint16_t	tss_ss1, rsvd3;
	uint32_t	tss_esp2;
	uint16_t	tss_ss2, rsvd4;
	uint32_t	tss_cr3;
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	uint32_t	tss_eax, tss_ecx, tss_edx, tss_ebx;
	uint32_t	tss_esp, tss_ebp, tss_esi, tss_edi;
	uint16_t	tss_es, rsvd5;
	uint16_t	tss_cs, rsvd6;
	uint16_t	tss_ss, rsvd7;
	uint16_t	tss_ds, rsvd8;
	uint16_t	tss_fs, rsvd9;
	uint16_t	tss_gs, rsvd10;
	uint16_t	tss_ldt, rsvd11;
	uint16_t	tss_trap, tss_iomap;
};
/* The structure must stay exactly 104 bytes; catch accidental changes. */
static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed");
95 
/*
 * SEL_START() clears and SEL_LIMIT() sets the low three bits of a selector,
 * giving the offset of the first and of the last byte of the descriptor
 * table slot that the selector names.
 */
#define	SEL_START(sel)	((sel) & ~0x7)
#define	SEL_LIMIT(sel)	((sel) | 0x7)
/* True when bit 1 of a TSS descriptor type is set, i.e. the TSS is busy. */
#define	TSS_BUSY(type)	(((type) & 0x2) == 0x2)
99 
/*
 * Return the current value of guest register 'reg' on 'vcpu'.
 * A failure of vm_get_register() is treated as fatal via assert().
 */
static uint64_t
GETREG(struct vcpu *vcpu, int reg)
{
	uint64_t regval;
	int error = vm_get_register(vcpu, reg, &regval);

	assert(error == 0);
	return (regval);
}
110 
/*
 * Store 'val' into guest register 'reg' on 'vcpu'.
 * A failure of vm_set_register() is treated as fatal via assert().
 */
static void
SETREG(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error = vm_set_register(vcpu, reg, val);

	assert(error == 0);
}
119 
120 static struct seg_desc
121 usd_to_seg_desc(struct user_segment_descriptor *usd)
122 {
123 	struct seg_desc seg_desc;
124 
125 	seg_desc.base = (u_int)USD_GETBASE(usd);
126 	if (usd->sd_gran)
127 		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
128 	else
129 		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
130 	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
131 	seg_desc.access |= usd->sd_xx << 12;
132 	seg_desc.access |= usd->sd_def32 << 14;
133 	seg_desc.access |= usd->sd_gran << 15;
134 
135 	return (seg_desc);
136 }
137 
/*
 * Inject exception 'vector' into 'vcpu' with an error code derived from the
 * segment selector 'sel'. The error code format is described in section
 * 6.13, "Error Code", Intel SDM volume 3:
 *
 * Bit 0 (EXT) is set when the exception occurred during delivery of an
 * external event like an interrupt; it mirrors 'ext'.
 *
 * Bit 1 (IDT) says whether the selector points to a gate descriptor in the
 * IDT. It is always cleared here because none of the selectors encountered
 * during task switch emulation refer to a task gate in the IDT.
 *
 * Bit 2 (GDT/LDT) is the Table Indicator (TI) and is taken unchanged from
 * the selector, as are the index bits above it.
 */
static void
sel_exception(struct vcpu *vcpu, int vector, uint16_t sel, int ext)
{
	uint16_t errcode;

	errcode = (sel & ~0x3) | (ext ? 0x1 : 0x0);
	vm_inject_fault(vcpu, vector, 1, errcode);
}
168 
169 /*
170  * Return 0 if the selector 'sel' in within the limits of the GDT/LDT
171  * and non-zero otherwise.
172  */
173 static int
174 desc_table_limit_check(struct vcpu *vcpu, uint16_t sel)
175 {
176 	uint64_t base;
177 	uint32_t limit, access;
178 	int error, reg;
179 
180 	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
181 	error = vm_get_desc(vcpu, reg, &base, &limit, &access);
182 	assert(error == 0);
183 
184 	if (reg == VM_REG_GUEST_LDTR) {
185 		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
186 			return (-1);
187 	}
188 
189 	if (limit < SEL_LIMIT(sel))
190 		return (-1);
191 	else
192 		return (0);
193 }
194 
195 /*
196  * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced
197  * by the selector 'sel'.
198  *
199  * Returns 0 on success.
200  * Returns 1 if an exception was injected into the guest.
201  * Returns -1 otherwise.
202  */
203 static int
204 desc_table_rw(struct vcpu *vcpu, struct vm_guest_paging *paging,
205     uint16_t sel, struct user_segment_descriptor *desc, bool doread,
206     int *faultptr)
207 {
208 	struct iovec iov[2];
209 	uint64_t base;
210 	uint32_t limit, access;
211 	int error, reg;
212 
213 	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
214 	error = vm_get_desc(vcpu, reg, &base, &limit, &access);
215 	assert(error == 0);
216 	assert(limit >= SEL_LIMIT(sel));
217 
218 	error = vm_copy_setup(vcpu, paging, base + SEL_START(sel),
219 	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
220 	    faultptr);
221 	if (error || *faultptr)
222 		return (error);
223 
224 	if (doread)
225 		vm_copyin(iov, desc, sizeof(*desc));
226 	else
227 		vm_copyout(desc, iov, sizeof(*desc));
228 	return (0);
229 }
230 
/*
 * Read the descriptor referenced by 'sel' from the GDT/LDT into 'desc'.
 * Thin wrapper around desc_table_rw(); the return value is passed through.
 */
static int
desc_table_read(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	const bool doread = true;

	return (desc_table_rw(vcpu, paging, sel, desc, doread, faultptr));
}
237 
/*
 * Write 'desc' into the GDT/LDT slot referenced by 'sel'.
 * Thin wrapper around desc_table_rw(); the return value is passed through.
 */
static int
desc_table_write(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	const bool doread = false;

	return (desc_table_rw(vcpu, paging, sel, desc, doread, faultptr));
}
244 
245 /*
246  * Read the TSS descriptor referenced by 'sel' into 'desc'.
247  *
248  * Returns 0 on success.
249  * Returns 1 if an exception was injected into the guest.
250  * Returns -1 otherwise.
251  */
252 static int
253 read_tss_descriptor(struct vcpu *vcpu, struct vm_task_switch *ts,
254     uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
255 {
256 	struct vm_guest_paging sup_paging;
257 	int error;
258 
259 	assert(!ISLDT(sel));
260 	assert(IDXSEL(sel) != 0);
261 
262 	/* Fetch the new TSS descriptor */
263 	if (desc_table_limit_check(vcpu, sel)) {
264 		if (ts->reason == TSR_IRET)
265 			sel_exception(vcpu, IDT_TS, sel, ts->ext);
266 		else
267 			sel_exception(vcpu, IDT_GP, sel, ts->ext);
268 		return (1);
269 	}
270 
271 	sup_paging = ts->paging;
272 	sup_paging.cpl = 0;		/* implicit supervisor mode */
273 	error = desc_table_read(vcpu, &sup_paging, sel, desc, faultptr);
274 	return (error);
275 }
276 
/*
 * Return true if the descriptor type 'sd_type' denotes a code descriptor,
 * i.e. both bit 4 and bit 3 of the type are set.
 */
static bool
code_desc(int sd_type)
{
	const int codebits = 0x10 | 0x08;

	return ((sd_type & codebits) == codebits);
}
283 
/*
 * Return true if the descriptor type 'sd_type' denotes a writable data
 * descriptor: bit 4 set, bit 3 clear and bit 1 set.
 */
static bool
stack_desc(int sd_type)
{
	if ((sd_type & 0x10) == 0)
		return (false);
	if ((sd_type & 0x08) != 0)
		return (false);
	return ((sd_type & 0x02) != 0);
}
290 
/*
 * Return true if the descriptor type 'sd_type' may back a data segment
 * register: either a data descriptor (bit 4 set, bit 3 clear) or a readable
 * code descriptor (bits 4, 3 and 1 all set).
 */
static bool
data_desc(int sd_type)
{
	if ((sd_type & 0x10) == 0)
		return (false);
	if ((sd_type & 0x08) == 0)
		return (true);		/* data descriptor */
	return ((sd_type & 0x02) != 0);	/* code descriptor: needs bit 1 */
}
297 
298 static bool
299 ldt_desc(int sd_type)
300 {
301 
302 	return (sd_type == SDT_SYSLDT);
303 }
304 
305 /*
306  * Validate the descriptor 'seg_desc' associated with 'segment'.
307  */
308 static int
309 validate_seg_desc(struct vcpu *vcpu, struct vm_task_switch *ts,
310     int segment, struct seg_desc *seg_desc, int *faultptr)
311 {
312 	struct vm_guest_paging sup_paging;
313 	struct user_segment_descriptor usd;
314 	int error, idtvec;
315 	int cpl, dpl, rpl;
316 	uint16_t sel, cs;
317 	bool ldtseg, codeseg, stackseg, dataseg, conforming;
318 
319 	ldtseg = codeseg = stackseg = dataseg = false;
320 	switch (segment) {
321 	case VM_REG_GUEST_LDTR:
322 		ldtseg = true;
323 		break;
324 	case VM_REG_GUEST_CS:
325 		codeseg = true;
326 		break;
327 	case VM_REG_GUEST_SS:
328 		stackseg = true;
329 		break;
330 	case VM_REG_GUEST_DS:
331 	case VM_REG_GUEST_ES:
332 	case VM_REG_GUEST_FS:
333 	case VM_REG_GUEST_GS:
334 		dataseg = true;
335 		break;
336 	default:
337 		assert(0);
338 	}
339 
340 	/* Get the segment selector */
341 	sel = GETREG(vcpu, segment);
342 
343 	/* LDT selector must point into the GDT */
344 	if (ldtseg && ISLDT(sel)) {
345 		sel_exception(vcpu, IDT_TS, sel, ts->ext);
346 		return (1);
347 	}
348 
349 	/* Descriptor table limit check */
350 	if (desc_table_limit_check(vcpu, sel)) {
351 		sel_exception(vcpu, IDT_TS, sel, ts->ext);
352 		return (1);
353 	}
354 
355 	/* NULL selector */
356 	if (IDXSEL(sel) == 0) {
357 		/* Code and stack segment selectors cannot be NULL */
358 		if (codeseg || stackseg) {
359 			sel_exception(vcpu, IDT_TS, sel, ts->ext);
360 			return (1);
361 		}
362 		seg_desc->base = 0;
363 		seg_desc->limit = 0;
364 		seg_desc->access = 0x10000;	/* unusable */
365 		return (0);
366 	}
367 
368 	/* Read the descriptor from the GDT/LDT */
369 	sup_paging = ts->paging;
370 	sup_paging.cpl = 0;	/* implicit supervisor mode */
371 	error = desc_table_read(vcpu, &sup_paging, sel, &usd, faultptr);
372 	if (error || *faultptr)
373 		return (error);
374 
375 	/* Verify that the descriptor type is compatible with the segment */
376 	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
377 	    (codeseg && !code_desc(usd.sd_type)) ||
378 	    (dataseg && !data_desc(usd.sd_type)) ||
379 	    (stackseg && !stack_desc(usd.sd_type))) {
380 		sel_exception(vcpu, IDT_TS, sel, ts->ext);
381 		return (1);
382 	}
383 
384 	/* Segment must be marked present */
385 	if (!usd.sd_p) {
386 		if (ldtseg)
387 			idtvec = IDT_TS;
388 		else if (stackseg)
389 			idtvec = IDT_SS;
390 		else
391 			idtvec = IDT_NP;
392 		sel_exception(vcpu, idtvec, sel, ts->ext);
393 		return (1);
394 	}
395 
396 	cs = GETREG(vcpu, VM_REG_GUEST_CS);
397 	cpl = cs & SEL_RPL_MASK;
398 	rpl = sel & SEL_RPL_MASK;
399 	dpl = usd.sd_dpl;
400 
401 	if (stackseg && (rpl != cpl || dpl != cpl)) {
402 		sel_exception(vcpu, IDT_TS, sel, ts->ext);
403 		return (1);
404 	}
405 
406 	if (codeseg) {
407 		conforming = (usd.sd_type & 0x4) ? true : false;
408 		if ((conforming && (cpl < dpl)) ||
409 		    (!conforming && (cpl != dpl))) {
410 			sel_exception(vcpu, IDT_TS, sel, ts->ext);
411 			return (1);
412 		}
413 	}
414 
415 	if (dataseg) {
416 		/*
417 		 * A data segment is always non-conforming except when it's
418 		 * descriptor is a readable, conforming code segment.
419 		 */
420 		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
421 			conforming = true;
422 		else
423 			conforming = false;
424 
425 		if (!conforming && (rpl > dpl || cpl > dpl)) {
426 			sel_exception(vcpu, IDT_TS, sel, ts->ext);
427 			return (1);
428 		}
429 	}
430 	*seg_desc = usd_to_seg_desc(&usd);
431 	return (0);
432 }
433 
434 static void
435 tss32_save(struct vcpu *vcpu, struct vm_task_switch *task_switch,
436     uint32_t eip, struct tss32 *tss, struct iovec *iov)
437 {
438 
439 	/* General purpose registers */
440 	tss->tss_eax = GETREG(vcpu, VM_REG_GUEST_RAX);
441 	tss->tss_ecx = GETREG(vcpu, VM_REG_GUEST_RCX);
442 	tss->tss_edx = GETREG(vcpu, VM_REG_GUEST_RDX);
443 	tss->tss_ebx = GETREG(vcpu, VM_REG_GUEST_RBX);
444 	tss->tss_esp = GETREG(vcpu, VM_REG_GUEST_RSP);
445 	tss->tss_ebp = GETREG(vcpu, VM_REG_GUEST_RBP);
446 	tss->tss_esi = GETREG(vcpu, VM_REG_GUEST_RSI);
447 	tss->tss_edi = GETREG(vcpu, VM_REG_GUEST_RDI);
448 
449 	/* Segment selectors */
450 	tss->tss_es = GETREG(vcpu, VM_REG_GUEST_ES);
451 	tss->tss_cs = GETREG(vcpu, VM_REG_GUEST_CS);
452 	tss->tss_ss = GETREG(vcpu, VM_REG_GUEST_SS);
453 	tss->tss_ds = GETREG(vcpu, VM_REG_GUEST_DS);
454 	tss->tss_fs = GETREG(vcpu, VM_REG_GUEST_FS);
455 	tss->tss_gs = GETREG(vcpu, VM_REG_GUEST_GS);
456 
457 	/* eflags and eip */
458 	tss->tss_eflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS);
459 	if (task_switch->reason == TSR_IRET)
460 		tss->tss_eflags &= ~PSL_NT;
461 	tss->tss_eip = eip;
462 
463 	/* Copy updated old TSS into guest memory */
464 	vm_copyout(tss, iov, sizeof(struct tss32));
465 }
466 
467 static void
468 update_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *sd)
469 {
470 	int error;
471 
472 	error = vm_set_desc(vcpu, reg, sd->base, sd->limit, sd->access);
473 	assert(error == 0);
474 }
475 
476 /*
477  * Update the vcpu registers to reflect the state of the new task.
478  */
479 static int
480 tss32_restore(struct vmctx *ctx, struct vcpu *vcpu, struct vm_task_switch *ts,
481     uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
482 {
483 	struct seg_desc seg_desc, seg_desc2;
484 	uint64_t *pdpte, maxphyaddr, reserved;
485 	uint32_t eflags;
486 	int error, i;
487 	bool nested;
488 
489 	nested = false;
490 	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
491 		tss->tss_link = ot_sel;
492 		nested = true;
493 	}
494 
495 	eflags = tss->tss_eflags;
496 	if (nested)
497 		eflags |= PSL_NT;
498 
499 	/* LDTR */
500 	SETREG(vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);
501 
502 	/* PBDR */
503 	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
504 		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
505 			/*
506 			 * XXX Assuming 36-bit MAXPHYADDR.
507 			 */
508 			maxphyaddr = (1UL << 36) - 1;
509 			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
510 			for (i = 0; i < 4; i++) {
511 				/* Check reserved bits if the PDPTE is valid */
512 				if (!(pdpte[i] & 0x1))
513 					continue;
514 				/*
515 				 * Bits 2:1, 8:5 and bits above the processor's
516 				 * maximum physical address are reserved.
517 				 */
518 				reserved = ~maxphyaddr | 0x1E6;
519 				if (pdpte[i] & reserved) {
520 					vm_inject_gp(vcpu);
521 					return (1);
522 				}
523 			}
524 			SETREG(vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
525 			SETREG(vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
526 			SETREG(vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
527 			SETREG(vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
528 		}
529 		SETREG(vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
530 		ts->paging.cr3 = tss->tss_cr3;
531 	}
532 
533 	/* eflags and eip */
534 	SETREG(vcpu, VM_REG_GUEST_RFLAGS, eflags);
535 	SETREG(vcpu, VM_REG_GUEST_RIP, tss->tss_eip);
536 
537 	/* General purpose registers */
538 	SETREG(vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
539 	SETREG(vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
540 	SETREG(vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
541 	SETREG(vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
542 	SETREG(vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
543 	SETREG(vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
544 	SETREG(vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
545 	SETREG(vcpu, VM_REG_GUEST_RDI, tss->tss_edi);
546 
547 	/* Segment selectors */
548 	SETREG(vcpu, VM_REG_GUEST_ES, tss->tss_es);
549 	SETREG(vcpu, VM_REG_GUEST_CS, tss->tss_cs);
550 	SETREG(vcpu, VM_REG_GUEST_SS, tss->tss_ss);
551 	SETREG(vcpu, VM_REG_GUEST_DS, tss->tss_ds);
552 	SETREG(vcpu, VM_REG_GUEST_FS, tss->tss_fs);
553 	SETREG(vcpu, VM_REG_GUEST_GS, tss->tss_gs);
554 
555 	/*
556 	 * If this is a nested task then write out the new TSS to update
557 	 * the previous link field.
558 	 */
559 	if (nested)
560 		vm_copyout(tss, iov, sizeof(*tss));
561 
562 	/* Validate segment descriptors */
563 	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
564 	    faultptr);
565 	if (error || *faultptr)
566 		return (error);
567 	update_seg_desc(vcpu, VM_REG_GUEST_LDTR, &seg_desc);
568 
569 	/*
570 	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
571 	 *
572 	 * The SS and CS attribute checks on VM-entry are inter-dependent so
573 	 * we need to make sure that both segments are valid before updating
574 	 * either of them. This ensures that the VMCS state can pass the
575 	 * VM-entry checks so the guest can handle any exception injected
576 	 * during task switch emulation.
577 	 */
578 	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
579 	    faultptr);
580 	if (error || *faultptr)
581 		return (error);
582 
583 	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
584 	    faultptr);
585 	if (error || *faultptr)
586 		return (error);
587 	update_seg_desc(vcpu, VM_REG_GUEST_CS, &seg_desc);
588 	update_seg_desc(vcpu, VM_REG_GUEST_SS, &seg_desc2);
589 	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;
590 
591 	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
592 	    faultptr);
593 	if (error || *faultptr)
594 		return (error);
595 	update_seg_desc(vcpu, VM_REG_GUEST_DS, &seg_desc);
596 
597 	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
598 	    faultptr);
599 	if (error || *faultptr)
600 		return (error);
601 	update_seg_desc(vcpu, VM_REG_GUEST_ES, &seg_desc);
602 
603 	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
604 	    faultptr);
605 	if (error || *faultptr)
606 		return (error);
607 	update_seg_desc(vcpu, VM_REG_GUEST_FS, &seg_desc);
608 
609 	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
610 	    faultptr);
611 	if (error || *faultptr)
612 		return (error);
613 	update_seg_desc(vcpu, VM_REG_GUEST_GS, &seg_desc);
614 
615 	return (0);
616 }
617 
618 /*
619  * Push an error code on the stack of the new task. This is needed if the
620  * task switch was triggered by a hardware exception that causes an error
621  * code to be saved (e.g. #PF).
622  */
623 static int
624 push_errcode(struct vcpu *vcpu, struct vm_guest_paging *paging,
625     int task_type, uint32_t errcode, int *faultptr)
626 {
627 	struct iovec iov[2];
628 	struct seg_desc seg_desc;
629 	int stacksize, bytes, error;
630 	uint64_t gla, cr0, rflags;
631 	uint32_t esp;
632 	uint16_t stacksel;
633 
634 	*faultptr = 0;
635 
636 	cr0 = GETREG(vcpu, VM_REG_GUEST_CR0);
637 	rflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS);
638 	stacksel = GETREG(vcpu, VM_REG_GUEST_SS);
639 
640 	error = vm_get_desc(vcpu, VM_REG_GUEST_SS, &seg_desc.base,
641 	    &seg_desc.limit, &seg_desc.access);
642 	assert(error == 0);
643 
644 	/*
645 	 * Section "Error Code" in the Intel SDM vol 3: the error code is
646 	 * pushed on the stack as a doubleword or word (depending on the
647 	 * default interrupt, trap or task gate size).
648 	 */
649 	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
650 		bytes = 4;
651 	else
652 		bytes = 2;
653 
654 	/*
655 	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
656 	 * stack-segment descriptor determines the size of the stack
657 	 * pointer outside of 64-bit mode.
658 	 */
659 	if (SEG_DESC_DEF32(seg_desc.access))
660 		stacksize = 4;
661 	else
662 		stacksize = 2;
663 
664 	esp = GETREG(vcpu, VM_REG_GUEST_RSP);
665 	esp -= bytes;
666 
667 	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
668 	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
669 		sel_exception(vcpu, IDT_SS, stacksel, 1);
670 		*faultptr = 1;
671 		return (0);
672 	}
673 
674 	if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
675 		vm_inject_ac(vcpu, 1);
676 		*faultptr = 1;
677 		return (0);
678 	}
679 
680 	error = vm_copy_setup(vcpu, paging, gla, bytes, PROT_WRITE,
681 	    iov, nitems(iov), faultptr);
682 	if (error || *faultptr)
683 		return (error);
684 
685 	vm_copyout(&errcode, iov, bytes);
686 	SETREG(vcpu, VM_REG_GUEST_RSP, esp);
687 	return (0);
688 }
689 
/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 *
 * The helpers report their outcome in one of three ways:
 * - 'error' is 1: an exception was injected into the guest, so the guest is
 *   resumed to let it handle the exception.
 * - 'error' is EFAULT: the emulation failed and the VM is aborted.
 * - 'error' is 0 and 'fault' is non-zero: a fault was delivered to the
 *   guest while accessing its memory, so the guest is resumed.
 *
 * Execution continues past the macro only if 'error' and 'fault' are both
 * zero. 'fault' is not evaluated when 'error' is non-zero.
 */
#define	CHKERR(error,fault)						\
	do {								\
		assert(((error) == 0) || ((error) == 1) ||		\
		    ((error) == EFAULT));				\
		if ((error) == 1)					\
			return (VMEXIT_CONTINUE);			\
		else if ((error) != 0)					\
			return (VMEXIT_ABORT);				\
		else if (fault)						\
			return (VMEXIT_CONTINUE);			\
	} while (0)
702 
/*
 * Handle a task switch VM exit by emulating the switch to the task whose
 * TSS selector is recorded in 'vmrun->vm_exit->u.task_switch'.
 *
 * The new TSS descriptor is validated, the state of the outgoing task is
 * saved into the old TSS, the busy bits of both TSS descriptors are
 * updated, the task register and CR0.TS are set and finally the register
 * state of the new task is loaded. Checks that fail before or after that
 * point inject an exception into the guest.
 *
 * Returns VMEXIT_CONTINUE when the switch completed or an exception was
 * injected via the 'done' path or reported through CHKERR().
 * Returns VMEXIT_ABORT when CHKERR() sees a helper error or when the new
 * task uses a 16-bit TSS, which is not supported.
 */
int vmexit_task_switch(struct vmctx *, struct vcpu *, struct vm_run *);

int
vmexit_task_switch(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
{
	struct seg_desc nt;
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	struct vm_exit *vmexit;
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, fault, minlimit, nt_type, ot_type;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	/* Unpack the description of the task switch from the VM exit. */
	vmexit = vmrun->vm_exit;
	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;

	assert(paging->cpu_mode == CPU_MODE_PROTECTED);

	/*
	 * Calculate the instruction pointer to store in the old TSS.
	 */
	eip = vmexit->rip + vmexit->inst_length;

	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */

	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(vcpu, task_switch, nt_sel, &nt_desc,
	    &fault);
	CHKERR(error, fault);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(vcpu, IDT_NP, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
		minlimit = 104 - 1;
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
		minlimit = 44 - 1;
	else
		minlimit = 0;

	/* The type check above guarantees one of the first two branches. */
	assert(minlimit > 0);
	if (nt.limit < (unsigned int)minlimit) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(vcpu, IDT_GP, nt_sel, ext);
		goto done;
	}

	/* Fetch the new TSS */
	error = vm_copy_setup(vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(nt_iov, &newtss, minlimit + 1);

	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR. In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(vcpu, IDT_TS, ot_sel, task_switch->ext);
		goto done;
	}

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	    &access);
	assert(error == 0);
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(vcpu, task_switch, ot_sel, &ot_desc,
	    &fault);
	CHKERR(error, fault);

	/*
	 * Get the old TSS.
	 *
	 * NOTE(review): the number of bytes copied is derived from the type
	 * of the new TSS ('minlimit'), not from the type or limit of the old
	 * one ('ot_type', 'ot_lim').
	 */
	error = vm_copy_setup(vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ot_iov, &oldtss, minlimit + 1);

	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * is due to an IRET or JMP instruction.
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(vcpu, &sup_paging, ot_sel,
		    &ot_desc, &fault);
		CHKERR(error, fault);
	}

	/* Only 32-bit TSS images can be saved and restored below. */
	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		EPRINTLN("Task switch to 16-bit TSS not supported");
		return (VMEXIT_ABORT);
	}

	/* Save processor state in old TSS */
	tss32_save(vcpu, task_switch, eip, &oldtss, ot_iov);

	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(vcpu, &sup_paging, nt_sel,
		    &nt_desc, &fault);
		CHKERR(error, fault);
	}

	/* Update task register to point at the new TSS */
	SETREG(vcpu, VM_REG_GUEST_TR, nt_sel);

	/*
	 * Update the hidden descriptor state of the task register. The
	 * descriptor is converted again so that a busy bit set just above
	 * is reflected in the access rights.
	 */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(vcpu, VM_REG_GUEST_TR, &nt);

	/* Set CR0.TS */
	cr0 = GETREG(vcpu, VM_REG_GUEST_CR0);
	SETREG(vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);

	/*
	 * We are now committed to the task switch. Any exceptions encountered
	 * after this point will be handled in the context of the new task and
	 * the saved instruction pointer will belong to the new task.
	 */
	error = vm_set_register(vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
	assert(error == 0);

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
	    &fault);
	CHKERR(error, fault);

	/*
	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
	 * caused an error code to be generated, this error code is copied
	 * to the stack of the new task.
	 */
	if (task_switch->errcode_valid) {
		assert(task_switch->ext);
		assert(task_switch->reason == TSR_IDT_GATE);
		error = push_errcode(vcpu, &task_switch->paging, nt_type,
		    task_switch->errcode, &fault);
		CHKERR(error, fault);
	}

	/*
	 * Treatment of virtual-NMI blocking if NMI is delivered through
	 * a task gate.
	 *
	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
	 * If the virtual NMIs VM-execution control is 1, VM entry injects
	 * an NMI, and delivery of the NMI causes a task switch that causes
	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
	 * commences.
	 *
	 * Thus, virtual-NMI blocking is in effect at the time of the task
	 * switch VM exit.
	 */

	/*
	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
	 *
	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation"
	 * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking.
	 * This unblocking of virtual-NMI occurs even if IRET causes a fault.
	 *
	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
	 * VM exit.
	 */

	/*
	 * If the task switch was triggered by an event delivered through
	 * the IDT then extinguish the pending event from the vcpu's
	 * exitintinfo.
	 */
	if (task_switch->reason == TSR_IDT_GATE) {
		error = vm_set_intinfo(vcpu, 0);
		assert(error == 0);
	}

	/*
	 * XXX should inject debug exception if 'T' bit is 1
	 */
done:
	/* Also reached after injecting an exception for a failed check. */
	return (VMEXIT_CONTINUE);
}
943