/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>

#include <x86/psl.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#include <vmmapi.h>

#include "bhyverun.h"
#include "debug.h"

/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
struct tss32 {
	uint16_t	tss_link;
	uint16_t	rsvd1;
	uint32_t	tss_esp0;
	uint16_t	tss_ss0;
	uint16_t	rsvd2;
	uint32_t	tss_esp1;
	uint16_t	tss_ss1;
	uint16_t	rsvd3;
	uint32_t	tss_esp2;
	uint16_t	tss_ss2;
	uint16_t	rsvd4;
	uint32_t	tss_cr3;
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	uint32_t	tss_eax;
	uint32_t	tss_ecx;
	uint32_t	tss_edx;
	uint32_t	tss_ebx;
	uint32_t	tss_esp;
	uint32_t	tss_ebp;
	uint32_t	tss_esi;
	uint32_t	tss_edi;
	uint16_t	tss_es;
	uint16_t	rsvd5;
	uint16_t	tss_cs;
	uint16_t	rsvd6;
	uint16_t	tss_ss;
	uint16_t	rsvd7;
	uint16_t	tss_ds;
	uint16_t	rsvd8;
	uint16_t	tss_fs;
	uint16_t	rsvd9;
	uint16_t	tss_gs;
	uint16_t	rsvd10;
	uint16_t	tss_ldt;
	uint16_t	rsvd11;
	uint16_t	tss_trap;
	uint16_t	tss_iomap;
};
static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed");
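
/*
 * Byte offsets of the first and last byte of the descriptor referenced by
 * a selector, and a test for the busy bit in a TSS descriptor type.
 */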
#define	SEL_START(sel)	(((sel) & ~0x7))
#define	SEL_LIMIT(sel)	(((sel) | 0x7))
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)
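
/*
 * Wrappers around vm_get_register()/vm_set_register() that abort on an
 * unexpected failure.
 */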
static uint64_t
GETREG(struct vcpu *vcpu, int reg)
{
	uint64_t val;
	int error;

	error = vm_get_register(vcpu, reg, &val);
	assert(error == 0);
	return (val);
}

static void
SETREG(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;

	error = vm_set_register(vcpu, reg, val);
	assert(error == 0);
}
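
/*
 * Convert a descriptor in GDT/LDT format into the 'seg_desc' form used by
 * the VMM, scaling the limit when the granularity bit is set.
 */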
static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
	struct seg_desc seg_desc;

	seg_desc.base = (u_int)USD_GETBASE(usd);
	if (usd->sd_gran)
		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
	else
		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
	seg_desc.access |= usd->sd_xx << 12;
	seg_desc.access |= usd->sd_def32 << 14;
	seg_desc.access |= usd->sd_gran << 15;

	return (seg_desc);
}

/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
static void
sel_exception(struct vcpu *vcpu, int vector, uint16_t sel, int ext)
{
	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	sel &= ~0x3;
	if (ext)
		sel |= 0x1;
	vm_inject_fault(vcpu, vector, 1, sel);
}

/*
 * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
 * and non-zero otherwise.
 */
static int
desc_table_limit_check(struct vcpu *vcpu, uint16_t sel)
{
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(vcpu, reg, &base, &limit, &access);
	assert(error == 0);

	if (reg == VM_REG_GUEST_LDTR) {
		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
			return (-1);
	}

	if (limit < SEL_LIMIT(sel))
		return (-1);
	else
		return (0);
}

/*
 * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced
 * by the selector 'sel'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
desc_table_rw(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, bool doread,
    int *faultptr)
{
	struct iovec iov[2];
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(vcpu, reg, &base, &limit, &access);
	assert(error == 0);
	assert(limit >= SEL_LIMIT(sel));

	error = vm_copy_setup(vcpu, paging, base + SEL_START(sel),
	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
	    faultptr);
	if (error || *faultptr)
		return (error);

	if (doread)
		vm_copyin(iov, desc, sizeof(*desc));
	else
		vm_copyout(desc, iov, sizeof(*desc));
	return (0);
}

static int
desc_table_read(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(vcpu, paging, sel, desc, true, faultptr));
}

static int
desc_table_write(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(vcpu, paging, sel, desc, false, faultptr));
}

/*
 * Read the TSS descriptor referenced by 'sel' into 'desc'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
read_tss_descriptor(struct vcpu *vcpu, struct vm_task_switch *ts,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	int error;

	assert(!ISLDT(sel));
	assert(IDXSEL(sel) != 0);

	/* Fetch the new TSS descriptor */
	if (desc_table_limit_check(vcpu, sel)) {
		if (ts->reason == TSR_IRET)
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
		else
			sel_exception(vcpu, IDT_GP, sel, ts->ext);
		return (1);
	}

	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(vcpu, &sup_paging, sel, desc, faultptr);
	return (error);
}

static bool
code_desc(int sd_type)
{
	/* code descriptor */
	return ((sd_type & 0x18) == 0x18);
}

static bool
stack_desc(int sd_type)
{
	/* writable data descriptor */
	return ((sd_type & 0x1A) == 0x12);
}

static bool
data_desc(int sd_type)
{
	/* data descriptor or a readable code descriptor */
	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}

static bool
ldt_desc(int sd_type)
{

	return (sd_type == SDT_SYSLDT);
}

/*
 * Validate the descriptor 'seg_desc' associated with 'segment'.
 */
static int
validate_seg_desc(struct vcpu *vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	struct user_segment_descriptor usd;
	int error, idtvec;
	int cpl, dpl, rpl;
	uint16_t sel, cs;
	bool ldtseg, codeseg, stackseg, dataseg, conforming;

	ldtseg = codeseg = stackseg = dataseg = false;
	switch (segment) {
	case VM_REG_GUEST_LDTR:
		ldtseg = true;
		break;
	case VM_REG_GUEST_CS:
		codeseg = true;
		break;
	case VM_REG_GUEST_SS:
		stackseg = true;
		break;
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
		dataseg = true;
		break;
	default:
		assert(0);
	}

	/* Get the segment selector */
	sel = GETREG(vcpu, segment);

	/* LDT selector must point into the GDT */
	if (ldtseg && ISLDT(sel)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Descriptor table limit check */
	if (desc_table_limit_check(vcpu, sel)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* NULL selector */
	if (IDXSEL(sel) == 0) {
		/* Code and stack segment selectors cannot be NULL */
		if (codeseg || stackseg) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
		seg_desc->base = 0;
		seg_desc->limit = 0;
		seg_desc->access = 0x10000;	/* unusable */
		return (0);
	}

	/* Read the descriptor from the GDT/LDT */
	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(vcpu, &sup_paging, sel, &usd, faultptr);
	if (error || *faultptr)
		return (error);

	/* Verify that the descriptor type is compatible with the segment */
	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
	    (codeseg && !code_desc(usd.sd_type)) ||
	    (dataseg && !data_desc(usd.sd_type)) ||
	    (stackseg && !stack_desc(usd.sd_type))) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Segment must be marked present */
	if (!usd.sd_p) {
		if (ldtseg)
			idtvec = IDT_TS;
		else if (stackseg)
			idtvec = IDT_SS;
		else
			idtvec = IDT_NP;
		sel_exception(vcpu, idtvec, sel, ts->ext);
		return (1);
	}

	cs = GETREG(vcpu, VM_REG_GUEST_CS);
	cpl = cs & SEL_RPL_MASK;
	rpl = sel & SEL_RPL_MASK;
	dpl = usd.sd_dpl;

	if (stackseg && (rpl != cpl || dpl != cpl)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	if (codeseg) {
		conforming = (usd.sd_type & 0x4) ? true : false;
		if ((conforming && (cpl < dpl)) ||
		    (!conforming && (cpl != dpl))) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}

	if (dataseg) {
		/*
		 * A data segment is always non-conforming except when its
		 * descriptor is a readable, conforming code segment.
		 */
		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
			conforming = true;
		else
			conforming = false;

		if (!conforming && (rpl > dpl || cpl > dpl)) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}
	*seg_desc = usd_to_seg_desc(&usd);
	return (0);
}
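
/*
 * Save the outgoing task's register state into the old TSS and copy it
 * back to guest memory.
 */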
static void
tss32_save(struct vcpu *vcpu, struct vm_task_switch *task_switch,
    uint32_t eip, struct tss32 *tss, struct iovec *iov)
{

	/* General purpose registers */
	tss->tss_eax = GETREG(vcpu, VM_REG_GUEST_RAX);
	tss->tss_ecx = GETREG(vcpu, VM_REG_GUEST_RCX);
	tss->tss_edx = GETREG(vcpu, VM_REG_GUEST_RDX);
	tss->tss_ebx = GETREG(vcpu, VM_REG_GUEST_RBX);
	tss->tss_esp = GETREG(vcpu, VM_REG_GUEST_RSP);
	tss->tss_ebp = GETREG(vcpu, VM_REG_GUEST_RBP);
	tss->tss_esi = GETREG(vcpu, VM_REG_GUEST_RSI);
	tss->tss_edi = GETREG(vcpu, VM_REG_GUEST_RDI);

	/* Segment selectors */
	tss->tss_es = GETREG(vcpu, VM_REG_GUEST_ES);
	tss->tss_cs = GETREG(vcpu, VM_REG_GUEST_CS);
	tss->tss_ss = GETREG(vcpu, VM_REG_GUEST_SS);
	tss->tss_ds = GETREG(vcpu, VM_REG_GUEST_DS);
	tss->tss_fs = GETREG(vcpu, VM_REG_GUEST_FS);
	tss->tss_gs = GETREG(vcpu, VM_REG_GUEST_GS);

	/* eflags and eip */
	tss->tss_eflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS);
	if (task_switch->reason == TSR_IRET)
		tss->tss_eflags &= ~PSL_NT;
	tss->tss_eip = eip;

	/* Copy updated old TSS into guest memory */
	vm_copyout(tss, iov, sizeof(struct tss32));
}
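
/*
 * Update the hidden base/limit/access state of segment register 'reg'.
 */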
static void
update_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *sd)
{
	int error;

	error = vm_set_desc(vcpu, reg, sd->base, sd->limit, sd->access);
	assert(error == 0);
}

/*
 * Update the vcpu registers to reflect the state of the new task.
 */
static int
tss32_restore(struct vmctx *ctx, struct vcpu *vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
{
	struct seg_desc seg_desc, seg_desc2;
	uint64_t *pdpte, maxphyaddr, reserved;
	uint32_t eflags;
	int error, i;
	bool nested;

	nested = false;
	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
		tss->tss_link = ot_sel;
		nested = true;
	}

	eflags = tss->tss_eflags;
	if (nested)
		eflags |= PSL_NT;

	/* LDTR */
	SETREG(vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);

	/* PDBR */
	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
			/*
			 * XXX Assuming 36-bit MAXPHYADDR.
			 */
			maxphyaddr = (1UL << 36) - 1;
			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
			for (i = 0; i < 4; i++) {
				/* Check reserved bits if the PDPTE is valid */
				if (!(pdpte[i] & 0x1))
					continue;
				/*
				 * Bits 2:1, 8:5 and bits above the processor's
				 * maximum physical address are reserved.
				 */
				reserved = ~maxphyaddr | 0x1E6;
				if (pdpte[i] & reserved) {
					vm_inject_gp(vcpu);
					return (1);
				}
			}
			SETREG(vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
		}
		SETREG(vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
		ts->paging.cr3 = tss->tss_cr3;
	}

	/* eflags and eip */
	SETREG(vcpu, VM_REG_GUEST_RFLAGS, eflags);
	SETREG(vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

	/* General purpose registers */
	SETREG(vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
	SETREG(vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
	SETREG(vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
	SETREG(vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
	SETREG(vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
	SETREG(vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
	SETREG(vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
	SETREG(vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

	/* Segment selectors */
	SETREG(vcpu, VM_REG_GUEST_ES, tss->tss_es);
	SETREG(vcpu, VM_REG_GUEST_CS, tss->tss_cs);
	SETREG(vcpu, VM_REG_GUEST_SS, tss->tss_ss);
	SETREG(vcpu, VM_REG_GUEST_DS, tss->tss_ds);
	SETREG(vcpu, VM_REG_GUEST_FS, tss->tss_fs);
	SETREG(vcpu, VM_REG_GUEST_GS, tss->tss_gs);

	/*
	 * If this is a nested task then write out the new TSS to update
	 * the previous link field.
	 */
	if (nested)
		vm_copyout(tss, iov, sizeof(*tss));

	/* Validate segment descriptors */
	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_LDTR, &seg_desc);

	/*
	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
	 *
	 * The SS and CS attribute checks on VM-entry are inter-dependent so
	 * we need to make sure that both segments are valid before updating
	 * either of them. This ensures that the VMCS state can pass the
	 * VM-entry checks so the guest can handle any exception injected
	 * during task switch emulation.
	 */
	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_CS, &seg_desc);
	update_seg_desc(vcpu, VM_REG_GUEST_SS, &seg_desc2);
	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_DS, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_ES, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_FS, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_GS, &seg_desc);

	return (0);
}

/*
 * Push an error code on the stack of the new task. This is needed if the
 * task switch was triggered by a hardware exception that causes an error
 * code to be saved (e.g. #PF).
 */
static int
push_errcode(struct vcpu *vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode, int *faultptr)
{
	struct iovec iov[2];
	struct seg_desc seg_desc;
	int stacksize, bytes, error;
	uint64_t gla, cr0, rflags;
	uint32_t esp;
	uint16_t stacksel;

	*faultptr = 0;

	cr0 = GETREG(vcpu, VM_REG_GUEST_CR0);
	rflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS);
	stacksel = GETREG(vcpu, VM_REG_GUEST_SS);

	error = vm_get_desc(vcpu, VM_REG_GUEST_SS, &seg_desc.base,
	    &seg_desc.limit, &seg_desc.access);
	assert(error == 0);

	/*
	 * Section "Error Code" in the Intel SDM vol 3: the error code is
	 * pushed on the stack as a doubleword or word (depending on the
	 * default interrupt, trap or task gate size).
	 */
	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
		bytes = 4;
	else
		bytes = 2;

	/*
	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
	 * stack-segment descriptor determines the size of the stack
	 * pointer outside of 64-bit mode.
	 */
	if (SEG_DESC_DEF32(seg_desc.access))
		stacksize = 4;
	else
		stacksize = 2;

	esp = GETREG(vcpu, VM_REG_GUEST_RSP);
	esp -= bytes;

	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
		sel_exception(vcpu, IDT_SS, stacksel, 1);
		*faultptr = 1;
		return (0);
	}

	if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
		vm_inject_ac(vcpu, 1);
		*faultptr = 1;
		return (0);
	}

	error = vm_copy_setup(vcpu, paging, gla, bytes, PROT_WRITE,
	    iov, nitems(iov), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyout(&errcode, iov, bytes);
	SETREG(vcpu, VM_REG_GUEST_RSP, esp);
	return (0);
}

/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 */
#define	CHKERR(error,fault) \
	do { \
		assert((error == 0) || (error == EFAULT)); \
		if (error) \
			return (VMEXIT_ABORT); \
		else if (fault) \
			return (VMEXIT_CONTINUE); \
	} while (0)

int vmexit_task_switch(struct vmctx *, struct vcpu *, struct vm_run *);
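
/*
 * Emulate a task switch: validate the new TSS descriptor, save the outgoing
 * task's state into the old TSS, load the incoming task's state from the new
 * TSS, and update the task register, TSS busy bits and CR0.TS accordingly.
 */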
int
vmexit_task_switch(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
{
	struct seg_desc nt;
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	struct vm_exit *vmexit;
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, fault, minlimit, nt_type, ot_type;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	vmexit = vmrun->vm_exit;
	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;

	assert(paging->cpu_mode == CPU_MODE_PROTECTED);

	/*
	 * Calculate the instruction pointer to store in the old TSS.
	 */
	eip = vmexit->rip + vmexit->inst_length;

	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */

	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(vcpu, task_switch, nt_sel, &nt_desc,
	    &fault);
	CHKERR(error, fault);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(vcpu, IDT_NP, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
		minlimit = 104 - 1;
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
		minlimit = 44 - 1;
	else
		minlimit = 0;

	assert(minlimit > 0);
	if (nt.limit < (unsigned int)minlimit) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(vcpu, IDT_GP, nt_sel, ext);
		goto done;
	}

	/* Fetch the new TSS */
	error = vm_copy_setup(vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(nt_iov, &newtss, minlimit + 1);

	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR. In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(vcpu, IDT_TS, ot_sel, task_switch->ext);
		goto done;
	}

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	    &access);
	assert(error == 0);
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(vcpu, task_switch, ot_sel, &ot_desc,
	    &fault);
	CHKERR(error, fault);

	/* Get the old TSS */
	error = vm_copy_setup(vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ot_iov, &oldtss, minlimit + 1);

	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch is
	 * due to an IRET or JMP instruction.
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(vcpu, &sup_paging, ot_sel,
		    &ot_desc, &fault);
		CHKERR(error, fault);
	}

	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		EPRINTLN("Task switch to 16-bit TSS not supported");
		return (VMEXIT_ABORT);
	}

	/* Save processor state in old TSS */
	tss32_save(vcpu, task_switch, eip, &oldtss, ot_iov);

	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(vcpu, &sup_paging, nt_sel,
		    &nt_desc, &fault);
		CHKERR(error, fault);
	}

	/* Update task register to point at the new TSS */
	SETREG(vcpu, VM_REG_GUEST_TR, nt_sel);

	/* Update the hidden descriptor state of the task register */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(vcpu, VM_REG_GUEST_TR, &nt);

	/* Set CR0.TS */
	cr0 = GETREG(vcpu, VM_REG_GUEST_CR0);
	SETREG(vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);

	/*
	 * We are now committed to the task switch. Any exceptions encountered
	 * after this point will be handled in the context of the new task and
	 * the saved instruction pointer will belong to the new task.
	 */
	error = vm_set_register(vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
	assert(error == 0);

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
	    &fault);
	CHKERR(error, fault);

	/*
	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
	 * caused an error code to be generated, this error code is copied
	 * to the stack of the new task.
	 */
	if (task_switch->errcode_valid) {
		assert(task_switch->ext);
		assert(task_switch->reason == TSR_IDT_GATE);
		error = push_errcode(vcpu, &task_switch->paging, nt_type,
		    task_switch->errcode, &fault);
		CHKERR(error, fault);
	}

	/*
	 * Treatment of virtual-NMI blocking if NMI is delivered through
	 * a task gate.
	 *
	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol 3:
	 * If the virtual NMIs VM-execution control is 1, VM entry injects
	 * an NMI, and delivery of the NMI causes a task switch that causes
	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
	 * commences.
	 *
	 * Thus, virtual-NMI blocking is in effect at the time of the task
	 * switch VM exit.
	 */

	/*
	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
	 *
	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation":
	 * if the "virtual NMIs" control is 1, IRET removes any virtual-NMI
	 * blocking. This unblocking of virtual-NMI occurs even if IRET causes
	 * a fault.
	 *
	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
	 * VM exit.
	 */

	/*
	 * If the task switch was triggered by an event delivered through
	 * the IDT then extinguish the pending event from the vcpu's
	 * exitintinfo.
	 */
	if (task_switch->reason == TSR_IDT_GATE) {
		error = vm_set_intinfo(vcpu, 0);
		assert(error == 0);
	}

	/*
	 * XXX should inject debug exception if 'T' bit is 1
	 */
done:
	return (VMEXIT_CONTINUE);
}