/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2020 Oxide Computer Company
 */


#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>

#include <x86/psl.h>
#include <x86/segments.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#include <vmmapi.h>

#include "bhyverun.h"
#include "debug.h"

/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
struct tss32 {
	uint16_t	tss_link;
	uint16_t	rsvd1;
	uint32_t	tss_esp0;
	uint16_t	tss_ss0;
	uint16_t	rsvd2;
	uint32_t	tss_esp1;
	uint16_t	tss_ss1;
	uint16_t	rsvd3;
	uint32_t	tss_esp2;
	uint16_t	tss_ss2;
	uint16_t	rsvd4;
	uint32_t	tss_cr3;
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	uint32_t	tss_eax;
	uint32_t	tss_ecx;
	uint32_t	tss_edx;
	uint32_t	tss_ebx;
	uint32_t	tss_esp;
	uint32_t	tss_ebp;
	uint32_t	tss_esi;
	uint32_t	tss_edi;
	uint16_t	tss_es;
	uint16_t	rsvd5;
	uint16_t	tss_cs;
	uint16_t	rsvd6;
	uint16_t	tss_ss;
	uint16_t	rsvd7;
	uint16_t	tss_ds;
	uint16_t	rsvd8;
	uint16_t	tss_fs;
	uint16_t	rsvd9;
	uint16_t	tss_gs;
	uint16_t	rsvd10;
	uint16_t	tss_ldt;
	uint16_t	rsvd11;
	uint16_t	tss_trap;
	uint16_t	tss_iomap;
};
static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed");

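/*
 * A segment selector encodes the requested privilege level in bits 1:0, the
 * table indicator (GDT vs. LDT) in bit 2 and the descriptor index in bits
 * 15:3.  SEL_START() is thus the byte offset of a descriptor within its
 * table, SEL_LIMIT() the offset of the descriptor's last byte, and
 * TSS_BUSY() tests the busy bit of a TSS descriptor type.
 */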
#define	SEL_START(sel)	(((sel) & ~0x7))
#define	SEL_LIMIT(sel)	(((sel) | 0x7))
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)

static uint64_t
GETREG(struct vcpu *vcpu, int reg)
{
	uint64_t val;
	int error;

	error = vm_get_register(vcpu, reg, &val);
	assert(error == 0);
	return (val);
}

static void
SETREG(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;

	error = vm_set_register(vcpu, reg, val);
	assert(error == 0);
}

static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
	struct seg_desc seg_desc;

	seg_desc.base = (u_int)USD_GETBASE(usd);
	if (usd->sd_gran)
		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
	else
		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
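	/*
	 * Pack the remaining attributes using the access-rights bit layout
	 * expected in 'seg_desc.access': type (including the S bit) in bits
	 * 4:0, DPL in 6:5, P in bit 7, AVL in bit 12, D/B in bit 14 and G in
	 * bit 15.
	 */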
	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
	seg_desc.access |= usd->sd_xx << 12;
	seg_desc.access |= usd->sd_def32 << 14;
	seg_desc.access |= usd->sd_gran << 15;

	return (seg_desc);
}

/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
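 *
 * For example, a fault on GDT selector 0x28 raised while delivering an
 * external interrupt (ext != 0) is reported with error code 0x29:
 * index 5 with EXT set, IDT and TI clear.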
 */
static void
sel_exception(struct vcpu *vcpu, int vector, uint16_t sel, int ext)
{
	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	sel &= ~0x3;
	if (ext)
		sel |= 0x1;
	vm_inject_fault(vcpu, vector, 1, sel);
}

/*
 * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
 * and non-zero otherwise.
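 *
 * For example, a GDT with limit 0x27 holds five 8-byte descriptors:
 * selector 0x20 passes (SEL_LIMIT(0x20) == 0x27) while selector 0x28
 * fails (SEL_LIMIT(0x28) == 0x2f).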
 */
static int
desc_table_limit_check(struct vcpu *vcpu, uint16_t sel)
{
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(vcpu, reg, &base, &limit, &access);
	assert(error == 0);

	if (reg == VM_REG_GUEST_LDTR) {
		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
			return (-1);
	}

	if (limit < SEL_LIMIT(sel))
		return (-1);
	else
		return (0);
}

/*
 * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced
 * by the selector 'sel'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
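 *
 * Two iovec entries are passed to vm_copy_setup() because the 8-byte
 * descriptor may straddle a page boundary in the guest's linear address
 * space.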
 */
static int
desc_table_rw(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, bool doread,
    int *faultptr)
{
	struct iovec iov[2];
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(vcpu, reg, &base, &limit, &access);
	assert(error == 0);
	assert(limit >= SEL_LIMIT(sel));

	error = vm_copy_setup(vcpu, paging, base + SEL_START(sel),
	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
	    faultptr);
	if (error || *faultptr)
		return (error);

	if (doread)
		vm_copyin(iov, desc, sizeof(*desc));
	else
		vm_copyout(desc, iov, sizeof(*desc));
	return (0);
}

static int
desc_table_read(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(vcpu, paging, sel, desc, true, faultptr));
}

static int
desc_table_write(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(vcpu, paging, sel, desc, false, faultptr));
}

/*
 * Read the TSS descriptor referenced by 'sel' into 'desc'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
read_tss_descriptor(struct vcpu *vcpu, struct vm_task_switch *ts,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	int error;

	assert(!ISLDT(sel));
	assert(IDXSEL(sel) != 0);

	/* Fetch the new TSS descriptor */
	if (desc_table_limit_check(vcpu, sel)) {
		if (ts->reason == TSR_IRET)
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
		else
			sel_exception(vcpu, IDT_GP, sel, ts->ext);
		return (1);
	}

	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(vcpu, &sup_paging, sel, desc, faultptr);
	return (error);
}

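/*
 * The helpers below classify the 5-bit descriptor type (which includes the
 * S bit): bit 4 distinguishes code/data from system descriptors, bit 3
 * selects code vs. data, bit 2 is expand-down (data) or conforming (code),
 * and bit 1 is writable (data) or readable (code).
 */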
static bool
code_desc(int sd_type)
{
	/* code descriptor */
	return ((sd_type & 0x18) == 0x18);
}

static bool
stack_desc(int sd_type)
{
	/* writable data descriptor */
	return ((sd_type & 0x1A) == 0x12);
}

static bool
data_desc(int sd_type)
{
	/* data descriptor or a readable code descriptor */
	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}

static bool
ldt_desc(int sd_type)
{

	return (sd_type == SDT_SYSLDT);
}

/*
 * Validate the descriptor 'seg_desc' associated with 'segment'.
 */
static int
validate_seg_desc(struct vcpu *vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	struct user_segment_descriptor usd;
	int error, idtvec;
	int cpl, dpl, rpl;
	uint16_t sel, cs;
	bool ldtseg, codeseg, stackseg, dataseg, conforming;

	ldtseg = codeseg = stackseg = dataseg = false;
	switch (segment) {
	case VM_REG_GUEST_LDTR:
		ldtseg = true;
		break;
	case VM_REG_GUEST_CS:
		codeseg = true;
		break;
	case VM_REG_GUEST_SS:
		stackseg = true;
		break;
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
		dataseg = true;
		break;
	default:
		assert(0);
	}

	/* Get the segment selector */
	sel = GETREG(vcpu, segment);

	/* LDT selector must point into the GDT */
	if (ldtseg && ISLDT(sel)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Descriptor table limit check */
	if (desc_table_limit_check(vcpu, sel)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* NULL selector */
	if (IDXSEL(sel) == 0) {
		/* Code and stack segment selectors cannot be NULL */
		if (codeseg || stackseg) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
		seg_desc->base = 0;
		seg_desc->limit = 0;
		seg_desc->access = 0x10000;	/* unusable */
		return (0);
	}

	/* Read the descriptor from the GDT/LDT */
	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(vcpu, &sup_paging, sel, &usd, faultptr);
	if (error || *faultptr)
		return (error);

	/* Verify that the descriptor type is compatible with the segment */
	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
	    (codeseg && !code_desc(usd.sd_type)) ||
	    (dataseg && !data_desc(usd.sd_type)) ||
	    (stackseg && !stack_desc(usd.sd_type))) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Segment must be marked present */
	if (!usd.sd_p) {
		if (ldtseg)
			idtvec = IDT_TS;
		else if (stackseg)
			idtvec = IDT_SS;
		else
			idtvec = IDT_NP;
		sel_exception(vcpu, idtvec, sel, ts->ext);
		return (1);
	}

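	/*
	 * Privilege checks follow the task-switch rules in the Intel SDM.
	 * The segment registers were already loaded with the selectors from
	 * the new TSS in tss32_restore(), so 'cpl' here is the RPL of the
	 * incoming %cs.
	 */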
	cs = GETREG(vcpu, VM_REG_GUEST_CS);
	cpl = cs & SEL_RPL_MASK;
	rpl = sel & SEL_RPL_MASK;
	dpl = usd.sd_dpl;

	if (stackseg && (rpl != cpl || dpl != cpl)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	if (codeseg) {
		conforming = (usd.sd_type & 0x4) ? true : false;
		if ((conforming && (cpl < dpl)) ||
		    (!conforming && (cpl != dpl))) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}

	if (dataseg) {
		/*
		 * A data segment is always non-conforming except when its
		 * descriptor is a readable, conforming code segment.
		 */
		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
			conforming = true;
		else
			conforming = false;

		if (!conforming && (rpl > dpl || cpl > dpl)) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}
	*seg_desc = usd_to_seg_desc(&usd);
	return (0);
}

static void
tss32_save(struct vcpu *vcpu, struct vm_task_switch *task_switch,
    uint32_t eip, struct tss32 *tss, struct iovec *iov)
{

	/* General purpose registers */
	tss->tss_eax = GETREG(vcpu, VM_REG_GUEST_RAX);
	tss->tss_ecx = GETREG(vcpu, VM_REG_GUEST_RCX);
	tss->tss_edx = GETREG(vcpu, VM_REG_GUEST_RDX);
	tss->tss_ebx = GETREG(vcpu, VM_REG_GUEST_RBX);
	tss->tss_esp = GETREG(vcpu, VM_REG_GUEST_RSP);
	tss->tss_ebp = GETREG(vcpu, VM_REG_GUEST_RBP);
	tss->tss_esi = GETREG(vcpu, VM_REG_GUEST_RSI);
	tss->tss_edi = GETREG(vcpu, VM_REG_GUEST_RDI);

	/* Segment selectors */
	tss->tss_es = GETREG(vcpu, VM_REG_GUEST_ES);
	tss->tss_cs = GETREG(vcpu, VM_REG_GUEST_CS);
	tss->tss_ss = GETREG(vcpu, VM_REG_GUEST_SS);
	tss->tss_ds = GETREG(vcpu, VM_REG_GUEST_DS);
	tss->tss_fs = GETREG(vcpu, VM_REG_GUEST_FS);
	tss->tss_gs = GETREG(vcpu, VM_REG_GUEST_GS);

	/* eflags and eip */
	tss->tss_eflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS);
	if (task_switch->reason == TSR_IRET)
		tss->tss_eflags &= ~PSL_NT;
	tss->tss_eip = eip;

	/* Copy updated old TSS into guest memory */
	vm_copyout(tss, iov, sizeof(struct tss32));
}

static void
update_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *sd)
{
	int error;

	error = vm_set_desc(vcpu, reg, sd->base, sd->limit, sd->access);
	assert(error == 0);
}

/*
 * Update the vcpu registers to reflect the state of the new task.
 */
static int
tss32_restore(struct vmctx *ctx, struct vcpu *vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
{
	struct seg_desc seg_desc, seg_desc2;
	uint64_t *pdpte, maxphyaddr, reserved;
	uint32_t eflags;
	int error, i;
	bool nested;

	nested = false;
	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
		tss->tss_link = ot_sel;
		nested = true;
	}

	eflags = tss->tss_eflags;
	if (nested)
		eflags |= PSL_NT;

	/* LDTR */
	SETREG(vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);

	/* PBDR */
	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
			/*
			 * XXX Assuming 36-bit MAXPHYADDR.
			 */
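			/*
			 * In PAE mode CR3 points to a 32-byte aligned page
			 * directory pointer table holding four 8-byte PDPTEs,
			 * hence the ~0x1f mask and 32-byte mapping below.
			 */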
			maxphyaddr = (1UL << 36) - 1;
			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
			for (i = 0; i < 4; i++) {
				/* Check reserved bits if the PDPTE is valid */
				if (!(pdpte[i] & 0x1))
					continue;
				/*
				 * Bits 2:1, 8:5 and bits above the processor's
				 * maximum physical address are reserved.
				 */
				reserved = ~maxphyaddr | 0x1E6;
				if (pdpte[i] & reserved) {
					vm_inject_gp(vcpu);
					return (1);
				}
			}
			SETREG(vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
		}
		SETREG(vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
		ts->paging.cr3 = tss->tss_cr3;
	}

	/* eflags and eip */
	SETREG(vcpu, VM_REG_GUEST_RFLAGS, eflags);
	SETREG(vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

	/* General purpose registers */
	SETREG(vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
	SETREG(vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
	SETREG(vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
	SETREG(vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
	SETREG(vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
	SETREG(vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
	SETREG(vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
	SETREG(vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

	/* Segment selectors */
	SETREG(vcpu, VM_REG_GUEST_ES, tss->tss_es);
	SETREG(vcpu, VM_REG_GUEST_CS, tss->tss_cs);
	SETREG(vcpu, VM_REG_GUEST_SS, tss->tss_ss);
	SETREG(vcpu, VM_REG_GUEST_DS, tss->tss_ds);
	SETREG(vcpu, VM_REG_GUEST_FS, tss->tss_fs);
	SETREG(vcpu, VM_REG_GUEST_GS, tss->tss_gs);

	/*
	 * If this is a nested task then write out the new TSS to update
	 * the previous link field.
	 */
	if (nested)
		vm_copyout(tss, iov, sizeof(*tss));

	/* Validate segment descriptors */
	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_LDTR, &seg_desc);

	/*
	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
	 *
	 * The SS and CS attribute checks on VM-entry are inter-dependent so
	 * we need to make sure that both segments are valid before updating
	 * either of them. This ensures that the VMCS state can pass the
	 * VM-entry checks so the guest can handle any exception injected
	 * during task switch emulation.
	 */
	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_CS, &seg_desc);
	update_seg_desc(vcpu, VM_REG_GUEST_SS, &seg_desc2);
	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_DS, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_ES, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_FS, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_GS, &seg_desc);

	return (0);
}


/*
 * Copy of vie_alignment_check() from vmm_instruction_emul.c
 */
static int
alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
{
	assert(size == 1 || size == 2 || size == 4 || size == 8);
	assert(cpl >= 0 && cpl <= 3);

	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
		return (0);

	return ((gla & (size - 1)) ? 1 : 0);
}

/*
 * Copy of vie_size2mask() from vmm_instruction_emul.c
 */
static uint64_t
size2mask(int size)
{
	switch (size) {
	case 1:
		return (0xff);
	case 2:
		return (0xffff);
	case 4:
		return (0xffffffff);
	case 8:
		return (0xffffffffffffffff);
	default:
		assert(0);
		/* not reached */
		return (0);
	}
}

/*
 * Copy of vie_calculate_gla() from vmm_instruction_emul.c
 */
static int
calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	assert(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS);
	assert((length == 1 || length == 2 || length == 4 || length == 8));
	assert((prot & ~(PROT_READ | PROT_WRITE)) == 0);

	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		assert(addrsize == 4 || addrsize == 8);
		glasize = 8;
	} else {
		assert(addrsize == 2 || addrsize == 4);
		glasize = 4;
		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		assert(SEG_DESC_PRESENT(desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		assert(type >= 16 && type <= 31);

		if (prot & PROT_READ) {
			/* #GP on a read access to an exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)			/* code segment */
				return (-1);

			if ((type & 0xA) == 0)		/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

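		/*
		 * Check the access one byte at a time so that wraparound of
		 * the effective address (modulo the address size) is caught
		 * by the limit checks.
		 */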
		while (length > 0) {
			offset &= size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}

	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= size2mask(addrsize);
	*gla = (segbase + firstoff) & size2mask(glasize);
	return (0);
}

/*
 * Push an error code on the stack of the new task. This is needed if the
 * task switch was triggered by a hardware exception that causes an error
 * code to be saved (e.g. #PF).
 */
static int
push_errcode(struct vcpu *vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode, int *faultptr)
{
	struct iovec iov[2];
	struct seg_desc seg_desc;
	int stacksize, bytes, error;
	uint64_t gla, cr0, rflags;
	uint32_t esp;
	uint16_t stacksel;

	*faultptr = 0;

	cr0 = GETREG(vcpu, VM_REG_GUEST_CR0);
	rflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS);
	stacksel = GETREG(vcpu, VM_REG_GUEST_SS);

	error = vm_get_desc(vcpu, VM_REG_GUEST_SS, &seg_desc.base,
	    &seg_desc.limit, &seg_desc.access);
	assert(error == 0);

	/*
	 * Section "Error Code" in the Intel SDM vol 3: the error code is
	 * pushed on the stack as a doubleword or word (depending on the
	 * default interrupt, trap or task gate size).
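	 *
	 * A 32-bit (386) TSS therefore implies a doubleword push and a
	 * 16-bit (286) TSS a word push.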
	 */
	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
		bytes = 4;
	else
		bytes = 2;

	/*
	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
	 * stack-segment descriptor determines the size of the stack
	 * pointer outside of 64-bit mode.
	 */
	if (SEG_DESC_DEF32(seg_desc.access))
		stacksize = 4;
	else
		stacksize = 2;

	esp = GETREG(vcpu, VM_REG_GUEST_RSP);
	esp -= bytes;

	if (calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
		sel_exception(vcpu, IDT_SS, stacksel, 1);
		*faultptr = 1;
		return (0);
	}

	if (alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
		vm_inject_ac(vcpu, 1);
		*faultptr = 1;
		return (0);
	}

	error = vm_copy_setup(vcpu, paging, gla, bytes, PROT_WRITE,
	    iov, nitems(iov), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyout(&errcode, iov, bytes);
	SETREG(vcpu, VM_REG_GUEST_RSP, esp);
	return (0);
}

/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 */
#define	CHKERR(error,fault)						\
	do {								\
		assert((error == 0) || (error == EFAULT));		\
		if (error)						\
			return (VMEXIT_ABORT);				\
		else if (fault)						\
			return (VMEXIT_CONTINUE);			\
	} while (0)

int
vmexit_task_switch(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vmexit)
{
	struct seg_desc nt;
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, fault, minlimit, nt_type, ot_type;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;

	assert(paging->cpu_mode == CPU_MODE_PROTECTED);

	/*
	 * Calculate the instruction pointer to store in the old TSS.
	 */
	eip = vmexit->rip + vmexit->inst_length;

	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */

	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(vcpu, task_switch, nt_sel, &nt_desc,
	    &fault);
	CHKERR(error, fault);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(vcpu, IDT_NP, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
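	 *
	 * 'minlimit' below is the smallest acceptable value of the new TSS
	 * descriptor limit, i.e. the minimum TSS size minus one.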
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
		minlimit = 104 - 1;
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
		minlimit = 44 - 1;
	else
		minlimit = 0;

	assert(minlimit > 0);
	if (nt.limit < (unsigned int)minlimit) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(vcpu, IDT_GP, nt_sel, ext);
		goto done;
	}

	/* Fetch the new TSS */
	error = vm_copy_setup(vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(nt_iov, &newtss, minlimit + 1);

	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR. In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(vcpu, IDT_TS, ot_sel, task_switch->ext);
		goto done;
	}

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	    &access);
	assert(error == 0);
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(vcpu, task_switch, ot_sel, &ot_desc,
	    &fault);
	CHKERR(error, fault);

	/* Get the old TSS */
	error = vm_copy_setup(vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ot_iov, &oldtss, minlimit + 1);

	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * is due to an IRET or JMP instruction.
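	 *
	 * The available and busy descriptor types differ only in bit 1 of the
	 * type field (e.g. SDT_SYS386TSS vs. SDT_SYS386BSY), so the busy bit
	 * is flipped directly in 'sd_type'.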
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(vcpu, &sup_paging, ot_sel,
		    &ot_desc, &fault);
		CHKERR(error, fault);
	}

	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		EPRINTLN("Task switch to 16-bit TSS not supported");
		return (VMEXIT_ABORT);
	}

	/* Save processor state in old TSS */
	tss32_save(vcpu, task_switch, eip, &oldtss, ot_iov);

	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(vcpu, &sup_paging, nt_sel,
		    &nt_desc, &fault);
		CHKERR(error, fault);
	}

	/* Update task register to point at the new TSS */
	SETREG(vcpu, VM_REG_GUEST_TR, nt_sel);

	/* Update the hidden descriptor state of the task register */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(vcpu, VM_REG_GUEST_TR, &nt);

	/* Set CR0.TS */
	cr0 = GETREG(vcpu, VM_REG_GUEST_CR0);
	SETREG(vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);

	/*
	 * We are now committed to the task switch. Any exceptions encountered
	 * after this point will be handled in the context of the new task and
	 * the saved instruction pointer will belong to the new task.
	 */
	error = vm_set_register(vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
	assert(error == 0);

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
	    &fault);
	CHKERR(error, fault);

	/*
	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
	 * caused an error code to be generated, this error code is copied
	 * to the stack of the new task.
	 */
	if (task_switch->errcode_valid) {
		assert(task_switch->ext);
		assert(task_switch->reason == TSR_IDT_GATE);
		error = push_errcode(vcpu, &task_switch->paging, nt_type,
		    task_switch->errcode, &fault);
		CHKERR(error, fault);
	}

	/*
	 * Treatment of virtual-NMI blocking if NMI is delivered through
	 * a task gate.
	 *
	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
	 * If the virtual NMIs VM-execution control is 1, VM entry injects
	 * an NMI, and delivery of the NMI causes a task switch that causes
	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
	 * commences.
	 *
	 * Thus, virtual-NMI blocking is in effect at the time of the task
	 * switch VM exit.
	 */

	/*
	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
	 *
	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation"
	 * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking.
	 * This unblocking of virtual-NMI occurs even if IRET causes a fault.
	 *
	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
	 * VM exit.
	 */

	/*
	 * If the task switch was triggered by an event delivered through
	 * the IDT then extinguish the pending event from the vcpu's
	 * exitintinfo.
	 */
	if (task_switch->reason == TSR_IDT_GATE) {
		error = vm_set_intinfo(vcpu, 0);
		assert(error == 0);
	}

	/*
	 * XXX should inject debug exception if 'T' bit is 1
	 */
done:
	return (VMEXIT_CONTINUE);
}
