/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2020 Oxide Computer Company
 */

#include <sys/cdefs.h>

#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>

#include <x86/psl.h>
#include <x86/segments.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#include <vmmapi.h>

#include "bhyverun.h"
#include "debug.h"
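/*
 * Emulation of hardware task switches (Intel SDM Vol 3, "Task Management")
 * on behalf of the guest, invoked when a task switch causes a VM exit.
 */
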
/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
struct tss32 {
	uint16_t	tss_link;
	uint16_t	rsvd1;
	uint32_t	tss_esp0;
	uint16_t	tss_ss0;
	uint16_t	rsvd2;
	uint32_t	tss_esp1;
	uint16_t	tss_ss1;
	uint16_t	rsvd3;
	uint32_t	tss_esp2;
	uint16_t	tss_ss2;
	uint16_t	rsvd4;
	uint32_t	tss_cr3;
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	uint32_t	tss_eax;
	uint32_t	tss_ecx;
	uint32_t	tss_edx;
	uint32_t	tss_ebx;
	uint32_t	tss_esp;
	uint32_t	tss_ebp;
	uint32_t	tss_esi;
	uint32_t	tss_edi;
	uint16_t	tss_es;
	uint16_t	rsvd5;
	uint16_t	tss_cs;
	uint16_t	rsvd6;
	uint16_t	tss_ss;
	uint16_t	rsvd7;
	uint16_t	tss_ds;
	uint16_t	rsvd8;
	uint16_t	tss_fs;
	uint16_t	rsvd9;
	uint16_t	tss_gs;
	uint16_t	rsvd10;
	uint16_t	tss_ldt;
	uint16_t	rsvd11;
	uint16_t	tss_trap;
	uint16_t	tss_iomap;
};
static_assert(sizeof(struct tss32) == 104, "struct tss32 must be 104 bytes");

#define	SEL_START(sel)	(((sel) & ~0x7))
#define	SEL_LIMIT(sel)	(((sel) | 0x7))
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)
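
/*
 * Example: sel = 0x001b encodes GDT index 3, TI = 0, RPL = 3, so
 * SEL_START(sel) = 0x18 and SEL_LIMIT(sel) = 0x1f are the offsets of the
 * first and last byte of its 8-byte descriptor within the table.
 */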

static uint64_t
GETREG(struct vcpu *vcpu, int reg)
{
	uint64_t val;
	int error;

	error = vm_get_register(vcpu, reg, &val);
	assert(error == 0);
	return (val);
}

static void
SETREG(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;

	error = vm_set_register(vcpu, reg, val);
	assert(error == 0);
}

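/*
 * Convert a descriptor-table entry into the 'struct seg_desc' form used by
 * vmmapi.  The packed 'access' word uses the same bit layout as the VMCS
 * guest segment access rights (e.g. bit 16 marks the segment unusable).
 */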
static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
	struct seg_desc seg_desc;

	seg_desc.base = (u_int)USD_GETBASE(usd);
	if (usd->sd_gran)
		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
	else
		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
	seg_desc.access |= usd->sd_xx << 12;
	seg_desc.access |= usd->sd_def32 << 14;
	seg_desc.access |= usd->sd_gran << 15;

	return (seg_desc);
}

/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
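/*
 * For example, a #TS for selector 0x28 (GDT index 5) raised while
 * delivering an external interrupt (ext = 1) carries error code 0x29.
 */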
static void
sel_exception(struct vcpu *vcpu, int vector, uint16_t sel, int ext)
{
	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	sel &= ~0x3;
	if (ext)
		sel |= 0x1;
	vm_inject_fault(vcpu, vector, 1, sel);
}

/*
 * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
 * and non-zero otherwise.
 */
static int
desc_table_limit_check(struct vcpu *vcpu, uint16_t sel)
{
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(vcpu, reg, &base, &limit, &access);
	assert(error == 0);

	if (reg == VM_REG_GUEST_LDTR) {
		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
			return (-1);
	}

	if (limit < SEL_LIMIT(sel))
		return (-1);
	else
		return (0);
}

/*
 * Read or write the segment descriptor 'desc' from/to the GDT/LDT slot
 * referenced by the selector 'sel'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
desc_table_rw(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, bool doread,
    int *faultptr)
{
	struct iovec iov[2];
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(vcpu, reg, &base, &limit, &access);
	assert(error == 0);
	assert(limit >= SEL_LIMIT(sel));

	error = vm_copy_setup(vcpu, paging, base + SEL_START(sel),
	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
	    faultptr);
	if (error || *faultptr)
		return (error);

	if (doread)
		vm_copyin(iov, desc, sizeof(*desc));
	else
		vm_copyout(desc, iov, sizeof(*desc));
	return (0);
}

static int
desc_table_read(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(vcpu, paging, sel, desc, true, faultptr));
}

static int
desc_table_write(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(vcpu, paging, sel, desc, false, faultptr));
}

/*
 * Read the TSS descriptor referenced by 'sel' into 'desc'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
read_tss_descriptor(struct vcpu *vcpu, struct vm_task_switch *ts,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	int error;

	assert(!ISLDT(sel));
	assert(IDXSEL(sel) != 0);

	/* Fetch the new TSS descriptor */
	if (desc_table_limit_check(vcpu, sel)) {
		if (ts->reason == TSR_IRET)
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
		else
			sel_exception(vcpu, IDT_GP, sel, ts->ext);
		return (1);
	}

	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(vcpu, &sup_paging, sel, desc, faultptr);
	return (error);
}

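/*
 * The segment-type predicates below decode the 5-bit 'sd_type' field:
 * bit 4 (0x10, the S bit) is set for code and data segments, bit 3 (0x8)
 * distinguishes code from data, bit 2 (0x4) means conforming (code) or
 * expand-down (data), bit 1 (0x2) means readable (code) or writable
 * (data), and bit 0 is the accessed bit.
 */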
static bool
code_desc(int sd_type)
{
	/* code descriptor */
	return ((sd_type & 0x18) == 0x18);
}

static bool
stack_desc(int sd_type)
{
	/* writable data descriptor */
	return ((sd_type & 0x1A) == 0x12);
}

static bool
data_desc(int sd_type)
{
	/* data descriptor or a readable code descriptor */
	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}

static bool
ldt_desc(int sd_type)
{

	return (sd_type == SDT_SYSLDT);
}

/*
 * Validate the descriptor 'seg_desc' associated with 'segment'.
 */
static int
validate_seg_desc(struct vcpu *vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	struct user_segment_descriptor usd;
	int error, idtvec;
	int cpl, dpl, rpl;
	uint16_t sel, cs;
	bool ldtseg, codeseg, stackseg, dataseg, conforming;

	ldtseg = codeseg = stackseg = dataseg = false;
	switch (segment) {
	case VM_REG_GUEST_LDTR:
		ldtseg = true;
		break;
	case VM_REG_GUEST_CS:
		codeseg = true;
		break;
	case VM_REG_GUEST_SS:
		stackseg = true;
		break;
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
		dataseg = true;
		break;
	default:
		assert(0);
	}

	/* Get the segment selector */
	sel = GETREG(vcpu, segment);

	/* LDT selector must point into the GDT */
	if (ldtseg && ISLDT(sel)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Descriptor table limit check */
	if (desc_table_limit_check(vcpu, sel)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* NULL selector */
	if (IDXSEL(sel) == 0) {
		/* Code and stack segment selectors cannot be NULL */
		if (codeseg || stackseg) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
		seg_desc->base = 0;
		seg_desc->limit = 0;
		seg_desc->access = 0x10000;	/* unusable */
		return (0);
	}

	/* Read the descriptor from the GDT/LDT */
	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(vcpu, &sup_paging, sel, &usd, faultptr);
	if (error || *faultptr)
		return (error);

	/* Verify that the descriptor type is compatible with the segment */
	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
	    (codeseg && !code_desc(usd.sd_type)) ||
	    (dataseg && !data_desc(usd.sd_type)) ||
	    (stackseg && !stack_desc(usd.sd_type))) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Segment must be marked present */
	if (!usd.sd_p) {
		if (ldtseg)
			idtvec = IDT_TS;
		else if (stackseg)
			idtvec = IDT_SS;
		else
			idtvec = IDT_NP;
		sel_exception(vcpu, idtvec, sel, ts->ext);
		return (1);
	}

	cs = GETREG(vcpu, VM_REG_GUEST_CS);
	cpl = cs & SEL_RPL_MASK;
	rpl = sel & SEL_RPL_MASK;
	dpl = usd.sd_dpl;

	if (stackseg && (rpl != cpl || dpl != cpl)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	if (codeseg) {
		conforming = (usd.sd_type & 0x4) ? true : false;
		if ((conforming && (cpl < dpl)) ||
		    (!conforming && (cpl != dpl))) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}

	if (dataseg) {
		/*
		 * A data segment is always non-conforming except when its
		 * descriptor is a readable, conforming code segment.
		 */
		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
			conforming = true;
		else
			conforming = false;

		if (!conforming && (rpl > dpl || cpl > dpl)) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}
	*seg_desc = usd_to_seg_desc(&usd);
	return (0);
}

static void
tss32_save(struct vcpu *vcpu, struct vm_task_switch *task_switch,
    uint32_t eip, struct tss32 *tss, struct iovec *iov)
{

	/* General purpose registers */
	tss->tss_eax = GETREG(vcpu, VM_REG_GUEST_RAX);
	tss->tss_ecx = GETREG(vcpu, VM_REG_GUEST_RCX);
	tss->tss_edx = GETREG(vcpu, VM_REG_GUEST_RDX);
	tss->tss_ebx = GETREG(vcpu, VM_REG_GUEST_RBX);
	tss->tss_esp = GETREG(vcpu, VM_REG_GUEST_RSP);
	tss->tss_ebp = GETREG(vcpu, VM_REG_GUEST_RBP);
	tss->tss_esi = GETREG(vcpu, VM_REG_GUEST_RSI);
	tss->tss_edi = GETREG(vcpu, VM_REG_GUEST_RDI);

	/* Segment selectors */
	tss->tss_es = GETREG(vcpu, VM_REG_GUEST_ES);
	tss->tss_cs = GETREG(vcpu, VM_REG_GUEST_CS);
	tss->tss_ss = GETREG(vcpu, VM_REG_GUEST_SS);
	tss->tss_ds = GETREG(vcpu, VM_REG_GUEST_DS);
	tss->tss_fs = GETREG(vcpu, VM_REG_GUEST_FS);
	tss->tss_gs = GETREG(vcpu, VM_REG_GUEST_GS);

	/* eflags and eip */
	tss->tss_eflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS);
	if (task_switch->reason == TSR_IRET)
		tss->tss_eflags &= ~PSL_NT;
	tss->tss_eip = eip;

	/* Copy updated old TSS into guest memory */
	vm_copyout(tss, iov, sizeof(struct tss32));
}

static void
update_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *sd)
{
	int error;

	error = vm_set_desc(vcpu, reg, sd->base, sd->limit, sd->access);
	assert(error == 0);
}

/*
 * Update the vcpu registers to reflect the state of the new task.
 */
static int
tss32_restore(struct vmctx *ctx, struct vcpu *vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
{
	struct seg_desc seg_desc, seg_desc2;
	uint64_t *pdpte, maxphyaddr, reserved;
	uint32_t eflags;
	int error, i;
	bool nested;

	nested = false;
	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
		tss->tss_link = ot_sel;
		nested = true;
	}

	eflags = tss->tss_eflags;
	if (nested)
		eflags |= PSL_NT;

	/* LDTR */
	SETREG(vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);

	/* PDBR (CR3) */
	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
			/*
			 * XXX Assuming 36-bit MAXPHYADDR.
			 */
			maxphyaddr = (1UL << 36) - 1;
			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
			for (i = 0; i < 4; i++) {
				/* Check reserved bits if the PDPTE is valid */
				if (!(pdpte[i] & 0x1))
					continue;
				/*
				 * Bits 2:1, 8:5 and bits above the processor's
				 * maximum physical address are reserved.
				 */
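				/* 0x1E6 == 0x1E0 (bits 8:5) | 0x6 (bits 2:1) */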
				reserved = ~maxphyaddr | 0x1E6;
				if (pdpte[i] & reserved) {
					vm_inject_gp(vcpu);
					return (1);
				}
			}
			SETREG(vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
		}
		SETREG(vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
		ts->paging.cr3 = tss->tss_cr3;
	}

	/* eflags and eip */
	SETREG(vcpu, VM_REG_GUEST_RFLAGS, eflags);
	SETREG(vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

	/* General purpose registers */
	SETREG(vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
	SETREG(vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
	SETREG(vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
	SETREG(vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
	SETREG(vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
	SETREG(vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
	SETREG(vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
	SETREG(vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

	/* Segment selectors */
	SETREG(vcpu, VM_REG_GUEST_ES, tss->tss_es);
	SETREG(vcpu, VM_REG_GUEST_CS, tss->tss_cs);
	SETREG(vcpu, VM_REG_GUEST_SS, tss->tss_ss);
	SETREG(vcpu, VM_REG_GUEST_DS, tss->tss_ds);
	SETREG(vcpu, VM_REG_GUEST_FS, tss->tss_fs);
	SETREG(vcpu, VM_REG_GUEST_GS, tss->tss_gs);

	/*
	 * If this is a nested task then write out the new TSS to update
	 * the previous link field.
	 */
	if (nested)
		vm_copyout(tss, iov, sizeof(*tss));

	/* Validate segment descriptors */
	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_LDTR, &seg_desc);

	/*
	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
	 *
	 * The SS and CS attribute checks on VM-entry are inter-dependent so
	 * we need to make sure that both segments are valid before updating
	 * either of them. This ensures that the VMCS state can pass the
	 * VM-entry checks so the guest can handle any exception injected
	 * during task switch emulation.
	 */
	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_CS, &seg_desc);
	update_seg_desc(vcpu, VM_REG_GUEST_SS, &seg_desc2);
	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_DS, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_ES, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_FS, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_GS, &seg_desc);

	return (0);
}


/*
 * Copy of vie_alignment_check() from vmm_instruction_emul.c
 */
static int
alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
{
	assert(size == 1 || size == 2 || size == 4 || size == 8);
	assert(cpl >= 0 && cpl <= 3);

	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
		return (0);

	return ((gla & (size - 1)) ? 1 : 0);
}

/*
 * Copy of vie_size2mask() from vmm_instruction_emul.c
 */
static uint64_t
size2mask(int size)
{
	switch (size) {
	case 1:
		return (0xff);
	case 2:
		return (0xffff);
	case 4:
		return (0xffffffff);
	case 8:
		return (0xffffffffffffffff);
	default:
		assert(0);
		/* not reached */
		return (0);
	}
}

/*
 * Copy of vie_calculate_gla() from vmm_instruction_emul.c
 */
static int
calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	assert(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS);
	assert((length == 1 || length == 2 || length == 4 || length == 8));
	assert((prot & ~(PROT_READ | PROT_WRITE)) == 0);

	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		assert(addrsize == 4 || addrsize == 8);
		glasize = 8;
	} else {
		assert(addrsize == 2 || addrsize == 4);
		glasize = 4;
		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		assert(SEG_DESC_PRESENT(desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		assert(type >= 16 && type <= 31);

		if (prot & PROT_READ) {
			/* #GP on a read access to an exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)			/* code segment */
				return (-1);

			if ((type & 0xA) == 0)		/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

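		/* Verify every byte of the access lies within the limits. */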
		while (length > 0) {
			offset &= size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}

	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= size2mask(addrsize);
	*gla = (segbase + firstoff) & size2mask(glasize);
	return (0);
}

/*
 * Push an error code on the stack of the new task. This is needed if the
 * task switch was triggered by a hardware exception that causes an error
 * code to be saved (e.g. #PF).
 */
static int
push_errcode(struct vcpu *vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode, int *faultptr)
{
	struct iovec iov[2];
	struct seg_desc seg_desc;
	int stacksize, bytes, error;
	uint64_t gla, cr0, rflags;
	uint32_t esp;
	uint16_t stacksel;

	*faultptr = 0;

	cr0 = GETREG(vcpu, VM_REG_GUEST_CR0);
	rflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS);
	stacksel = GETREG(vcpu, VM_REG_GUEST_SS);

	error = vm_get_desc(vcpu, VM_REG_GUEST_SS, &seg_desc.base,
	    &seg_desc.limit, &seg_desc.access);
	assert(error == 0);

	/*
	 * Section "Error Code" in the Intel SDM vol 3: the error code is
	 * pushed on the stack as a doubleword or word (depending on the
	 * default interrupt, trap or task gate size).
	 */
	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
		bytes = 4;
	else
		bytes = 2;

	/*
	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
	 * stack-segment descriptor determines the size of the stack
	 * pointer outside of 64-bit mode.
	 */
	if (SEG_DESC_DEF32(seg_desc.access))
		stacksize = 4;
	else
		stacksize = 2;

	esp = GETREG(vcpu, VM_REG_GUEST_RSP);
	esp -= bytes;

	if (calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
		sel_exception(vcpu, IDT_SS, stacksel, 1);
		*faultptr = 1;
		return (0);
	}

	if (alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
		vm_inject_ac(vcpu, 1);
		*faultptr = 1;
		return (0);
	}

	error = vm_copy_setup(vcpu, paging, gla, bytes, PROT_WRITE,
	    iov, nitems(iov), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyout(&errcode, iov, bytes);
	SETREG(vcpu, VM_REG_GUEST_RSP, esp);
	return (0);
}

/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 */
#define	CHKERR(error, fault)						\
	do {								\
		assert((error == 0) || (error == EFAULT));		\
		if (error)						\
			return (VMEXIT_ABORT);				\
		else if (fault)						\
			return (VMEXIT_CONTINUE);			\
	} while (0)

int
vmexit_task_switch(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vmexit)
{
	struct seg_desc nt;
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, fault, minlimit, nt_type, ot_type;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;

	assert(paging->cpu_mode == CPU_MODE_PROTECTED);

	/*
	 * Calculate the instruction pointer to store in the old TSS.
	 */
	eip = vmexit->rip + vmexit->inst_length;

	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */

	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(vcpu, task_switch, nt_sel, &nt_desc,
	    &fault);
	CHKERR(error, fault);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(vcpu, IDT_NP, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
		minlimit = 104 - 1;
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
		minlimit = 44 - 1;
	else
		minlimit = 0;

	assert(minlimit > 0);
	if (nt.limit < (unsigned int)minlimit) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(vcpu, IDT_GP, nt_sel, ext);
		goto done;
	}

	/* Fetch the new TSS */
	error = vm_copy_setup(vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(nt_iov, &newtss, minlimit + 1);

	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR. In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(vcpu, IDT_TS, ot_sel, task_switch->ext);
		goto done;
	}

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	    &access);
	assert(error == 0);
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(vcpu, task_switch, ot_sel, &ot_desc,
	    &fault);
	CHKERR(error, fault);

	/* Get the old TSS */
	error = vm_copy_setup(vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ot_iov, &oldtss, minlimit + 1);

	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * was due to an IRET or JMP instruction.
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(vcpu, &sup_paging, ot_sel,
		    &ot_desc, &fault);
		CHKERR(error, fault);
	}

	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		EPRINTLN("Task switch to 16-bit TSS not supported");
		return (VMEXIT_ABORT);
	}

	/* Save processor state in old TSS */
	tss32_save(vcpu, task_switch, eip, &oldtss, ot_iov);

	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(vcpu, &sup_paging, nt_sel,
		    &nt_desc, &fault);
		CHKERR(error, fault);
	}

	/* Update task register to point at the new TSS */
	SETREG(vcpu, VM_REG_GUEST_TR, nt_sel);

	/* Update the hidden descriptor state of the task register */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(vcpu, VM_REG_GUEST_TR, &nt);

	/* Set CR0.TS */
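	/*
	 * A hardware task switch always sets CR0.TS; the first FPU
	 * instruction executed by the new task then raises #NM so the
	 * guest OS can switch the FPU context lazily.
	 */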
	cr0 = GETREG(vcpu, VM_REG_GUEST_CR0);
	SETREG(vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);

	/*
	 * We are now committed to the task switch. Any exceptions encountered
	 * after this point will be handled in the context of the new task and
	 * the saved instruction pointer will belong to the new task.
	 */
	error = vm_set_register(vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
	assert(error == 0);

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
	    &fault);
	CHKERR(error, fault);

	/*
	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
	 * caused an error code to be generated, this error code is copied
	 * to the stack of the new task.
	 */
	if (task_switch->errcode_valid) {
		assert(task_switch->ext);
		assert(task_switch->reason == TSR_IDT_GATE);
		error = push_errcode(vcpu, &task_switch->paging, nt_type,
		    task_switch->errcode, &fault);
		CHKERR(error, fault);
	}

	/*
	 * Treatment of virtual-NMI blocking if NMI is delivered through
	 * a task gate.
	 *
	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
	 * If the virtual NMIs VM-execution control is 1, VM entry injects
	 * an NMI, and delivery of the NMI causes a task switch that causes
	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
	 * commences.
	 *
	 * Thus, virtual-NMI blocking is in effect at the time of the task
	 * switch VM exit.
	 */

	/*
	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
	 *
	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation"
	 * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking.
	 * This unblocking of virtual-NMI occurs even if IRET causes a fault.
	 *
	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
	 * VM exit.
	 */

	/*
	 * If the task switch was triggered by an event delivered through
	 * the IDT then extinguish the pending event from the vcpu's
	 * exitintinfo.
	 */
	if (task_switch->reason == TSR_IDT_GATE) {
		error = vm_set_intinfo(vcpu, 0);
		assert(error == 0);
	}

	/*
	 * XXX should inject debug exception if 'T' bit is 1
	 */
done:
	return (VMEXIT_CONTINUE);
}