/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2020 Oxide Computer Company
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>

#include <x86/psl.h>
#include <x86/segments.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#include <vmmapi.h>

#include "bhyverun.h"
#include "debug.h"
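/*
 * This file emulates hardware task switches through a 32-bit TSS, as
 * described in the "Task Management" chapter of the Intel SDM, Vol 3.
 * The entry point is vmexit_task_switch() at the bottom of the file;
 * everything above it is helper code for descriptor table access,
 * selector validation and TSS save/restore.
 */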
/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
struct tss32 {
	uint16_t	tss_link;
	uint16_t	rsvd1;
	uint32_t	tss_esp0;
	uint16_t	tss_ss0;
	uint16_t	rsvd2;
	uint32_t	tss_esp1;
	uint16_t	tss_ss1;
	uint16_t	rsvd3;
	uint32_t	tss_esp2;
	uint16_t	tss_ss2;
	uint16_t	rsvd4;
	uint32_t	tss_cr3;
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	uint32_t	tss_eax;
	uint32_t	tss_ecx;
	uint32_t	tss_edx;
	uint32_t	tss_ebx;
	uint32_t	tss_esp;
	uint32_t	tss_ebp;
	uint32_t	tss_esi;
	uint32_t	tss_edi;
	uint16_t	tss_es;
	uint16_t	rsvd5;
	uint16_t	tss_cs;
	uint16_t	rsvd6;
	uint16_t	tss_ss;
	uint16_t	rsvd7;
	uint16_t	tss_ds;
	uint16_t	rsvd8;
	uint16_t	tss_fs;
	uint16_t	rsvd9;
	uint16_t	tss_gs;
	uint16_t	rsvd10;
	uint16_t	tss_ldt;
	uint16_t	rsvd11;
	uint16_t	tss_trap;
	uint16_t	tss_iomap;
};
static_assert(sizeof(struct tss32) == 104, "struct tss32 must be 104 bytes");

#define	SEL_START(sel)	(((sel) & ~0x7))
#define	SEL_LIMIT(sel)	(((sel) | 0x7))
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)
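/*
 * A selector is (index << 3) | TI | RPL.  For example, selector 0x1f
 * names descriptor index 3 with TI = 1 (LDT) and RPL = 3:
 * SEL_START(0x1f) == 0x18 is the byte offset of that 8-byte descriptor
 * within the table, and SEL_LIMIT(0x1f) == 0x1f is the offset of its
 * last byte, which must lie within the table limit for the selector
 * to be valid.
 */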
static uint64_t
GETREG(struct vmctx *ctx, int vcpu, int reg)
{
	uint64_t val;
	int error;

	error = vm_get_register(ctx, vcpu, reg, &val);
	assert(error == 0);
	return (val);
}

static void
SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
	int error;

	error = vm_set_register(ctx, vcpu, reg, val);
	assert(error == 0);
}

static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
	struct seg_desc seg_desc;

	seg_desc.base = (u_int)USD_GETBASE(usd);
	if (usd->sd_gran)
		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
	else
		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
	seg_desc.access |= usd->sd_xx << 12;
	seg_desc.access |= usd->sd_def32 << 14;
	seg_desc.access |= usd->sd_gran << 15;

	return (seg_desc);
}
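/*
 * For example, a flat 4GB descriptor has sd_gran = 1 and a limit field
 * of 0xfffff, which usd_to_seg_desc() expands to
 * (0xfffff << 12) | 0xfff == 0xffffffff.  With sd_gran = 0 the 20-bit
 * limit field is interpreted as a byte-granular limit instead.
 */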
/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
static void
sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
{
	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	sel &= ~0x3;
	if (ext)
		sel |= 0x1;
	vm_inject_fault(ctx, vcpu, vector, 1, sel);
}
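/*
 * For example, a bad selector 0x1c (TI = 1) encountered during delivery
 * of an external interrupt (ext = 1) produces the error code 0x1d:
 * bit 2 (TI) preserved, bit 1 (IDT) clear and bit 0 (EXT) set.
 */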
/*
 * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
 * and non-zero otherwise.
 */
static int
desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
{
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);

	if (reg == VM_REG_GUEST_LDTR) {
		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
			return (-1);
	}

	if (limit < SEL_LIMIT(sel))
		return (-1);
	else
		return (0);
}

/*
 * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced
 * by the selector 'sel'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, bool doread,
    int *faultptr)
{
	struct iovec iov[2];
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);
	assert(limit >= SEL_LIMIT(sel));

	error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel),
	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
	    faultptr);
	if (error || *faultptr)
		return (error);

	if (doread)
		vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
	else
		vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
	return (0);
}

static int
desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr));
}

static int
desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr));
}

/*
 * Read the TSS descriptor referenced by 'sel' into 'desc'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	int error;

	assert(!ISLDT(sel));
	assert(IDXSEL(sel) != 0);

	/* Fetch the new TSS descriptor */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		if (ts->reason == TSR_IRET)
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		else
			sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext);
		return (1);
	}

	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr);
	return (error);
}

static bool
code_desc(int sd_type)
{
	/* code descriptor */
	return ((sd_type & 0x18) == 0x18);
}

static bool
stack_desc(int sd_type)
{
	/* writable data descriptor */
	return ((sd_type & 0x1A) == 0x12);
}

static bool
data_desc(int sd_type)
{
	/* data descriptor or a readable code descriptor */
	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}

static bool
ldt_desc(int sd_type)
{

	return (sd_type == SDT_SYSLDT);
}
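/*
 * The predicates above test the 5-bit sd_type field: bit 4 (0x10)
 * distinguishes code/data segments from system descriptors, bit 3 (0x8)
 * distinguishes code from data, bit 2 (0x4) is conforming (code) or
 * expand-down (data), bit 1 (0x2) is readable (code) or writable (data)
 * and bit 0 is the accessed bit.  Hence the mask/value pairs: 0x18/0x18
 * matches any code segment, 0x1A/0x12 a writable data segment, and
 * 0x1A/0x1A a readable code segment.
 */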
/*
 * Validate the descriptor 'seg_desc' associated with 'segment'.
 */
static int
validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	struct user_segment_descriptor usd;
	int error, idtvec;
	int cpl, dpl, rpl;
	uint16_t sel, cs;
	bool ldtseg, codeseg, stackseg, dataseg, conforming;

	ldtseg = codeseg = stackseg = dataseg = false;
	switch (segment) {
	case VM_REG_GUEST_LDTR:
		ldtseg = true;
		break;
	case VM_REG_GUEST_CS:
		codeseg = true;
		break;
	case VM_REG_GUEST_SS:
		stackseg = true;
		break;
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
		dataseg = true;
		break;
	default:
		assert(0);
	}

	/* Get the segment selector */
	sel = GETREG(ctx, vcpu, segment);

	/* LDT selector must point into the GDT */
	if (ldtseg && ISLDT(sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Descriptor table limit check */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* NULL selector */
	if (IDXSEL(sel) == 0) {
		/* Code and stack segment selectors cannot be NULL */
		if (codeseg || stackseg) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
		seg_desc->base = 0;
		seg_desc->limit = 0;
		seg_desc->access = 0x10000;	/* unusable */
		return (0);
	}

	/* Read the descriptor from the GDT/LDT */
	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr);
	if (error || *faultptr)
		return (error);

	/* Verify that the descriptor type is compatible with the segment */
	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
	    (codeseg && !code_desc(usd.sd_type)) ||
	    (dataseg && !data_desc(usd.sd_type)) ||
	    (stackseg && !stack_desc(usd.sd_type))) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Segment must be marked present */
	if (!usd.sd_p) {
		if (ldtseg)
			idtvec = IDT_TS;
		else if (stackseg)
			idtvec = IDT_SS;
		else
			idtvec = IDT_NP;
		sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
		return (1);
	}

	cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	cpl = cs & SEL_RPL_MASK;
	rpl = sel & SEL_RPL_MASK;
	dpl = usd.sd_dpl;

	if (stackseg && (rpl != cpl || dpl != cpl)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	if (codeseg) {
		conforming = (usd.sd_type & 0x4) ? true : false;
		if ((conforming && (cpl < dpl)) ||
		    (!conforming && (cpl != dpl))) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}

	if (dataseg) {
		/*
		 * A data segment is always non-conforming except when its
		 * descriptor is a readable, conforming code segment.
		 */
		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
			conforming = true;
		else
			conforming = false;

		if (!conforming && (rpl > dpl || cpl > dpl)) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}
	*seg_desc = usd_to_seg_desc(&usd);
	return (0);
}
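/*
 * Example of the privilege checks above: if the incoming CS selector has
 * RPL 3, the new CPL is 3, so SS must have both RPL 3 and DPL 3, and a
 * non-conforming data segment with DPL 1 faults with #TS because
 * cpl > dpl.
 */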
static void
tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
    uint32_t eip, struct tss32 *tss, struct iovec *iov)
{

	/* General purpose registers */
	tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
	tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
	tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
	tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
	tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
	tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
	tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);

	/* Segment selectors */
	tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
	tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
	tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
	tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
	tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);

	/* eflags and eip */
	tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	if (task_switch->reason == TSR_IRET)
		tss->tss_eflags &= ~PSL_NT;
	tss->tss_eip = eip;

	/* Copy updated old TSS into guest memory */
	vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
}

static void
update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
{
	int error;

	error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
	assert(error == 0);
}

/*
 * Update the vcpu registers to reflect the state of the new task.
 */
static int
tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
{
	struct seg_desc seg_desc, seg_desc2;
	uint64_t *pdpte, maxphyaddr, reserved;
	uint32_t eflags;
	int error, i;
	bool nested;

	nested = false;
	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
		tss->tss_link = ot_sel;
		nested = true;
	}

	eflags = tss->tss_eflags;
	if (nested)
		eflags |= PSL_NT;

	/* LDTR */
	SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);

	/* PDBR (CR3) */
	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
			/*
			 * XXX Assuming 36-bit MAXPHYADDR.
			 */
			maxphyaddr = (1UL << 36) - 1;
			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
			for (i = 0; i < 4; i++) {
				/* Check reserved bits if the PDPTE is valid */
				if (!(pdpte[i] & 0x1))
					continue;
				/*
				 * Bits 2:1, 8:5 and bits above the processor's
				 * maximum physical address are reserved.
				 */
				reserved = ~maxphyaddr | 0x1E6;
				if (pdpte[i] & reserved) {
					vm_inject_gp(ctx, vcpu);
					return (1);
				}
			}
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
		}
		SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
		ts->paging.cr3 = tss->tss_cr3;
	}

	/* eflags and eip */
	SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
	SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

	/* General purpose registers */
	SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
	SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

	/* Segment selectors */
	SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
	SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
	SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
	SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
	SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
	SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);

	/*
	 * If this is a nested task then write out the new TSS to update
	 * the previous link field.
	 */
	if (nested)
		vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));

	/* Validate segment descriptors */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);

	/*
	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
	 *
	 * The SS and CS attribute checks on VM-entry are inter-dependent so
	 * we need to make sure that both segments are valid before updating
	 * either of them. This ensures that the VMCS state can pass the
	 * VM-entry checks so the guest can handle any exception injected
	 * during task switch emulation.
	 */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);

	return (0);
}
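/*
 * For example, a task switch through an interrupt gate or a CALL
 * (any reason other than TSR_IRET or TSR_JMP) is "nested": the old TSS
 * selector is stored in the new task's tss_link and EFLAGS.NT is set,
 * so that a later IRET in the new task switches back.  IRET and JMP
 * switches do not link the tasks.
 */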
/*
 * Copy of vie_alignment_check() from vmm_instruction_emul.c
 */
static int
alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
{
	assert(size == 1 || size == 2 || size == 4 || size == 8);
	assert(cpl >= 0 && cpl <= 3);

	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
		return (0);

	return ((gla & (size - 1)) ? 1 : 0);
}

/*
 * Copy of vie_size2mask() from vmm_instruction_emul.c
 */
static uint64_t
size2mask(int size)
{
	switch (size) {
	case 1:
		return (0xff);
	case 2:
		return (0xffff);
	case 4:
		return (0xffffffff);
	case 8:
		return (0xffffffffffffffff);
	default:
		assert(0);
		/* not reached */
		return (0);
	}
}

/*
 * Copy of vie_calculate_gla() from vmm_instruction_emul.c
 */
static int
calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	assert(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS);
	assert((length == 1 || length == 2 || length == 4 || length == 8));
	assert((prot & ~(PROT_READ | PROT_WRITE)) == 0);

	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		assert(addrsize == 4 || addrsize == 8);
		glasize = 8;
	} else {
		assert(addrsize == 2 || addrsize == 4);
		glasize = 4;
		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		assert(SEG_DESC_PRESENT(desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		assert(type >= 16 && type <= 31);

		if (prot & PROT_READ) {
			/* #GP on a read access to an exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)			/* code segment */
				return (-1);

			if ((type & 0xA) == 0)		/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

		while (length > 0) {
			offset &= size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}

	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= size2mask(addrsize);
	*gla = (segbase + firstoff) & size2mask(glasize);
	return (0);
}
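/*
 * For example, an expand-down writable data segment (type 0x16) with an
 * expanded limit of 0xfff and the D bit set permits offsets 0x1000
 * through 0xffffffff; an access at offset 0x800 is rejected.  This is
 * the inverse of an expand-up segment, where offsets 0 through the
 * limit are valid.
 */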
/*
 * Push an error code on the stack of the new task. This is needed if the
 * task switch was triggered by a hardware exception that causes an error
 * code to be saved (e.g. #PF).
 */
static int
push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode, int *faultptr)
{
	struct iovec iov[2];
	struct seg_desc seg_desc;
	int stacksize, bytes, error;
	uint64_t gla, cr0, rflags;
	uint32_t esp;
	uint16_t stacksel;

	*faultptr = 0;

	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);

	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base,
	    &seg_desc.limit, &seg_desc.access);
	assert(error == 0);

	/*
	 * Section "Error Code" in the Intel SDM vol 3: the error code is
	 * pushed on the stack as a doubleword or word (depending on the
	 * default interrupt, trap or task gate size).
	 */
	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
		bytes = 4;
	else
		bytes = 2;

	/*
	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
	 * stack-segment descriptor determines the size of the stack
	 * pointer outside of 64-bit mode.
	 */
	if (SEG_DESC_DEF32(seg_desc.access))
		stacksize = 4;
	else
		stacksize = 2;

	esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	esp -= bytes;

	if (calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
		sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
		*faultptr = 1;
		return (0);
	}

	if (alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
		vm_inject_ac(ctx, vcpu, 1);
		*faultptr = 1;
		return (0);
	}

	error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
	    iov, nitems(iov), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyout(ctx, vcpu, &errcode, iov, bytes);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp);
	return (0);
}

/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 */
#define	CHKERR(error,fault)						\
	do {								\
		assert((error == 0) || (error == EFAULT));		\
		if (error)						\
			return (VMEXIT_ABORT);				\
		else if (fault)						\
			return (VMEXIT_CONTINUE);			\
	} while (0)
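/*
 * CHKERR distinguishes the three outcomes of the helpers above: 'error'
 * is either 0 or EFAULT (asserted), and EFAULT aborts the VM; a non-zero
 * 'fault' means an exception or page fault was already injected into the
 * guest, so emulation stops and the run loop in bhyverun.c simply resumes
 * the vcpu; otherwise the task switch proceeds to the next step.
 */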
int
vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
	struct seg_desc nt;
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, fault, minlimit, nt_type, ot_type, vcpu;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;
	vcpu = *pvcpu;

	assert(paging->cpu_mode == CPU_MODE_PROTECTED);

	/*
	 * Calculate the instruction pointer to store in the old TSS.
	 */
	eip = vmexit->rip + vmexit->inst_length;

	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */

	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc,
	    &fault);
	CHKERR(error, fault);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
		minlimit = 104 - 1;
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
		minlimit = 44 - 1;
	else
		minlimit = 0;

	assert(minlimit > 0);
	if (nt.limit < minlimit) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
		goto done;
	}

	/* Fetch the new TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);

	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR. In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
		goto done;
	}

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	    &access);
	assert(error == 0);
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc,
	    &fault);
	CHKERR(error, fault);

	/* Get the old TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);

	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * is due to an IRET or JMP instruction.
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
		    &ot_desc, &fault);
		CHKERR(error, fault);
	}

	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		EPRINTLN("Task switch to 16-bit TSS not supported");
		return (VMEXIT_ABORT);
	}

	/* Save processor state in old TSS */
	tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);
	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
		    &nt_desc, &fault);
		CHKERR(error, fault);
	}

	/* Update task register to point at the new TSS */
	SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);

	/* Update the hidden descriptor state of the task register */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);

	/* Set CR0.TS */
	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);

	/*
	 * We are now committed to the task switch. Any exceptions encountered
	 * after this point will be handled in the context of the new task and
	 * the saved instruction pointer will belong to the new task.
	 */
	error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
	assert(error == 0);

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
	    &fault);
	CHKERR(error, fault);

	/*
	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
	 * caused an error code to be generated, this error code is copied
	 * to the stack of the new task.
	 */
	if (task_switch->errcode_valid) {
		assert(task_switch->ext);
		assert(task_switch->reason == TSR_IDT_GATE);
		error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
		    task_switch->errcode, &fault);
		CHKERR(error, fault);
	}

	/*
	 * Treatment of virtual-NMI blocking if NMI is delivered through
	 * a task gate.
	 *
	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
	 * If the virtual NMIs VM-execution control is 1, VM entry injects
	 * an NMI, and delivery of the NMI causes a task switch that causes
	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
	 * commences.
	 *
	 * Thus, virtual-NMI blocking is in effect at the time of the task
	 * switch VM exit.
	 */

	/*
	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
	 *
	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation"
	 * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking.
	 * This unblocking of virtual-NMI occurs even if IRET causes a fault.
	 *
	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
	 * VM exit.
	 */

	/*
	 * If the task switch was triggered by an event delivered through
	 * the IDT then extinguish the pending event from the vcpu's
	 * exitintinfo.
	 */
	if (task_switch->reason == TSR_IDT_GATE) {
		error = vm_set_intinfo(ctx, vcpu, 0);
		assert(error == 0);
	}

	/*
	 * XXX should inject debug exception if 'T' bit is 1
	 */
done:
	return (VMEXIT_CONTINUE);
}