/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2020 Oxide Computer Company
 */

#include <sys/cdefs.h>

#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>

#include <x86/psl.h>
#include <x86/segments.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#include <vmmapi.h>

#include "bhyverun.h"
#include "debug.h"

/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
struct tss32 {
	uint16_t	tss_link;
	uint16_t	rsvd1;
	uint32_t	tss_esp0;
	uint16_t	tss_ss0;
	uint16_t	rsvd2;
	uint32_t	tss_esp1;
	uint16_t	tss_ss1;
	uint16_t	rsvd3;
	uint32_t	tss_esp2;
	uint16_t	tss_ss2;
	uint16_t	rsvd4;
	uint32_t	tss_cr3;
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	uint32_t	tss_eax;
	uint32_t	tss_ecx;
	uint32_t	tss_edx;
	uint32_t	tss_ebx;
	uint32_t	tss_esp;
	uint32_t	tss_ebp;
	uint32_t	tss_esi;
	uint32_t	tss_edi;
	uint16_t	tss_es;
	uint16_t	rsvd5;
	uint16_t	tss_cs;
	uint16_t	rsvd6;
	uint16_t	tss_ss;
	uint16_t	rsvd7;
	uint16_t	tss_ds;
	uint16_t	rsvd8;
	uint16_t	tss_fs;
	uint16_t	rsvd9;
	uint16_t	tss_gs;
	uint16_t	rsvd10;
	uint16_t	tss_ldt;
	uint16_t	rsvd11;
	uint16_t	tss_trap;
	uint16_t	tss_iomap;
};
static_assert(sizeof(struct tss32) == 104, "struct tss32 must be 104 bytes");

#define	SEL_START(sel)	(((sel) & ~0x7))
#define	SEL_LIMIT(sel)	(((sel) | 0x7))
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)

static uint64_t
GETREG(struct vcpu *vcpu, int reg)
{
	uint64_t val;
	int error;

	error = vm_get_register(vcpu, reg, &val);
	assert(error == 0);
	return (val);
}

static void
SETREG(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;

	error = vm_set_register(vcpu, reg, val);
	assert(error == 0);
}

static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
	struct seg_desc seg_desc;

	seg_desc.base = (u_int)USD_GETBASE(usd);
	if (usd->sd_gran)
		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
	else
		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
	seg_desc.access |= usd->sd_xx << 12;
	seg_desc.access |= usd->sd_def32 << 14;
	seg_desc.access |= usd->sd_gran << 15;

	return (seg_desc);
}

/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
static void
sel_exception(struct vcpu *vcpu, int vector, uint16_t sel, int ext)
{
	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	sel &= ~0x3;
	if (ext)
		sel |= 0x1;
	vm_inject_fault(vcpu, vector, 1, sel);
}

/*
 * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
 * and non-zero otherwise.
 */
static int
desc_table_limit_check(struct vcpu *vcpu, uint16_t sel)
{
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(vcpu, reg, &base, &limit, &access);
	assert(error == 0);

	if (reg == VM_REG_GUEST_LDTR) {
		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
			return (-1);
	}

	if (limit < SEL_LIMIT(sel))
		return (-1);
	else
		return (0);
}
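
/*
 * For reference, a segment selector encodes a table index in bits 15:3,
 * the table indicator (TI, 0 = GDT / 1 = LDT) in bit 2 and the RPL in
 * bits 1:0. For example, sel = 0x002b selects LDT entry 5 at RPL 3:
 * SEL_START(0x002b) == 0x28 is the byte offset of the descriptor and
 * SEL_LIMIT(0x002b) == 0x2f is the offset of its last byte, so the check
 * above requires a descriptor table limit of at least 0x2f.
 */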

/*
 * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced
 * by the selector 'sel'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
desc_table_rw(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, bool doread,
    int *faultptr)
{
	struct iovec iov[2];
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(vcpu, reg, &base, &limit, &access);
	assert(error == 0);
	assert(limit >= SEL_LIMIT(sel));

	error = vm_copy_setup(vcpu, paging, base + SEL_START(sel),
	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
	    faultptr);
	if (error || *faultptr)
		return (error);

	if (doread)
		vm_copyin(iov, desc, sizeof(*desc));
	else
		vm_copyout(desc, iov, sizeof(*desc));
	return (0);
}

static int
desc_table_read(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(vcpu, paging, sel, desc, true, faultptr));
}

static int
desc_table_write(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(vcpu, paging, sel, desc, false, faultptr));
}

/*
 * Read the TSS descriptor referenced by 'sel' into 'desc'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
read_tss_descriptor(struct vcpu *vcpu, struct vm_task_switch *ts,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	int error;

	assert(!ISLDT(sel));
	assert(IDXSEL(sel) != 0);

	/* Fetch the new TSS descriptor */
	if (desc_table_limit_check(vcpu, sel)) {
		if (ts->reason == TSR_IRET)
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
		else
			sel_exception(vcpu, IDT_GP, sel, ts->ext);
		return (1);
	}

	sup_paging = ts->paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */
	error = desc_table_read(vcpu, &sup_paging, sel, desc, faultptr);
	return (error);
}

static bool
code_desc(int sd_type)
{
	/* code descriptor */
	return ((sd_type & 0x18) == 0x18);
}

static bool
stack_desc(int sd_type)
{
	/* writable data descriptor */
	return ((sd_type & 0x1A) == 0x12);
}

static bool
data_desc(int sd_type)
{
	/* data descriptor or a readable code descriptor */
	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}

static bool
ldt_desc(int sd_type)
{

	return (sd_type == SDT_SYSLDT);
}
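
/*
 * The bit tests above follow the 5-bit type encoding of code/data
 * descriptors (S bit included): bit 4 is set for a code/data segment,
 * bit 3 distinguishes code (1) from data (0), bit 2 means conforming
 * for code or expand-down for data, and bit 1 means readable for code
 * or writable for data. For example, type 0x12 is a writable expand-up
 * data segment and type 0x1a is a readable non-conforming code segment.
 */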

/*
 * Validate the descriptor 'seg_desc' associated with 'segment'.
 */
static int
validate_seg_desc(struct vcpu *vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	struct user_segment_descriptor usd;
	int error, idtvec;
	int cpl, dpl, rpl;
	uint16_t sel, cs;
	bool ldtseg, codeseg, stackseg, dataseg, conforming;

	ldtseg = codeseg = stackseg = dataseg = false;
	switch (segment) {
	case VM_REG_GUEST_LDTR:
		ldtseg = true;
		break;
	case VM_REG_GUEST_CS:
		codeseg = true;
		break;
	case VM_REG_GUEST_SS:
		stackseg = true;
		break;
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
		dataseg = true;
		break;
	default:
		assert(0);
	}

	/* Get the segment selector */
	sel = GETREG(vcpu, segment);

	/* LDT selector must point into the GDT */
	if (ldtseg && ISLDT(sel)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Descriptor table limit check */
	if (desc_table_limit_check(vcpu, sel)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* NULL selector */
	if (IDXSEL(sel) == 0) {
		/* Code and stack segment selectors cannot be NULL */
		if (codeseg || stackseg) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
		seg_desc->base = 0;
		seg_desc->limit = 0;
		seg_desc->access = 0x10000;	/* unusable */
		return (0);
	}

	/* Read the descriptor from the GDT/LDT */
	sup_paging = ts->paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */
	error = desc_table_read(vcpu, &sup_paging, sel, &usd, faultptr);
	if (error || *faultptr)
		return (error);

	/* Verify that the descriptor type is compatible with the segment */
	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
	    (codeseg && !code_desc(usd.sd_type)) ||
	    (dataseg && !data_desc(usd.sd_type)) ||
	    (stackseg && !stack_desc(usd.sd_type))) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Segment must be marked present */
	if (!usd.sd_p) {
		if (ldtseg)
			idtvec = IDT_TS;
		else if (stackseg)
			idtvec = IDT_SS;
		else
			idtvec = IDT_NP;
		sel_exception(vcpu, idtvec, sel, ts->ext);
		return (1);
	}

	cs = GETREG(vcpu, VM_REG_GUEST_CS);
	cpl = cs & SEL_RPL_MASK;
	rpl = sel & SEL_RPL_MASK;
	dpl = usd.sd_dpl;

	if (stackseg && (rpl != cpl || dpl != cpl)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	if (codeseg) {
		conforming = (usd.sd_type & 0x4) ? true : false;
		if ((conforming && (cpl < dpl)) ||
		    (!conforming && (cpl != dpl))) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}

	if (dataseg) {
		/*
		 * A data segment is always non-conforming except when its
		 * descriptor is a readable, conforming code segment.
		 */
		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
			conforming = true;
		else
			conforming = false;

		if (!conforming && (rpl > dpl || cpl > dpl)) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}
	*seg_desc = usd_to_seg_desc(&usd);
	return (0);
}
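
/*
 * To illustrate the privilege checks above: if the incoming task runs at
 * CPL 3 (the RPL of its %cs), a stack selector with RPL 3 referencing a
 * writable data descriptor with DPL 3 passes, whereas one with DPL 0
 * takes a #TS; a non-conforming code descriptor must have DPL == CPL,
 * while a conforming one only requires CPL >= DPL.
 */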

static void
tss32_save(struct vcpu *vcpu, struct vm_task_switch *task_switch,
    uint32_t eip, struct tss32 *tss, struct iovec *iov)
{

	/* General purpose registers */
	tss->tss_eax = GETREG(vcpu, VM_REG_GUEST_RAX);
	tss->tss_ecx = GETREG(vcpu, VM_REG_GUEST_RCX);
	tss->tss_edx = GETREG(vcpu, VM_REG_GUEST_RDX);
	tss->tss_ebx = GETREG(vcpu, VM_REG_GUEST_RBX);
	tss->tss_esp = GETREG(vcpu, VM_REG_GUEST_RSP);
	tss->tss_ebp = GETREG(vcpu, VM_REG_GUEST_RBP);
	tss->tss_esi = GETREG(vcpu, VM_REG_GUEST_RSI);
	tss->tss_edi = GETREG(vcpu, VM_REG_GUEST_RDI);

	/* Segment selectors */
	tss->tss_es = GETREG(vcpu, VM_REG_GUEST_ES);
	tss->tss_cs = GETREG(vcpu, VM_REG_GUEST_CS);
	tss->tss_ss = GETREG(vcpu, VM_REG_GUEST_SS);
	tss->tss_ds = GETREG(vcpu, VM_REG_GUEST_DS);
	tss->tss_fs = GETREG(vcpu, VM_REG_GUEST_FS);
	tss->tss_gs = GETREG(vcpu, VM_REG_GUEST_GS);

	/* eflags and eip */
	tss->tss_eflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS);
	if (task_switch->reason == TSR_IRET)
		tss->tss_eflags &= ~PSL_NT;
	tss->tss_eip = eip;

	/* Copy updated old TSS into guest memory */
	vm_copyout(tss, iov, sizeof(struct tss32));
}

static void
update_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *sd)
{
	int error;

	error = vm_set_desc(vcpu, reg, sd->base, sd->limit, sd->access);
	assert(error == 0);
}

/*
 * Update the vcpu registers to reflect the state of the new task.
 */
static int
tss32_restore(struct vmctx *ctx, struct vcpu *vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
{
	struct seg_desc seg_desc, seg_desc2;
	uint64_t *pdpte, maxphyaddr, reserved;
	uint32_t eflags;
	int error, i;
	bool nested;

	nested = false;
	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
		tss->tss_link = ot_sel;
		nested = true;
	}

	eflags = tss->tss_eflags;
	if (nested)
		eflags |= PSL_NT;

	/* LDTR */
	SETREG(vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);

	/* PDBR (CR3) */
	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
			/*
			 * XXX Assuming 36-bit MAXPHYADDR.
			 */
			maxphyaddr = (1UL << 36) - 1;
			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
			for (i = 0; i < 4; i++) {
				/* Check reserved bits if the PDPTE is valid */
				if (!(pdpte[i] & 0x1))
					continue;
				/*
				 * Bits 2:1, 8:5 and bits above the processor's
				 * maximum physical address are reserved.
				 */
				reserved = ~maxphyaddr | 0x1E6;
				if (pdpte[i] & reserved) {
					vm_inject_gp(vcpu);
					return (1);
				}
			}
			SETREG(vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
		}
		SETREG(vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
		ts->paging.cr3 = tss->tss_cr3;
	}

	/* eflags and eip */
	SETREG(vcpu, VM_REG_GUEST_RFLAGS, eflags);
	SETREG(vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

	/* General purpose registers */
	SETREG(vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
	SETREG(vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
	SETREG(vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
	SETREG(vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
	SETREG(vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
	SETREG(vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
	SETREG(vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
	SETREG(vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

	/* Segment selectors */
	SETREG(vcpu, VM_REG_GUEST_ES, tss->tss_es);
	SETREG(vcpu, VM_REG_GUEST_CS, tss->tss_cs);
	SETREG(vcpu, VM_REG_GUEST_SS, tss->tss_ss);
	SETREG(vcpu, VM_REG_GUEST_DS, tss->tss_ds);
	SETREG(vcpu, VM_REG_GUEST_FS, tss->tss_fs);
	SETREG(vcpu, VM_REG_GUEST_GS, tss->tss_gs);

	/*
	 * If this is a nested task then write out the new TSS to update
	 * the previous link field.
	 */
	if (nested)
		vm_copyout(tss, iov, sizeof(*tss));

	/* Validate segment descriptors */
	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_LDTR, &seg_desc);

	/*
	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
	 *
	 * The SS and CS attribute checks on VM-entry are inter-dependent so
	 * we need to make sure that both segments are valid before updating
	 * either of them. This ensures that the VMCS state can pass the
	 * VM-entry checks so the guest can handle any exception injected
	 * during task switch emulation.
	 */
	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_CS, &seg_desc);
	update_seg_desc(vcpu, VM_REG_GUEST_SS, &seg_desc2);
	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_DS, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_ES, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_FS, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_GS, &seg_desc);

	return (0);
}


/*
 * Copy of vie_alignment_check() from vmm_instruction_emul.c
 */
static int
alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
{
	assert(size == 1 || size == 2 || size == 4 || size == 8);
	assert(cpl >= 0 && cpl <= 3);

	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
		return (0);

	return ((gla & (size - 1)) ? 1 : 0);
}
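
/*
 * For example, a 4-byte access to gla 0x1002 is misaligned
 * (0x1002 & 3 != 0) and returns 1, but only when the guest is at CPL 3
 * with both CR0.AM and EFLAGS.AC set; otherwise alignment is not checked.
 */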

/*
 * Copy of vie_size2mask() from vmm_instruction_emul.c
 */
static uint64_t
size2mask(int size)
{
	switch (size) {
	case 1:
		return (0xff);
	case 2:
		return (0xffff);
	case 4:
		return (0xffffffff);
	case 8:
		return (0xffffffffffffffff);
	default:
		assert(0);
		/* not reached */
		return (0);
	}
}

/*
 * Copy of vie_calculate_gla() from vmm_instruction_emul.c
 */
static int
calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	assert(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS);
	assert((length == 1 || length == 2 || length == 4 || length == 8));
	assert((prot & ~(PROT_READ | PROT_WRITE)) == 0);

	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		assert(addrsize == 4 || addrsize == 8);
		glasize = 8;
	} else {
		assert(addrsize == 2 || addrsize == 4);
		glasize = 4;
		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		assert(SEG_DESC_PRESENT(desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		assert(type >= 16 && type <= 31);

		if (prot & PROT_READ) {
			/* #GP on a read access to an exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)		/* code segment */
				return (-1);

			if ((type & 0xA) == 0)	/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

		while (length > 0) {
			offset &= size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}

	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= size2mask(addrsize);
	*gla = (segbase + firstoff) & size2mask(glasize);
	return (0);
}

/*
 * Push an error code on the stack of the new task. This is needed if the
 * task switch was triggered by a hardware exception that causes an error
 * code to be saved (e.g. #PF).
 */
static int
push_errcode(struct vcpu *vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode, int *faultptr)
{
	struct iovec iov[2];
	struct seg_desc seg_desc;
	int stacksize, bytes, error;
	uint64_t gla, cr0, rflags;
	uint32_t esp;
	uint16_t stacksel;

	*faultptr = 0;

	cr0 = GETREG(vcpu, VM_REG_GUEST_CR0);
	rflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS);
	stacksel = GETREG(vcpu, VM_REG_GUEST_SS);

	error = vm_get_desc(vcpu, VM_REG_GUEST_SS, &seg_desc.base,
	    &seg_desc.limit, &seg_desc.access);
	assert(error == 0);

	/*
	 * Section "Error Code" in the Intel SDM vol 3: the error code is
	 * pushed on the stack as a doubleword or word (depending on the
	 * default interrupt, trap or task gate size).
	 */
	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
		bytes = 4;
	else
		bytes = 2;

	/*
	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
	 * stack-segment descriptor determines the size of the stack
	 * pointer outside of 64-bit mode.
	 */
	if (SEG_DESC_DEF32(seg_desc.access))
		stacksize = 4;
	else
		stacksize = 2;

	esp = GETREG(vcpu, VM_REG_GUEST_RSP);
	esp -= bytes;

	if (calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
		sel_exception(vcpu, IDT_SS, stacksel, 1);
		*faultptr = 1;
		return (0);
	}

	if (alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
		vm_inject_ac(vcpu, 1);
		*faultptr = 1;
		return (0);
	}

	error = vm_copy_setup(vcpu, paging, gla, bytes, PROT_WRITE,
	    iov, nitems(iov), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyout(&errcode, iov, bytes);
	SETREG(vcpu, VM_REG_GUEST_RSP, esp);
	return (0);
}
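
/*
 * For instance, a #PF delivered through a task gate into a 32-bit task
 * (task_type SDT_SYS386BSY) pushes a 4-byte error code: ESP is decremented
 * by 4, the resulting linear address is checked against the stack-segment
 * limit and, at CPL 3 with alignment checking enabled, for 4-byte
 * alignment before the error code is written to guest memory.
 */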

/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 */
#define	CHKERR(error, fault)						\
	do {								\
		assert((error == 0) || (error == EFAULT));		\
		if (error)						\
			return (VMEXIT_ABORT);				\
		else if (fault)						\
			return (VMEXIT_CONTINUE);			\
	} while (0)

int
vmexit_task_switch(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vmexit)
{
	struct seg_desc nt;
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, fault, minlimit, nt_type, ot_type;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;

	assert(paging->cpu_mode == CPU_MODE_PROTECTED);

	/*
	 * Calculate the instruction pointer to store in the old TSS.
	 */
	eip = vmexit->rip + vmexit->inst_length;

	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */

	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(vcpu, task_switch, nt_sel, &nt_desc,
	    &fault);
	CHKERR(error, fault);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(vcpu, IDT_NP, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
		minlimit = 104 - 1;
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
		minlimit = 44 - 1;
	else
		minlimit = 0;

	assert(minlimit > 0);
	if (nt.limit < (unsigned int)minlimit) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(vcpu, IDT_GP, nt_sel, ext);
		goto done;
	}

	/* Fetch the new TSS */
	error = vm_copy_setup(vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(nt_iov, &newtss, minlimit + 1);

	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR. In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(vcpu, IDT_TS, ot_sel, task_switch->ext);
		goto done;
	}

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	    &access);
	assert(error == 0);
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(vcpu, task_switch, ot_sel, &ot_desc,
	    &fault);
	CHKERR(error, fault);

	/* Get the old TSS */
	error = vm_copy_setup(vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ot_iov, &oldtss, minlimit + 1);

	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * is due to an IRET or JMP instruction.
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(vcpu, &sup_paging, ot_sel,
		    &ot_desc, &fault);
		CHKERR(error, fault);
	}
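
	/*
	 * The available and busy variants of a TSS type differ only in
	 * bit 1: SDT_SYS386TSS (9) vs. SDT_SYS386BSY (11), and likewise
	 * SDT_SYS286TSS (1) vs. SDT_SYS286BSY (3). That is the bit that
	 * TSS_BUSY() tests and that is toggled here and below with
	 * '&= ~0x2' and '|= 0x2'.
	 */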

	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		EPRINTLN("Task switch to 16-bit TSS not supported");
		return (VMEXIT_ABORT);
	}

	/* Save processor state in old TSS */
	tss32_save(vcpu, task_switch, eip, &oldtss, ot_iov);

	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(vcpu, &sup_paging, nt_sel,
		    &nt_desc, &fault);
		CHKERR(error, fault);
	}

	/* Update task register to point at the new TSS */
	SETREG(vcpu, VM_REG_GUEST_TR, nt_sel);

	/* Update the hidden descriptor state of the task register */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(vcpu, VM_REG_GUEST_TR, &nt);

	/* Set CR0.TS */
	cr0 = GETREG(vcpu, VM_REG_GUEST_CR0);
	SETREG(vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);

	/*
	 * We are now committed to the task switch. Any exceptions encountered
	 * after this point will be handled in the context of the new task and
	 * the saved instruction pointer will belong to the new task.
	 */
	error = vm_set_register(vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
	assert(error == 0);

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
	    &fault);
	CHKERR(error, fault);

	/*
	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
	 * caused an error code to be generated, this error code is copied
	 * to the stack of the new task.
	 */
	if (task_switch->errcode_valid) {
		assert(task_switch->ext);
		assert(task_switch->reason == TSR_IDT_GATE);
		error = push_errcode(vcpu, &task_switch->paging, nt_type,
		    task_switch->errcode, &fault);
		CHKERR(error, fault);
	}

	/*
	 * Treatment of virtual-NMI blocking if NMI is delivered through
	 * a task gate.
	 *
	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol 3:
	 * If the virtual NMIs VM-execution control is 1, VM entry injects
	 * an NMI, and delivery of the NMI causes a task switch that causes
	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
	 * commences.
	 *
	 * Thus, virtual-NMI blocking is in effect at the time of the task
	 * switch VM exit.
	 */

	/*
	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
	 *
	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation":
	 * if the "virtual NMIs" control is 1 then IRET removes any
	 * virtual-NMI blocking. This unblocking of virtual-NMI occurs even
	 * if IRET causes a fault.
	 *
	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
	 * VM exit.
	 */

	/*
	 * If the task switch was triggered by an event delivered through
	 * the IDT then extinguish the pending event from the vcpu's
	 * exitintinfo.
	 */
	if (task_switch->reason == TSR_IDT_GATE) {
		error = vm_set_intinfo(vcpu, 0);
		assert(error == 0);
	}

	/*
	 * XXX should inject debug exception if 'T' bit is 1
	 */
done:
	return (VMEXIT_CONTINUE);
}