/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2020 Oxide Computer Company
 */


#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>

#include <x86/psl.h>
#include <x86/segments.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#include <vmmapi.h>

#include "bhyverun.h"
#include "debug.h"

/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
struct tss32 {
	uint16_t	tss_link;
	uint16_t	rsvd1;
	uint32_t	tss_esp0;
	uint16_t	tss_ss0;
	uint16_t	rsvd2;
	uint32_t	tss_esp1;
	uint16_t	tss_ss1;
	uint16_t	rsvd3;
	uint32_t	tss_esp2;
	uint16_t	tss_ss2;
	uint16_t	rsvd4;
	uint32_t	tss_cr3;
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	uint32_t	tss_eax;
	uint32_t	tss_ecx;
	uint32_t	tss_edx;
	uint32_t	tss_ebx;
	uint32_t	tss_esp;
	uint32_t	tss_ebp;
	uint32_t	tss_esi;
	uint32_t	tss_edi;
	uint16_t	tss_es;
	uint16_t	rsvd5;
	uint16_t	tss_cs;
	uint16_t	rsvd6;
	uint16_t	tss_ss;
	uint16_t	rsvd7;
	uint16_t	tss_ds;
	uint16_t	rsvd8;
	uint16_t	tss_fs;
	uint16_t	rsvd9;
	uint16_t	tss_gs;
	uint16_t	rsvd10;
	uint16_t	tss_ldt;
	uint16_t	rsvd11;
	uint16_t	tss_trap;
	uint16_t	tss_iomap;
};
static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed");
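
/*
 * A segment selector encodes its descriptor-table index in bits 15:3, the
 * table indicator (0 = GDT, 1 = LDT) in bit 2 and the RPL in bits 1:0.
 * SEL_START() and SEL_LIMIT() therefore yield the offsets of the first and
 * last byte of the 8-byte descriptor slot named by 'sel', and TSS_BUSY()
 * tests the busy bit in a TSS descriptor type.
 */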

#define	SEL_START(sel)	(((sel) & ~0x7))
#define	SEL_LIMIT(sel)	(((sel) | 0x7))
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)

static uint64_t
GETREG(struct vcpu *vcpu, int reg)
{
	uint64_t val;
	int error;

	error = vm_get_register(vcpu, reg, &val);
	assert(error == 0);
	return (val);
}

static void
SETREG(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;

	error = vm_set_register(vcpu, reg, val);
	assert(error == 0);
}

static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
	struct seg_desc seg_desc;

	seg_desc.base = (u_int)USD_GETBASE(usd);
	if (usd->sd_gran)
		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
	else
		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
	seg_desc.access |= usd->sd_xx << 12;
	seg_desc.access |= usd->sd_def32 << 14;
	seg_desc.access |= usd->sd_gran << 15;

	return (seg_desc);
}

/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
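/*
 * For example, a faulting GDT selector 0x28 reported with 'ext' set ends up
 * as error code 0x29: the index and TI bits are preserved while the RPL bits
 * are replaced by the IDT (cleared) and EXT (set) flags.
 */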
static void
sel_exception(struct vcpu *vcpu, int vector, uint16_t sel, int ext)
{
	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	sel &= ~0x3;
	if (ext)
		sel |= 0x1;
	vm_inject_fault(vcpu, vector, 1, sel);
}

/*
 * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
 * and non-zero otherwise.
 */
static int
desc_table_limit_check(struct vcpu *vcpu, uint16_t sel)
{
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(vcpu, reg, &base, &limit, &access);
	assert(error == 0);

	if (reg == VM_REG_GUEST_LDTR) {
		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
			return (-1);
	}

	if (limit < SEL_LIMIT(sel))
		return (-1);
	else
		return (0);
}

/*
 * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced
 * by the selector 'sel'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
desc_table_rw(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, bool doread,
    int *faultptr)
{
	struct iovec iov[2];
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(vcpu, reg, &base, &limit, &access);
	assert(error == 0);
	assert(limit >= SEL_LIMIT(sel));

	error = vm_copy_setup(vcpu, paging, base + SEL_START(sel),
	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
	    faultptr);
	if (error || *faultptr)
		return (error);

	if (doread)
		vm_copyin(iov, desc, sizeof(*desc));
	else
		vm_copyout(desc, iov, sizeof(*desc));
	return (0);
}

static int
desc_table_read(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(vcpu, paging, sel, desc, true, faultptr));
}

static int
desc_table_write(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(vcpu, paging, sel, desc, false, faultptr));
}

/*
 * Read the TSS descriptor referenced by 'sel' into 'desc'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
read_tss_descriptor(struct vcpu *vcpu, struct vm_task_switch *ts,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	int error;

	assert(!ISLDT(sel));
	assert(IDXSEL(sel) != 0);

	/* Fetch the new TSS descriptor */
	if (desc_table_limit_check(vcpu, sel)) {
		if (ts->reason == TSR_IRET)
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
		else
			sel_exception(vcpu, IDT_GP, sel, ts->ext);
		return (1);
	}

	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(vcpu, &sup_paging, sel, desc, faultptr);
	return (error);
}

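/*
 * The 'sd_type' values tested below are the 5-bit type field taken from the
 * descriptor access byte: bit 4 is the S bit (set for code and data
 * segments), bit 3 distinguishes code from data, and the remaining bits
 * carry the conforming/expand-down, readable/writable and accessed flags.
 */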
static bool
code_desc(int sd_type)
{
	/* code descriptor */
	return ((sd_type & 0x18) == 0x18);
}

static bool
stack_desc(int sd_type)
{
	/* writable data descriptor */
	return ((sd_type & 0x1A) == 0x12);
}

static bool
data_desc(int sd_type)
{
	/* data descriptor or a readable code descriptor */
	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}

static bool
ldt_desc(int sd_type)
{

	return (sd_type == SDT_SYSLDT);
}

/*
 * Validate the descriptor 'seg_desc' associated with 'segment'.
 */
static int
validate_seg_desc(struct vcpu *vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	struct user_segment_descriptor usd;
	int error, idtvec;
	int cpl, dpl, rpl;
	uint16_t sel, cs;
	bool ldtseg, codeseg, stackseg, dataseg, conforming;

	ldtseg = codeseg = stackseg = dataseg = false;
	switch (segment) {
	case VM_REG_GUEST_LDTR:
		ldtseg = true;
		break;
	case VM_REG_GUEST_CS:
		codeseg = true;
		break;
	case VM_REG_GUEST_SS:
		stackseg = true;
		break;
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
		dataseg = true;
		break;
	default:
		assert(0);
	}

	/* Get the segment selector */
	sel = GETREG(vcpu, segment);

	/* LDT selector must point into the GDT */
	if (ldtseg && ISLDT(sel)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Descriptor table limit check */
	if (desc_table_limit_check(vcpu, sel)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* NULL selector */
	if (IDXSEL(sel) == 0) {
		/* Code and stack segment selectors cannot be NULL */
		if (codeseg || stackseg) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
		seg_desc->base = 0;
		seg_desc->limit = 0;
		seg_desc->access = 0x10000;	/* unusable */
		return (0);
	}

	/* Read the descriptor from the GDT/LDT */
	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(vcpu, &sup_paging, sel, &usd, faultptr);
	if (error || *faultptr)
		return (error);

	/* Verify that the descriptor type is compatible with the segment */
	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
	    (codeseg && !code_desc(usd.sd_type)) ||
	    (dataseg && !data_desc(usd.sd_type)) ||
	    (stackseg && !stack_desc(usd.sd_type))) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Segment must be marked present */
	if (!usd.sd_p) {
		if (ldtseg)
			idtvec = IDT_TS;
		else if (stackseg)
			idtvec = IDT_SS;
		else
			idtvec = IDT_NP;
		sel_exception(vcpu, idtvec, sel, ts->ext);
		return (1);
	}

	cs = GETREG(vcpu, VM_REG_GUEST_CS);
	cpl = cs & SEL_RPL_MASK;
	rpl = sel & SEL_RPL_MASK;
	dpl = usd.sd_dpl;

	if (stackseg && (rpl != cpl || dpl != cpl)) {
		sel_exception(vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	if (codeseg) {
		conforming = (usd.sd_type & 0x4) ? true : false;
		if ((conforming && (cpl < dpl)) ||
		    (!conforming && (cpl != dpl))) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}

	if (dataseg) {
		/*
		 * A data segment is always non-conforming except when its
		 * descriptor is a readable, conforming code segment.
		 */
		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
			conforming = true;
		else
			conforming = false;

		if (!conforming && (rpl > dpl || cpl > dpl)) {
			sel_exception(vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}
	*seg_desc = usd_to_seg_desc(&usd);
	return (0);
}

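/*
 * Save the outgoing task's register state (general purpose registers,
 * segment selectors, eflags and eip) into the old TSS mapped by 'iov'.
 */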
static void
tss32_save(struct vcpu *vcpu, struct vm_task_switch *task_switch,
    uint32_t eip, struct tss32 *tss, struct iovec *iov)
{

	/* General purpose registers */
	tss->tss_eax = GETREG(vcpu, VM_REG_GUEST_RAX);
	tss->tss_ecx = GETREG(vcpu, VM_REG_GUEST_RCX);
	tss->tss_edx = GETREG(vcpu, VM_REG_GUEST_RDX);
	tss->tss_ebx = GETREG(vcpu, VM_REG_GUEST_RBX);
	tss->tss_esp = GETREG(vcpu, VM_REG_GUEST_RSP);
	tss->tss_ebp = GETREG(vcpu, VM_REG_GUEST_RBP);
	tss->tss_esi = GETREG(vcpu, VM_REG_GUEST_RSI);
	tss->tss_edi = GETREG(vcpu, VM_REG_GUEST_RDI);

	/* Segment selectors */
	tss->tss_es = GETREG(vcpu, VM_REG_GUEST_ES);
	tss->tss_cs = GETREG(vcpu, VM_REG_GUEST_CS);
	tss->tss_ss = GETREG(vcpu, VM_REG_GUEST_SS);
	tss->tss_ds = GETREG(vcpu, VM_REG_GUEST_DS);
	tss->tss_fs = GETREG(vcpu, VM_REG_GUEST_FS);
	tss->tss_gs = GETREG(vcpu, VM_REG_GUEST_GS);

	/* eflags and eip */
	tss->tss_eflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS);
	if (task_switch->reason == TSR_IRET)
		tss->tss_eflags &= ~PSL_NT;
	tss->tss_eip = eip;

	/* Copy updated old TSS into guest memory */
	vm_copyout(tss, iov, sizeof(struct tss32));
}

static void
update_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *sd)
{
	int error;

	error = vm_set_desc(vcpu, reg, sd->base, sd->limit, sd->access);
	assert(error == 0);
}

/*
 * Update the vcpu registers to reflect the state of the new task.
 */
static int
tss32_restore(struct vmctx *ctx, struct vcpu *vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
{
	struct seg_desc seg_desc, seg_desc2;
	uint64_t *pdpte, maxphyaddr, reserved;
	uint32_t eflags;
	int error, i;
	bool nested;

	nested = false;
	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
		tss->tss_link = ot_sel;
		nested = true;
	}

	eflags = tss->tss_eflags;
	if (nested)
		eflags |= PSL_NT;

	/* LDTR */
	SETREG(vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);

	/* PDBR */
	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
			/*
			 * XXX Assuming 36-bit MAXPHYADDR.
			 */
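			/*
			 * The architectural width could instead be taken from
			 * CPUID.80000008H:EAX[7:0]; 36 bits is the minimum
			 * for a PAE-capable processor.
			 */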
			maxphyaddr = (1UL << 36) - 1;
			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
			for (i = 0; i < 4; i++) {
				/* Check reserved bits if the PDPTE is valid */
				if (!(pdpte[i] & 0x1))
					continue;
				/*
				 * Bits 2:1, 8:5 and bits above the processor's
				 * maximum physical address are reserved.
				 */
				reserved = ~maxphyaddr | 0x1E6;
				if (pdpte[i] & reserved) {
					vm_inject_gp(vcpu);
					return (1);
				}
			}
			SETREG(vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
			SETREG(vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
		}
		SETREG(vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
		ts->paging.cr3 = tss->tss_cr3;
	}

	/* eflags and eip */
	SETREG(vcpu, VM_REG_GUEST_RFLAGS, eflags);
	SETREG(vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

	/* General purpose registers */
	SETREG(vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
	SETREG(vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
	SETREG(vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
	SETREG(vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
	SETREG(vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
	SETREG(vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
	SETREG(vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
	SETREG(vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

	/* Segment selectors */
	SETREG(vcpu, VM_REG_GUEST_ES, tss->tss_es);
	SETREG(vcpu, VM_REG_GUEST_CS, tss->tss_cs);
	SETREG(vcpu, VM_REG_GUEST_SS, tss->tss_ss);
	SETREG(vcpu, VM_REG_GUEST_DS, tss->tss_ds);
	SETREG(vcpu, VM_REG_GUEST_FS, tss->tss_fs);
	SETREG(vcpu, VM_REG_GUEST_GS, tss->tss_gs);

	/*
	 * If this is a nested task then write out the new TSS to update
	 * the previous link field.
	 */
	if (nested)
		vm_copyout(tss, iov, sizeof(*tss));

	/* Validate segment descriptors */
	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_LDTR, &seg_desc);

	/*
	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
	 *
	 * The SS and CS attribute checks on VM-entry are inter-dependent so
	 * we need to make sure that both segments are valid before updating
	 * either of them. This ensures that the VMCS state can pass the
	 * VM-entry checks so the guest can handle any exception injected
	 * during task switch emulation.
	 */
	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_CS, &seg_desc);
	update_seg_desc(vcpu, VM_REG_GUEST_SS, &seg_desc2);
	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_DS, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_ES, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_FS, &seg_desc);

	error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(vcpu, VM_REG_GUEST_GS, &seg_desc);

	return (0);
}

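/*
 * The helpers below mirror their counterparts in the in-kernel instruction
 * emulation so that the error-code push in push_errcode() observes the same
 * segmentation and alignment rules as a hardware stack push.
 */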
/*
 * Copy of vie_alignment_check() from vmm_instruction_emul.c
 */
static int
alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
{
	assert(size == 1 || size == 2 || size == 4 || size == 8);
	assert(cpl >= 0 && cpl <= 3);

	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
		return (0);

	return ((gla & (size - 1)) ? 1 : 0);
}

/*
 * Copy of vie_size2mask() from vmm_instruction_emul.c
 */
static uint64_t
size2mask(int size)
{
	switch (size) {
	case 1:
		return (0xff);
	case 2:
		return (0xffff);
	case 4:
		return (0xffffffff);
	case 8:
		return (0xffffffffffffffff);
	default:
		assert(0);
		/* not reached */
		return (0);
	}
}

/*
 * Copy of vie_calculate_gla() from vmm_instruction_emul.c
 */
static int
calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	assert(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS);
	assert((length == 1 || length == 2 || length == 4 || length == 8));
	assert((prot & ~(PROT_READ | PROT_WRITE)) == 0);

	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		assert(addrsize == 4 || addrsize == 8);
		glasize = 8;
	} else {
		assert(addrsize == 2 || addrsize == 4);
		glasize = 4;
		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		assert(SEG_DESC_PRESENT(desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		assert(type >= 16 && type <= 31);

		if (prot & PROT_READ) {
			/* #GP on a read access to an exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)			/* code segment */
				return (-1);

			if ((type & 0xA) == 0)		/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
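		/*
		 * For example, an expand-down data segment with a limit of
		 * 0x0fff and the D flag set accepts offsets 0x1000 through
		 * 0xffffffff, while an expand-up segment with the same limit
		 * accepts offsets 0 through 0x0fff.
		 */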
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

		while (length > 0) {
			offset &= size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}

	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= size2mask(addrsize);
	*gla = (segbase + firstoff) & size2mask(glasize);
	return (0);
}

/*
 * Push an error code on the stack of the new task. This is needed if the
 * task switch was triggered by a hardware exception that causes an error
 * code to be saved (e.g. #PF).
 */
static int
push_errcode(struct vcpu *vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode, int *faultptr)
{
	struct iovec iov[2];
	struct seg_desc seg_desc;
	int stacksize, bytes, error;
	uint64_t gla, cr0, rflags;
	uint32_t esp;
	uint16_t stacksel;

	*faultptr = 0;

	cr0 = GETREG(vcpu, VM_REG_GUEST_CR0);
	rflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS);
	stacksel = GETREG(vcpu, VM_REG_GUEST_SS);

	error = vm_get_desc(vcpu, VM_REG_GUEST_SS, &seg_desc.base,
	    &seg_desc.limit, &seg_desc.access);
	assert(error == 0);

	/*
	 * Section "Error Code" in the Intel SDM vol 3: the error code is
	 * pushed on the stack as a doubleword or word (depending on the
	 * default interrupt, trap or task gate size).
	 */
	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
		bytes = 4;
	else
		bytes = 2;

	/*
	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
	 * stack-segment descriptor determines the size of the stack
	 * pointer outside of 64-bit mode.
	 */
	if (SEG_DESC_DEF32(seg_desc.access))
		stacksize = 4;
	else
		stacksize = 2;

	esp = GETREG(vcpu, VM_REG_GUEST_RSP);
	esp -= bytes;

	if (calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
		sel_exception(vcpu, IDT_SS, stacksel, 1);
		*faultptr = 1;
		return (0);
	}

	if (alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
		vm_inject_ac(vcpu, 1);
		*faultptr = 1;
		return (0);
	}

	error = vm_copy_setup(vcpu, paging, gla, bytes, PROT_WRITE,
	    iov, nitems(iov), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyout(&errcode, iov, bytes);
	SETREG(vcpu, VM_REG_GUEST_RSP, esp);
	return (0);
}

/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 */
#define	CHKERR(error,fault)						\
	do {								\
		assert((error == 0) || (error == EFAULT));		\
		if (error)						\
			return (VMEXIT_ABORT);				\
		else if (fault)						\
			return (VMEXIT_CONTINUE);			\
	} while (0)

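/*
 * Handle a task switch VM-exit.  The steps below follow the "Task Switching"
 * section of the Intel SDM, Vol 3: validate the new TSS descriptor, save the
 * outgoing task's state into the old TSS, update the TSS busy bits and the
 * task register, set CR0.TS, load the incoming task's state from the new TSS
 * and, if required, push an error code onto the new task's stack.
 */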
int
vmexit_task_switch(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vmexit)
{
	struct seg_desc nt;
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, fault, minlimit, nt_type, ot_type;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;

	assert(paging->cpu_mode == CPU_MODE_PROTECTED);

	/*
	 * Calculate the instruction pointer to store in the old TSS.
	 */
	eip = vmexit->rip + vmexit->inst_length;

	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */

	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(vcpu, task_switch, nt_sel, &nt_desc,
	    &fault);
	CHKERR(error, fault);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(vcpu, IDT_NP, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
		minlimit = 104 - 1;
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
		minlimit = 44 - 1;
	else
		minlimit = 0;

	assert(minlimit > 0);
	if (nt.limit < (unsigned int)minlimit) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(vcpu, IDT_GP, nt_sel, ext);
		goto done;
	}

	/* Fetch the new TSS */
	error = vm_copy_setup(vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(nt_iov, &newtss, minlimit + 1);

	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR. In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(vcpu, IDT_TS, ot_sel, task_switch->ext);
		goto done;
	}

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	    &access);
	assert(error == 0);
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(vcpu, task_switch, ot_sel, &ot_desc,
	    &fault);
	CHKERR(error, fault);

	/* Get the old TSS */
	error = vm_copy_setup(vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ot_iov, &oldtss, minlimit + 1);

	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * is due to an IRET or JMP instruction.
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(vcpu, &sup_paging, ot_sel,
		    &ot_desc, &fault);
		CHKERR(error, fault);
	}

	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		EPRINTLN("Task switch to 16-bit TSS not supported");
		return (VMEXIT_ABORT);
	}

	/* Save processor state in old TSS */
	tss32_save(vcpu, task_switch, eip, &oldtss, ot_iov);

	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(vcpu, &sup_paging, nt_sel,
		    &nt_desc, &fault);
		CHKERR(error, fault);
	}

	/* Update task register to point at the new TSS */
	SETREG(vcpu, VM_REG_GUEST_TR, nt_sel);

	/* Update the hidden descriptor state of the task register */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(vcpu, VM_REG_GUEST_TR, &nt);

	/* Set CR0.TS */
	cr0 = GETREG(vcpu, VM_REG_GUEST_CR0);
	SETREG(vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);

	/*
	 * We are now committed to the task switch. Any exceptions encountered
	 * after this point will be handled in the context of the new task and
	 * the saved instruction pointer will belong to the new task.
	 */
	error = vm_set_register(vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
	assert(error == 0);

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
	    &fault);
	CHKERR(error, fault);

	/*
	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
	 * caused an error code to be generated, this error code is copied
	 * to the stack of the new task.
	 */
	if (task_switch->errcode_valid) {
		assert(task_switch->ext);
		assert(task_switch->reason == TSR_IDT_GATE);
		error = push_errcode(vcpu, &task_switch->paging, nt_type,
		    task_switch->errcode, &fault);
		CHKERR(error, fault);
	}

	/*
	 * Treatment of virtual-NMI blocking if NMI is delivered through
	 * a task gate.
	 *
	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol 3:
	 * If the virtual NMIs VM-execution control is 1, VM entry injects
	 * an NMI, and delivery of the NMI causes a task switch that causes
	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
	 * commences.
	 *
	 * Thus, virtual-NMI blocking is in effect at the time of the task
	 * switch VM exit.
	 */

	/*
	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
	 *
	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation":
	 * If the "virtual NMIs" control is 1, IRET removes any virtual-NMI
	 * blocking.  This unblocking of virtual-NMI occurs even if IRET
	 * causes a fault.
	 *
	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
	 * VM exit.
	 */

	/*
	 * If the task switch was triggered by an event delivered through
	 * the IDT then extinguish the pending event from the vcpu's
	 * exitintinfo.
	 */
	if (task_switch->reason == TSR_IDT_GATE) {
		error = vm_set_intinfo(vcpu, 0);
		assert(error == 0);
	}

	/*
	 * XXX should inject debug exception if 'T' bit is 1
	 */
done:
	return (VMEXIT_CONTINUE);
}