1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2012 Sandvine, Inc. 5 * Copyright (c) 2012 NetApp, Inc. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * $FreeBSD$ 30 */ 31 /* 32 * This file and its contents are supplied under the terms of the 33 * Common Development and Distribution License ("CDDL"), version 1.0. 34 * You may only use this file in accordance with the terms of version 35 * 1.0 of the CDDL. 36 * 37 * A full copy of the text of the CDDL should have accompanied this 38 * source. A copy of the CDDL is also available via the Internet at 39 * http://www.illumos.org/license/CDDL. 40 * 41 * Copyright 2015 Pluribus Networks Inc. 42 * Copyright 2018 Joyent, Inc. 43 * Copyright 2021 Oxide Computer Company 44 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association. 45 */ 46 47 #include <sys/cdefs.h> 48 __FBSDID("$FreeBSD$"); 49 50 #include <sys/param.h> 51 #include <sys/pcpu.h> 52 #include <sys/systm.h> 53 #include <sys/proc.h> 54 55 #include <machine/vmparam.h> 56 #include <machine/vmm.h> 57 #include <sys/vmm_kernel.h> 58 #include <sys/vmm_vm.h> 59 60 #include <sys/vmm_instruction_emul.h> 61 #include <x86/psl.h> 62 #include <x86/specialreg.h> 63 64 #include "vmm_ioport.h" 65 66 enum vie_status { 67 VIES_INIT = (1U << 0), 68 VIES_MMIO = (1U << 1), 69 VIES_INOUT = (1U << 2), 70 VIES_OTHER = (1U << 3), 71 VIES_INST_FETCH = (1U << 4), 72 VIES_INST_DECODE = (1U << 5), 73 VIES_PENDING_MMIO = (1U << 6), 74 VIES_PENDING_INOUT = (1U << 7), 75 VIES_REPEAT = (1U << 8), 76 VIES_USER_FALLBACK = (1U << 9), 77 VIES_COMPLETE = (1U << 10), 78 }; 79 80 /* State of request to perform emulated access (inout or MMIO) */ 81 enum vie_req { 82 VR_NONE, 83 VR_PENDING, 84 VR_DONE, 85 }; 86 87 struct vie_mmio { 88 uint64_t data; 89 uint64_t gpa; 90 uint8_t bytes; 91 enum vie_req state; 92 }; 93 94 struct vie_op { 95 uint8_t op_byte; /* actual opcode byte */ 96 uint8_t op_type; /* type of operation (e.g. 
MOV) */ 97 uint16_t op_flags; 98 }; 99 100 #define VIE_INST_SIZE 15 101 struct vie { 102 uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ 103 uint8_t num_valid; /* size of the instruction */ 104 uint8_t num_processed; 105 106 uint8_t addrsize:4, opsize:4; /* address and operand sizes */ 107 uint8_t rex_w:1, /* REX prefix */ 108 rex_r:1, 109 rex_x:1, 110 rex_b:1, 111 rex_present:1, 112 repz_present:1, /* REP/REPE/REPZ prefix */ 113 repnz_present:1, /* REPNE/REPNZ prefix */ 114 opsize_override:1, /* Operand size override */ 115 addrsize_override:1, /* Address size override */ 116 segment_override:1; /* Segment override */ 117 118 uint8_t mod:2, /* ModRM byte */ 119 reg:4, 120 rm:4; 121 122 uint8_t ss:2, /* SIB byte */ 123 vex_present:1, /* VEX prefixed */ 124 vex_l:1, /* L bit */ 125 index:4, /* SIB byte */ 126 base:4; /* SIB byte */ 127 128 uint8_t disp_bytes; 129 uint8_t imm_bytes; 130 131 uint8_t scale; 132 133 uint8_t vex_reg:4, /* vvvv: first source reg specifier */ 134 vex_pp:2, /* pp */ 135 _sparebits:2; 136 137 uint8_t _sparebytes[2]; 138 139 int base_register; /* VM_REG_GUEST_xyz */ 140 int index_register; /* VM_REG_GUEST_xyz */ 141 int segment_register; /* VM_REG_GUEST_xyz */ 142 143 int64_t displacement; /* optional addr displacement */ 144 int64_t immediate; /* optional immediate operand */ 145 146 struct vie_op op; /* opcode description */ 147 148 enum vie_status status; 149 150 struct vm_guest_paging paging; /* guest paging state */ 151 152 uint64_t mmio_gpa; /* faulting GPA */ 153 struct vie_mmio mmio_req_read; 154 struct vie_mmio mmio_req_write; 155 156 struct vm_inout inout; /* active in/out op */ 157 enum vie_req inout_req_state; 158 uint32_t inout_req_val; /* value from userspace */ 159 }; 160 161 162 /* struct vie_op.op_type */ 163 enum { 164 VIE_OP_TYPE_NONE = 0, 165 VIE_OP_TYPE_MOV, 166 VIE_OP_TYPE_MOVSX, 167 VIE_OP_TYPE_MOVZX, 168 VIE_OP_TYPE_MOV_CR, 169 VIE_OP_TYPE_AND, 170 VIE_OP_TYPE_OR, 171 VIE_OP_TYPE_SUB, 172 VIE_OP_TYPE_TWO_BYTE, 173 VIE_OP_TYPE_PUSH, 174 VIE_OP_TYPE_CMP, 175 VIE_OP_TYPE_POP, 176 VIE_OP_TYPE_MOVS, 177 VIE_OP_TYPE_GROUP1, 178 VIE_OP_TYPE_STOS, 179 VIE_OP_TYPE_BITTEST, 180 VIE_OP_TYPE_TWOB_GRP15, 181 VIE_OP_TYPE_ADD, 182 VIE_OP_TYPE_TEST, 183 VIE_OP_TYPE_BEXTR, 184 VIE_OP_TYPE_CLTS, 185 VIE_OP_TYPE_LAST 186 }; 187 188 /* struct vie_op.op_flags */ 189 #define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */ 190 #define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ 191 #define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */ 192 #define VIE_OP_F_NO_MODRM (1 << 3) 193 #define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) 194 #define VIE_OP_F_REG_REG (1 << 5) /* special-case for mov-cr */ 195 196 static const struct vie_op three_byte_opcodes_0f38[256] = { 197 [0xF7] = { 198 .op_byte = 0xF7, 199 .op_type = VIE_OP_TYPE_BEXTR, 200 }, 201 }; 202 203 static const struct vie_op two_byte_opcodes[256] = { 204 [0x06] = { 205 .op_byte = 0x06, 206 .op_type = VIE_OP_TYPE_CLTS, 207 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 208 }, 209 [0x20] = { 210 .op_byte = 0x20, 211 .op_type = VIE_OP_TYPE_MOV_CR, 212 .op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION 213 }, 214 [0x22] = { 215 .op_byte = 0x22, 216 .op_type = VIE_OP_TYPE_MOV_CR, 217 .op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION 218 }, 219 [0xAE] = { 220 .op_byte = 0xAE, 221 .op_type = VIE_OP_TYPE_TWOB_GRP15, 222 }, 223 [0xB6] = { 224 .op_byte = 0xB6, 225 .op_type = VIE_OP_TYPE_MOVZX, 226 }, 227 [0xB7] = { 228 .op_byte = 0xB7, 229 .op_type = 
VIE_OP_TYPE_MOVZX, 230 }, 231 [0xBA] = { 232 .op_byte = 0xBA, 233 .op_type = VIE_OP_TYPE_BITTEST, 234 .op_flags = VIE_OP_F_IMM8, 235 }, 236 [0xBE] = { 237 .op_byte = 0xBE, 238 .op_type = VIE_OP_TYPE_MOVSX, 239 }, 240 }; 241 242 static const struct vie_op one_byte_opcodes[256] = { 243 [0x03] = { 244 .op_byte = 0x03, 245 .op_type = VIE_OP_TYPE_ADD, 246 }, 247 [0x0F] = { 248 .op_byte = 0x0F, 249 .op_type = VIE_OP_TYPE_TWO_BYTE 250 }, 251 [0x0B] = { 252 .op_byte = 0x0B, 253 .op_type = VIE_OP_TYPE_OR, 254 }, 255 [0x2B] = { 256 .op_byte = 0x2B, 257 .op_type = VIE_OP_TYPE_SUB, 258 }, 259 [0x39] = { 260 .op_byte = 0x39, 261 .op_type = VIE_OP_TYPE_CMP, 262 }, 263 [0x3B] = { 264 .op_byte = 0x3B, 265 .op_type = VIE_OP_TYPE_CMP, 266 }, 267 [0x88] = { 268 .op_byte = 0x88, 269 .op_type = VIE_OP_TYPE_MOV, 270 }, 271 [0x89] = { 272 .op_byte = 0x89, 273 .op_type = VIE_OP_TYPE_MOV, 274 }, 275 [0x8A] = { 276 .op_byte = 0x8A, 277 .op_type = VIE_OP_TYPE_MOV, 278 }, 279 [0x8B] = { 280 .op_byte = 0x8B, 281 .op_type = VIE_OP_TYPE_MOV, 282 }, 283 [0xA1] = { 284 .op_byte = 0xA1, 285 .op_type = VIE_OP_TYPE_MOV, 286 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, 287 }, 288 [0xA3] = { 289 .op_byte = 0xA3, 290 .op_type = VIE_OP_TYPE_MOV, 291 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, 292 }, 293 [0xA4] = { 294 .op_byte = 0xA4, 295 .op_type = VIE_OP_TYPE_MOVS, 296 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 297 }, 298 [0xA5] = { 299 .op_byte = 0xA5, 300 .op_type = VIE_OP_TYPE_MOVS, 301 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 302 }, 303 [0xAA] = { 304 .op_byte = 0xAA, 305 .op_type = VIE_OP_TYPE_STOS, 306 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 307 }, 308 [0xAB] = { 309 .op_byte = 0xAB, 310 .op_type = VIE_OP_TYPE_STOS, 311 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 312 }, 313 [0xC6] = { 314 /* XXX Group 11 extended opcode - not just MOV */ 315 .op_byte = 0xC6, 316 .op_type = VIE_OP_TYPE_MOV, 317 .op_flags = VIE_OP_F_IMM8, 318 }, 319 [0xC7] = { 320 .op_byte = 0xC7, 321 .op_type = VIE_OP_TYPE_MOV, 322 .op_flags = VIE_OP_F_IMM, 323 }, 324 [0x23] = { 325 .op_byte = 0x23, 326 .op_type = VIE_OP_TYPE_AND, 327 }, 328 [0x80] = { 329 /* Group 1 extended opcode */ 330 .op_byte = 0x80, 331 .op_type = VIE_OP_TYPE_GROUP1, 332 .op_flags = VIE_OP_F_IMM8, 333 }, 334 [0x81] = { 335 /* Group 1 extended opcode */ 336 .op_byte = 0x81, 337 .op_type = VIE_OP_TYPE_GROUP1, 338 .op_flags = VIE_OP_F_IMM, 339 }, 340 [0x83] = { 341 /* Group 1 extended opcode */ 342 .op_byte = 0x83, 343 .op_type = VIE_OP_TYPE_GROUP1, 344 .op_flags = VIE_OP_F_IMM8, 345 }, 346 [0x8F] = { 347 /* XXX Group 1A extended opcode - not just POP */ 348 .op_byte = 0x8F, 349 .op_type = VIE_OP_TYPE_POP, 350 }, 351 [0xF6] = { 352 /* XXX Group 3 extended opcode - not just TEST */ 353 .op_byte = 0xF6, 354 .op_type = VIE_OP_TYPE_TEST, 355 .op_flags = VIE_OP_F_IMM8, 356 }, 357 [0xF7] = { 358 /* XXX Group 3 extended opcode - not just TEST */ 359 .op_byte = 0xF7, 360 .op_type = VIE_OP_TYPE_TEST, 361 .op_flags = VIE_OP_F_IMM, 362 }, 363 [0xFF] = { 364 /* XXX Group 5 extended opcode - not just PUSH */ 365 .op_byte = 0xFF, 366 .op_type = VIE_OP_TYPE_PUSH, 367 } 368 }; 369 370 /* struct vie.mod */ 371 #define VIE_MOD_INDIRECT 0 372 #define VIE_MOD_INDIRECT_DISP8 1 373 #define VIE_MOD_INDIRECT_DISP32 2 374 #define VIE_MOD_DIRECT 3 375 376 /* struct vie.rm */ 377 #define VIE_RM_SIB 4 378 #define VIE_RM_DISP32 5 379 380 #define GB (1024 * 1024 * 1024) 381 382 383 /* 384 * Paging defines, previously pulled in 
from machine/pmap.h 385 */ 386 #define PG_V (1 << 0) /* Present */ 387 #define PG_RW (1 << 1) /* Read/Write */ 388 #define PG_U (1 << 2) /* User/Supervisor */ 389 #define PG_A (1 << 5) /* Accessed */ 390 #define PG_M (1 << 6) /* Dirty */ 391 #define PG_PS (1 << 7) /* Largepage */ 392 393 /* 394 * Paging except defines, previously pulled in from machine/pmap.h 395 */ 396 #define PGEX_P (1 << 0) /* Non-present/Protection */ 397 #define PGEX_W (1 << 1) /* Read/Write */ 398 #define PGEX_U (1 << 2) /* User/Supervisor */ 399 #define PGEX_RSV (1 << 3) /* (Non-)Reserved */ 400 #define PGEX_I (1 << 4) /* Instruction */ 401 402 403 static enum vm_reg_name gpr_map[16] = { 404 VM_REG_GUEST_RAX, 405 VM_REG_GUEST_RCX, 406 VM_REG_GUEST_RDX, 407 VM_REG_GUEST_RBX, 408 VM_REG_GUEST_RSP, 409 VM_REG_GUEST_RBP, 410 VM_REG_GUEST_RSI, 411 VM_REG_GUEST_RDI, 412 VM_REG_GUEST_R8, 413 VM_REG_GUEST_R9, 414 VM_REG_GUEST_R10, 415 VM_REG_GUEST_R11, 416 VM_REG_GUEST_R12, 417 VM_REG_GUEST_R13, 418 VM_REG_GUEST_R14, 419 VM_REG_GUEST_R15 420 }; 421 422 static enum vm_reg_name cr_map[16] = { 423 VM_REG_GUEST_CR0, 424 VM_REG_LAST, 425 VM_REG_GUEST_CR2, 426 VM_REG_GUEST_CR3, 427 VM_REG_GUEST_CR4, 428 VM_REG_LAST, 429 VM_REG_LAST, 430 VM_REG_LAST, 431 VM_REG_LAST, 432 VM_REG_LAST, 433 VM_REG_LAST, 434 VM_REG_LAST, 435 VM_REG_LAST, 436 VM_REG_LAST, 437 VM_REG_LAST, 438 VM_REG_LAST 439 }; 440 441 static uint64_t size2mask[] = { 442 [1] = 0xff, 443 [2] = 0xffff, 444 [4] = 0xffffffff, 445 [8] = 0xffffffffffffffff, 446 }; 447 448 449 static int vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid, 450 uint64_t gpa, uint64_t *rval, int bytes); 451 static int vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid, 452 uint64_t gpa, uint64_t wval, int bytes); 453 static int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, 454 struct seg_desc *desc, uint64_t offset, int length, int addrsize, 455 int prot, uint64_t *gla); 456 static int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla); 457 static int vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, 458 uint64_t gla); 459 static uint64_t vie_size2mask(int size); 460 461 struct vie * 462 vie_alloc() 463 { 464 return (kmem_zalloc(sizeof (struct vie), KM_SLEEP)); 465 } 466 467 void 468 vie_free(struct vie *vie) 469 { 470 kmem_free(vie, sizeof (struct vie)); 471 } 472 473 enum vm_reg_name 474 vie_regnum_map(uint8_t regnum) 475 { 476 VERIFY3U(regnum, <, 16); 477 return (gpr_map[regnum]); 478 } 479 480 static void 481 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) 482 { 483 *lhbr = 0; 484 *reg = gpr_map[vie->reg]; 485 486 /* 487 * 64-bit mode imposes limitations on accessing legacy high byte 488 * registers (lhbr). 489 * 490 * The legacy high-byte registers cannot be addressed if the REX 491 * prefix is present. In this case the values 4, 5, 6 and 7 of the 492 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. 493 * 494 * If the REX prefix is not present then the values 4, 5, 6 and 7 495 * of the 'ModRM:reg' field address the legacy high-byte registers, 496 * %ah, %ch, %dh and %bh respectively. 
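	 *
	 * For example, with no REX prefix a ModRM:reg value of 4 maps to
	 * gpr_map[4 & 0x3] (%rax) with 'lhbr' set, i.e. %ah; the same
	 * encoding with a REX prefix present selects %spl instead.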
	 */
	if (!vie->rex_present) {
		if (vie->reg & 0x4) {
			*lhbr = 1;
			*reg = gpr_map[vie->reg & 0x3];
		}
	}
}

static int
vie_read_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t *rval)
{
	uint64_t val;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &val);

	/*
	 * To obtain the value of a legacy high byte register shift the
	 * base register right by 8 bits (%ah = %rax >> 8).
	 */
	if (lhbr)
		*rval = val >> 8;
	else
		*rval = val;
	return (error);
}

static int
vie_write_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t byte)
{
	uint64_t origval, val, mask;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &origval);
	if (error == 0) {
		val = byte;
		mask = 0xff;
		if (lhbr) {
			/*
			 * Shift left by 8 to store 'byte' in a legacy high
			 * byte register.
			 */
			val <<= 8;
			mask <<= 8;
		}
		val |= origval & ~mask;
		error = vm_set_register(vm, vcpuid, reg, val);
	}
	return (error);
}

static int
vie_update_register(struct vm *vm, int vcpuid, enum vm_reg_name reg,
    uint64_t val, int size)
{
	int error;
	uint64_t origval;

	switch (size) {
	case 1:
	case 2:
		error = vm_get_register(vm, vcpuid, reg, &origval);
		if (error)
			return (error);
		val &= size2mask[size];
		val |= origval & ~size2mask[size];
		break;
	case 4:
		val &= 0xffffffffUL;
		break;
	case 8:
		break;
	default:
		return (EINVAL);
	}

	error = vm_set_register(vm, vcpuid, reg, val);
	return (error);
}

static int
vie_repeat(struct vie *vie)
{
	vie->status |= VIES_REPEAT;

	/*
	 * Clear out any cached operation values so the repeated instruction
	 * can begin without using that stale state.  Other state, such as
	 * the decoding results, is kept around since it does not vary between
	 * iterations of a rep-prefixed instruction.
	 */
	if ((vie->status & VIES_MMIO) != 0) {
		vie->mmio_req_read.state = VR_NONE;
		vie->mmio_req_write.state = VR_NONE;
	} else if ((vie->status & VIES_INOUT) != 0) {
		vie->inout_req_state = VR_NONE;
	} else {
		panic("unexpected emulation state");
	}

	return (EAGAIN);
}

#define	RFLAGS_STATUS_BITS	(PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)

/*
 * Return the status flags that would result from doing (x - y).
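 *
 * For example, getcc32(1, 1) performs the subtraction on scratch copies and
 * returns an %rflags snapshot with PSL_Z set, while getcc32(0, 1) returns one
 * with PSL_C and PSL_N set.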
609 */ 610 /* BEGIN CSTYLED */ 611 #define GETCC(sz) \ 612 static ulong_t \ 613 getcc##sz(uint##sz##_t x, uint##sz##_t y) \ 614 { \ 615 ulong_t rflags; \ 616 \ 617 __asm __volatile("sub %2,%1; pushfq; popq %0" : \ 618 "=r" (rflags), "+r" (x) : "m" (y)); \ 619 return (rflags); \ 620 } struct __hack 621 /* END CSTYLED */ 622 623 GETCC(8); 624 GETCC(16); 625 GETCC(32); 626 GETCC(64); 627 628 static ulong_t 629 getcc(int opsize, uint64_t x, uint64_t y) 630 { 631 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, 632 ("getcc: invalid operand size %d", opsize)); 633 634 if (opsize == 1) 635 return (getcc8(x, y)); 636 else if (opsize == 2) 637 return (getcc16(x, y)); 638 else if (opsize == 4) 639 return (getcc32(x, y)); 640 else 641 return (getcc64(x, y)); 642 } 643 644 /* 645 * Macro creation of functions getaddflags{8,16,32,64} 646 */ 647 /* BEGIN CSTYLED */ 648 #define GETADDFLAGS(sz) \ 649 static ulong_t \ 650 getaddflags##sz(uint##sz##_t x, uint##sz##_t y) \ 651 { \ 652 ulong_t rflags; \ 653 \ 654 __asm __volatile("add %2,%1; pushfq; popq %0" : \ 655 "=r" (rflags), "+r" (x) : "m" (y)); \ 656 return (rflags); \ 657 } struct __hack 658 /* END CSTYLED */ 659 660 GETADDFLAGS(8); 661 GETADDFLAGS(16); 662 GETADDFLAGS(32); 663 GETADDFLAGS(64); 664 665 static ulong_t 666 getaddflags(int opsize, uint64_t x, uint64_t y) 667 { 668 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, 669 ("getaddflags: invalid operand size %d", opsize)); 670 671 if (opsize == 1) 672 return (getaddflags8(x, y)); 673 else if (opsize == 2) 674 return (getaddflags16(x, y)); 675 else if (opsize == 4) 676 return (getaddflags32(x, y)); 677 else 678 return (getaddflags64(x, y)); 679 } 680 681 /* 682 * Return the status flags that would result from doing (x & y). 
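 *
 * For example, getandflags32(0xf0, 0x0f) produces a zero result, so the
 * returned flags have PSL_Z set while PSL_C and PSL_V are clear (AND always
 * clears CF and OF).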
683 */ 684 /* BEGIN CSTYLED */ 685 #define GETANDFLAGS(sz) \ 686 static ulong_t \ 687 getandflags##sz(uint##sz##_t x, uint##sz##_t y) \ 688 { \ 689 ulong_t rflags; \ 690 \ 691 __asm __volatile("and %2,%1; pushfq; popq %0" : \ 692 "=r" (rflags), "+r" (x) : "m" (y)); \ 693 return (rflags); \ 694 } struct __hack 695 /* END CSTYLED */ 696 697 GETANDFLAGS(8); 698 GETANDFLAGS(16); 699 GETANDFLAGS(32); 700 GETANDFLAGS(64); 701 702 static ulong_t 703 getandflags(int opsize, uint64_t x, uint64_t y) 704 { 705 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, 706 ("getandflags: invalid operand size %d", opsize)); 707 708 if (opsize == 1) 709 return (getandflags8(x, y)); 710 else if (opsize == 2) 711 return (getandflags16(x, y)); 712 else if (opsize == 4) 713 return (getandflags32(x, y)); 714 else 715 return (getandflags64(x, y)); 716 } 717 718 static int 719 vie_emulate_mov_cr(struct vie *vie, struct vm *vm, int vcpuid) 720 { 721 uint64_t val; 722 int err; 723 enum vm_reg_name gpr = gpr_map[vie->rm]; 724 enum vm_reg_name cr = cr_map[vie->reg]; 725 726 uint_t size = 4; 727 if (vie->paging.cpu_mode == CPU_MODE_64BIT) { 728 size = 8; 729 } 730 731 switch (vie->op.op_byte) { 732 case 0x20: 733 /* 734 * MOV control register (ModRM:reg) to reg (ModRM:r/m) 735 * 20/r: mov r32, CR0-CR7 736 * 20/r: mov r64, CR0-CR7 737 * REX.R + 20/0: mov r64, CR8 738 */ 739 if (vie->paging.cpl != 0) { 740 vm_inject_gp(vm, vcpuid); 741 vie->num_processed = 0; 742 return (0); 743 } 744 err = vm_get_register(vm, vcpuid, cr, &val); 745 if (err != 0) { 746 /* #UD for access to non-existent CRs */ 747 vm_inject_ud(vm, vcpuid); 748 vie->num_processed = 0; 749 return (0); 750 } 751 err = vie_update_register(vm, vcpuid, gpr, val, size); 752 break; 753 case 0x22: { 754 /* 755 * MOV reg (ModRM:r/m) to control register (ModRM:reg) 756 * 22/r: mov CR0-CR7, r32 757 * 22/r: mov CR0-CR7, r64 758 * REX.R + 22/0: mov CR8, r64 759 */ 760 uint64_t old, diff; 761 762 if (vie->paging.cpl != 0) { 763 vm_inject_gp(vm, vcpuid); 764 vie->num_processed = 0; 765 return (0); 766 } 767 err = vm_get_register(vm, vcpuid, cr, &old); 768 if (err != 0) { 769 /* #UD for access to non-existent CRs */ 770 vm_inject_ud(vm, vcpuid); 771 vie->num_processed = 0; 772 return (0); 773 } 774 err = vm_get_register(vm, vcpuid, gpr, &val); 775 VERIFY0(err); 776 val &= size2mask[size]; 777 diff = old ^ val; 778 779 switch (cr) { 780 case VM_REG_GUEST_CR0: 781 if ((diff & CR0_PG) != 0) { 782 uint64_t efer; 783 784 err = vm_get_register(vm, vcpuid, 785 VM_REG_GUEST_EFER, &efer); 786 VERIFY0(err); 787 788 /* Keep the long-mode state in EFER in sync */ 789 if ((val & CR0_PG) != 0 && 790 (efer & EFER_LME) != 0) { 791 efer |= EFER_LMA; 792 } 793 if ((val & CR0_PG) == 0 && 794 (efer & EFER_LME) != 0) { 795 efer &= ~EFER_LMA; 796 } 797 798 err = vm_set_register(vm, vcpuid, 799 VM_REG_GUEST_EFER, efer); 800 VERIFY0(err); 801 } 802 /* TODO: enforce more of the #GP checks */ 803 err = vm_set_register(vm, vcpuid, cr, val); 804 VERIFY0(err); 805 break; 806 case VM_REG_GUEST_CR2: 807 case VM_REG_GUEST_CR3: 808 case VM_REG_GUEST_CR4: 809 /* TODO: enforce more of the #GP checks */ 810 err = vm_set_register(vm, vcpuid, cr, val); 811 break; 812 default: 813 /* The cr_map mapping should prevent this */ 814 panic("invalid cr %d", cr); 815 } 816 break; 817 } 818 default: 819 return (EINVAL); 820 } 821 return (err); 822 } 823 824 static int 825 vie_emulate_mov(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 826 { 827 int error, size; 828 enum vm_reg_name reg; 829 
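	/* 'byte' carries the 8-bit register source for the 88/r store case */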
uint8_t byte; 830 uint64_t val; 831 832 size = vie->opsize; 833 error = EINVAL; 834 835 switch (vie->op.op_byte) { 836 case 0x88: 837 /* 838 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) 839 * 88/r: mov r/m8, r8 840 * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) 841 */ 842 size = 1; /* override for byte operation */ 843 error = vie_read_bytereg(vie, vm, vcpuid, &byte); 844 if (error == 0) { 845 error = vie_mmio_write(vie, vm, vcpuid, gpa, byte, 846 size); 847 } 848 break; 849 case 0x89: 850 /* 851 * MOV from reg (ModRM:reg) to mem (ModRM:r/m) 852 * 89/r: mov r/m16, r16 853 * 89/r: mov r/m32, r32 854 * REX.W + 89/r mov r/m64, r64 855 */ 856 reg = gpr_map[vie->reg]; 857 error = vm_get_register(vm, vcpuid, reg, &val); 858 if (error == 0) { 859 val &= size2mask[size]; 860 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); 861 } 862 break; 863 case 0x8A: 864 /* 865 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) 866 * 8A/r: mov r8, r/m8 867 * REX + 8A/r: mov r8, r/m8 868 */ 869 size = 1; /* override for byte operation */ 870 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size); 871 if (error == 0) 872 error = vie_write_bytereg(vie, vm, vcpuid, val); 873 break; 874 case 0x8B: 875 /* 876 * MOV from mem (ModRM:r/m) to reg (ModRM:reg) 877 * 8B/r: mov r16, r/m16 878 * 8B/r: mov r32, r/m32 879 * REX.W 8B/r: mov r64, r/m64 880 */ 881 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size); 882 if (error == 0) { 883 reg = gpr_map[vie->reg]; 884 error = vie_update_register(vm, vcpuid, reg, val, size); 885 } 886 break; 887 case 0xA1: 888 /* 889 * MOV from seg:moffset to AX/EAX/RAX 890 * A1: mov AX, moffs16 891 * A1: mov EAX, moffs32 892 * REX.W + A1: mov RAX, moffs64 893 */ 894 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size); 895 if (error == 0) { 896 reg = VM_REG_GUEST_RAX; 897 error = vie_update_register(vm, vcpuid, reg, val, size); 898 } 899 break; 900 case 0xA3: 901 /* 902 * MOV from AX/EAX/RAX to seg:moffset 903 * A3: mov moffs16, AX 904 * A3: mov moffs32, EAX 905 * REX.W + A3: mov moffs64, RAX 906 */ 907 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); 908 if (error == 0) { 909 val &= size2mask[size]; 910 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); 911 } 912 break; 913 case 0xC6: 914 /* 915 * MOV from imm8 to mem (ModRM:r/m) 916 * C6/0 mov r/m8, imm8 917 * REX + C6/0 mov r/m8, imm8 918 */ 919 size = 1; /* override for byte operation */ 920 val = vie->immediate; 921 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); 922 break; 923 case 0xC7: 924 /* 925 * MOV from imm16/imm32 to mem (ModRM:r/m) 926 * C7/0 mov r/m16, imm16 927 * C7/0 mov r/m32, imm32 928 * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) 929 */ 930 val = vie->immediate & size2mask[size]; 931 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); 932 break; 933 default: 934 break; 935 } 936 937 return (error); 938 } 939 940 static int 941 vie_emulate_movx(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 942 { 943 int error, size; 944 enum vm_reg_name reg; 945 uint64_t val; 946 947 size = vie->opsize; 948 error = EINVAL; 949 950 switch (vie->op.op_byte) { 951 case 0xB6: 952 /* 953 * MOV and zero extend byte from mem (ModRM:r/m) to 954 * reg (ModRM:reg). 
955 * 956 * 0F B6/r movzx r16, r/m8 957 * 0F B6/r movzx r32, r/m8 958 * REX.W + 0F B6/r movzx r64, r/m8 959 */ 960 961 /* get the first operand */ 962 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1); 963 if (error) 964 break; 965 966 /* get the second operand */ 967 reg = gpr_map[vie->reg]; 968 969 /* zero-extend byte */ 970 val = (uint8_t)val; 971 972 /* write the result */ 973 error = vie_update_register(vm, vcpuid, reg, val, size); 974 break; 975 case 0xB7: 976 /* 977 * MOV and zero extend word from mem (ModRM:r/m) to 978 * reg (ModRM:reg). 979 * 980 * 0F B7/r movzx r32, r/m16 981 * REX.W + 0F B7/r movzx r64, r/m16 982 */ 983 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 2); 984 if (error) 985 return (error); 986 987 reg = gpr_map[vie->reg]; 988 989 /* zero-extend word */ 990 val = (uint16_t)val; 991 992 error = vie_update_register(vm, vcpuid, reg, val, size); 993 break; 994 case 0xBE: 995 /* 996 * MOV and sign extend byte from mem (ModRM:r/m) to 997 * reg (ModRM:reg). 998 * 999 * 0F BE/r movsx r16, r/m8 1000 * 0F BE/r movsx r32, r/m8 1001 * REX.W + 0F BE/r movsx r64, r/m8 1002 */ 1003 1004 /* get the first operand */ 1005 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1); 1006 if (error) 1007 break; 1008 1009 /* get the second operand */ 1010 reg = gpr_map[vie->reg]; 1011 1012 /* sign extend byte */ 1013 val = (int8_t)val; 1014 1015 /* write the result */ 1016 error = vie_update_register(vm, vcpuid, reg, val, size); 1017 break; 1018 default: 1019 break; 1020 } 1021 return (error); 1022 } 1023 1024 /* 1025 * Helper function to calculate and validate a linear address. 1026 */ 1027 static int 1028 vie_get_gla(struct vie *vie, struct vm *vm, int vcpuid, int opsize, 1029 int addrsize, int prot, enum vm_reg_name seg, enum vm_reg_name gpr, 1030 uint64_t *gla) 1031 { 1032 struct seg_desc desc; 1033 uint64_t cr0, val, rflags; 1034 int error; 1035 struct vm_guest_paging *paging; 1036 1037 paging = &vie->paging; 1038 1039 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); 1040 KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); 1041 1042 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1043 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 1044 1045 error = vm_get_seg_desc(vm, vcpuid, seg, &desc); 1046 KASSERT(error == 0, ("%s: error %d getting segment descriptor %d", 1047 __func__, error, seg)); 1048 1049 error = vm_get_register(vm, vcpuid, gpr, &val); 1050 KASSERT(error == 0, ("%s: error %d getting register %d", __func__, 1051 error, gpr)); 1052 1053 if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize, 1054 addrsize, prot, gla)) { 1055 if (seg == VM_REG_GUEST_SS) 1056 vm_inject_ss(vm, vcpuid, 0); 1057 else 1058 vm_inject_gp(vm, vcpuid); 1059 return (-1); 1060 } 1061 1062 if (vie_canonical_check(paging->cpu_mode, *gla)) { 1063 if (seg == VM_REG_GUEST_SS) 1064 vm_inject_ss(vm, vcpuid, 0); 1065 else 1066 vm_inject_gp(vm, vcpuid); 1067 return (-1); 1068 } 1069 1070 if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { 1071 vm_inject_ac(vm, vcpuid, 0); 1072 return (-1); 1073 } 1074 1075 return (0); 1076 } 1077 1078 static int 1079 vie_emulate_movs(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1080 { 1081 struct vm_copyinfo copyinfo[2]; 1082 uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; 1083 uint64_t rcx, rdi, rsi, rflags; 1084 int error, fault, opsize, seg, repeat; 1085 struct vm_guest_paging *paging; 1086 1087 opsize = (vie->op.op_byte == 0xA4) ? 
1 : vie->opsize; 1088 val = 0; 1089 error = 0; 1090 paging = &vie->paging; 1091 1092 /* 1093 * XXX although the MOVS instruction is only supposed to be used with 1094 * the "rep" prefix some guests like FreeBSD will use "repnz" instead. 1095 * 1096 * Empirically the "repnz" prefix has identical behavior to "rep" 1097 * and the zero flag does not make a difference. 1098 */ 1099 repeat = vie->repz_present | vie->repnz_present; 1100 1101 if (repeat) { 1102 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); 1103 KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); 1104 1105 /* 1106 * The count register is %rcx, %ecx or %cx depending on the 1107 * address size of the instruction. 1108 */ 1109 if ((rcx & vie_size2mask(vie->addrsize)) == 0) { 1110 error = 0; 1111 goto done; 1112 } 1113 } 1114 1115 /* 1116 * Source Destination Comments 1117 * -------------------------------------------- 1118 * (1) memory memory n/a 1119 * (2) memory mmio emulated 1120 * (3) mmio memory emulated 1121 * (4) mmio mmio emulated 1122 * 1123 * At this point we don't have sufficient information to distinguish 1124 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this 1125 * out because it will succeed only when operating on regular memory. 1126 * 1127 * XXX the emulation doesn't properly handle the case where 'gpa' 1128 * is straddling the boundary between the normal memory and MMIO. 1129 */ 1130 1131 seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS; 1132 if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize, PROT_READ, seg, 1133 VM_REG_GUEST_RSI, &srcaddr) != 0) { 1134 goto done; 1135 } 1136 1137 error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ, 1138 copyinfo, nitems(copyinfo), &fault); 1139 if (error == 0) { 1140 if (fault) 1141 goto done; /* Resume guest to handle fault */ 1142 1143 /* 1144 * case (2): read from system memory and write to mmio. 1145 */ 1146 vm_copyin(vm, vcpuid, copyinfo, &val, opsize); 1147 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 1148 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize); 1149 if (error) 1150 goto done; 1151 } else { 1152 /* 1153 * 'vm_copy_setup()' is expected to fail for cases (3) and (4) 1154 * if 'srcaddr' is in the mmio space. 1155 */ 1156 1157 if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize, 1158 PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, 1159 &dstaddr) != 0) { 1160 goto done; 1161 } 1162 1163 error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, 1164 PROT_WRITE, copyinfo, nitems(copyinfo), &fault); 1165 if (error == 0) { 1166 if (fault) 1167 goto done; /* Resume guest to handle fault */ 1168 1169 /* 1170 * case (3): read from MMIO and write to system memory. 1171 * 1172 * A MMIO read can have side-effects so we 1173 * commit to it only after vm_copy_setup() is 1174 * successful. If a page-fault needs to be 1175 * injected into the guest then it will happen 1176 * before the MMIO read is attempted. 1177 */ 1178 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1179 opsize); 1180 1181 if (error == 0) { 1182 vm_copyout(vm, vcpuid, &val, copyinfo, opsize); 1183 } 1184 /* 1185 * Regardless of whether the MMIO read was successful or 1186 * not, the copy resources must be cleaned up. 1187 */ 1188 vm_copy_teardown(vm, vcpuid, copyinfo, 1189 nitems(copyinfo)); 1190 if (error != 0) { 1191 goto done; 1192 } 1193 } else { 1194 /* 1195 * Case (4): read from and write to mmio. 
1196 * 1197 * Commit to the MMIO read/write (with potential 1198 * side-effects) only after we are sure that the 1199 * instruction is not going to be restarted due 1200 * to address translation faults. 1201 */ 1202 error = vm_gla2gpa(vm, vcpuid, paging, srcaddr, 1203 PROT_READ, &srcgpa, &fault); 1204 if (error || fault) 1205 goto done; 1206 1207 error = vm_gla2gpa(vm, vcpuid, paging, dstaddr, 1208 PROT_WRITE, &dstgpa, &fault); 1209 if (error || fault) 1210 goto done; 1211 1212 error = vie_mmio_read(vie, vm, vcpuid, srcgpa, &val, 1213 opsize); 1214 if (error) 1215 goto done; 1216 1217 error = vie_mmio_write(vie, vm, vcpuid, dstgpa, val, 1218 opsize); 1219 if (error) 1220 goto done; 1221 } 1222 } 1223 1224 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi); 1225 KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); 1226 1227 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); 1228 KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); 1229 1230 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1231 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 1232 1233 if (rflags & PSL_D) { 1234 rsi -= opsize; 1235 rdi -= opsize; 1236 } else { 1237 rsi += opsize; 1238 rdi += opsize; 1239 } 1240 1241 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi, 1242 vie->addrsize); 1243 KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error)); 1244 1245 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, 1246 vie->addrsize); 1247 KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); 1248 1249 if (repeat) { 1250 rcx = rcx - 1; 1251 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, 1252 rcx, vie->addrsize); 1253 KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); 1254 1255 /* 1256 * Repeat the instruction if the count register is not zero. 1257 */ 1258 if ((rcx & vie_size2mask(vie->addrsize)) != 0) 1259 return (vie_repeat(vie)); 1260 } 1261 done: 1262 return (error); 1263 } 1264 1265 static int 1266 vie_emulate_stos(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1267 { 1268 int error, opsize, repeat; 1269 uint64_t val; 1270 uint64_t rcx, rdi, rflags; 1271 1272 opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize; 1273 repeat = vie->repz_present | vie->repnz_present; 1274 1275 if (repeat) { 1276 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); 1277 KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); 1278 1279 /* 1280 * The count register is %rcx, %ecx or %cx depending on the 1281 * address size of the instruction. 
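		 * For example, with a 32-bit address size only %ecx (the low
		 * 32 bits of %rcx) is consulted by the zero check below.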
1282 */ 1283 if ((rcx & vie_size2mask(vie->addrsize)) == 0) 1284 return (0); 1285 } 1286 1287 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); 1288 KASSERT(!error, ("%s: error %d getting rax", __func__, error)); 1289 1290 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize); 1291 if (error) 1292 return (error); 1293 1294 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); 1295 KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); 1296 1297 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1298 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 1299 1300 if (rflags & PSL_D) 1301 rdi -= opsize; 1302 else 1303 rdi += opsize; 1304 1305 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, 1306 vie->addrsize); 1307 KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); 1308 1309 if (repeat) { 1310 rcx = rcx - 1; 1311 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, 1312 rcx, vie->addrsize); 1313 KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); 1314 1315 /* 1316 * Repeat the instruction if the count register is not zero. 1317 */ 1318 if ((rcx & vie_size2mask(vie->addrsize)) != 0) 1319 return (vie_repeat(vie)); 1320 } 1321 1322 return (0); 1323 } 1324 1325 static int 1326 vie_emulate_and(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1327 { 1328 int error, size; 1329 enum vm_reg_name reg; 1330 uint64_t result, rflags, rflags2, val1, val2; 1331 1332 size = vie->opsize; 1333 error = EINVAL; 1334 1335 switch (vie->op.op_byte) { 1336 case 0x23: 1337 /* 1338 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the 1339 * result in reg. 1340 * 1341 * 23/r and r16, r/m16 1342 * 23/r and r32, r/m32 1343 * REX.W + 23/r and r64, r/m64 1344 */ 1345 1346 /* get the first operand */ 1347 reg = gpr_map[vie->reg]; 1348 error = vm_get_register(vm, vcpuid, reg, &val1); 1349 if (error) 1350 break; 1351 1352 /* get the second operand */ 1353 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); 1354 if (error) 1355 break; 1356 1357 /* perform the operation and write the result */ 1358 result = val1 & val2; 1359 error = vie_update_register(vm, vcpuid, reg, result, size); 1360 break; 1361 case 0x81: 1362 case 0x83: 1363 /* 1364 * AND mem (ModRM:r/m) with immediate and store the 1365 * result in mem. 1366 * 1367 * 81 /4 and r/m16, imm16 1368 * 81 /4 and r/m32, imm32 1369 * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 1370 * 1371 * 83 /4 and r/m16, imm8 sign-extended to 16 1372 * 83 /4 and r/m32, imm8 sign-extended to 32 1373 * REX.W + 83/4 and r/m64, imm8 sign-extended to 64 1374 */ 1375 1376 /* get the first operand */ 1377 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size); 1378 if (error) 1379 break; 1380 1381 /* 1382 * perform the operation with the pre-fetched immediate 1383 * operand and write the result 1384 */ 1385 result = val1 & vie->immediate; 1386 error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size); 1387 break; 1388 default: 1389 break; 1390 } 1391 if (error) 1392 return (error); 1393 1394 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1395 if (error) 1396 return (error); 1397 1398 /* 1399 * OF and CF are cleared; the SF, ZF and PF flags are set according 1400 * to the result; AF is undefined. 1401 * 1402 * The updated status flags are obtained by subtracting 0 from 'result'. 
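	 * Subtracting zero leaves 'result' unchanged, so getcc() simply
	 * reports the ZF/SF/PF implied by 'result' itself; only those bits
	 * are merged back into %rflags below.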
1403 */ 1404 rflags2 = getcc(size, result, 0); 1405 rflags &= ~RFLAGS_STATUS_BITS; 1406 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); 1407 1408 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); 1409 return (error); 1410 } 1411 1412 static int 1413 vie_emulate_or(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1414 { 1415 int error, size; 1416 enum vm_reg_name reg; 1417 uint64_t result, rflags, rflags2, val1, val2; 1418 1419 size = vie->opsize; 1420 error = EINVAL; 1421 1422 switch (vie->op.op_byte) { 1423 case 0x0B: 1424 /* 1425 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the 1426 * result in reg. 1427 * 1428 * 0b/r or r16, r/m16 1429 * 0b/r or r32, r/m32 1430 * REX.W + 0b/r or r64, r/m64 1431 */ 1432 1433 /* get the first operand */ 1434 reg = gpr_map[vie->reg]; 1435 error = vm_get_register(vm, vcpuid, reg, &val1); 1436 if (error) 1437 break; 1438 1439 /* get the second operand */ 1440 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); 1441 if (error) 1442 break; 1443 1444 /* perform the operation and write the result */ 1445 result = val1 | val2; 1446 error = vie_update_register(vm, vcpuid, reg, result, size); 1447 break; 1448 case 0x81: 1449 case 0x83: 1450 /* 1451 * OR mem (ModRM:r/m) with immediate and store the 1452 * result in mem. 1453 * 1454 * 81 /1 or r/m16, imm16 1455 * 81 /1 or r/m32, imm32 1456 * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64 1457 * 1458 * 83 /1 or r/m16, imm8 sign-extended to 16 1459 * 83 /1 or r/m32, imm8 sign-extended to 32 1460 * REX.W + 83/1 or r/m64, imm8 sign-extended to 64 1461 */ 1462 1463 /* get the first operand */ 1464 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size); 1465 if (error) 1466 break; 1467 1468 /* 1469 * perform the operation with the pre-fetched immediate 1470 * operand and write the result 1471 */ 1472 result = val1 | vie->immediate; 1473 error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size); 1474 break; 1475 default: 1476 break; 1477 } 1478 if (error) 1479 return (error); 1480 1481 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1482 if (error) 1483 return (error); 1484 1485 /* 1486 * OF and CF are cleared; the SF, ZF and PF flags are set according 1487 * to the result; AF is undefined. 1488 * 1489 * The updated status flags are obtained by subtracting 0 from 'result'. 1490 */ 1491 rflags2 = getcc(size, result, 0); 1492 rflags &= ~RFLAGS_STATUS_BITS; 1493 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); 1494 1495 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); 1496 return (error); 1497 } 1498 1499 static int 1500 vie_emulate_cmp(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1501 { 1502 int error, size; 1503 uint64_t regop, memop, op1, op2, rflags, rflags2; 1504 enum vm_reg_name reg; 1505 1506 size = vie->opsize; 1507 switch (vie->op.op_byte) { 1508 case 0x39: 1509 case 0x3B: 1510 /* 1511 * 39/r CMP r/m16, r16 1512 * 39/r CMP r/m32, r32 1513 * REX.W 39/r CMP r/m64, r64 1514 * 1515 * 3B/r CMP r16, r/m16 1516 * 3B/r CMP r32, r/m32 1517 * REX.W + 3B/r CMP r64, r/m64 1518 * 1519 * Compare the first operand with the second operand and 1520 * set status flags in EFLAGS register. The comparison is 1521 * performed by subtracting the second operand from the first 1522 * operand and then setting the status flags. 
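		 * For example, equal operands set ZF, and an op1 that is
		 * below op2 (unsigned) sets CF; the difference itself is
		 * discarded.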
		 */

		/* Get the register operand */
		reg = gpr_map[vie->reg];
		error = vm_get_register(vm, vcpuid, reg, &regop);
		if (error)
			return (error);

		/* Get the memory operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &memop, size);
		if (error)
			return (error);

		if (vie->op.op_byte == 0x3B) {
			op1 = regop;
			op2 = memop;
		} else {
			op1 = memop;
			op2 = regop;
		}
		rflags2 = getcc(size, op1, op2);
		break;
	case 0x80:
	case 0x81:
	case 0x83:
		/*
		 * 80 /7		cmp r/m8, imm8
		 * REX + 80 /7		cmp r/m8, imm8
		 *
		 * 81 /7		cmp r/m16, imm16
		 * 81 /7		cmp r/m32, imm32
		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
		 *
		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
		 *
		 * Compare mem (ModRM:r/m) with immediate and set
		 * status flags according to the results.  The
		 * comparison is performed by subtracting the
		 * immediate from the first operand and then setting
		 * the status flags.
		 */
		if (vie->op.op_byte == 0x80)
			size = 1;

		/* get the first operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
		if (error)
			return (error);

		rflags2 = getcc(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & RFLAGS_STATUS_BITS;

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
vie_emulate_test(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	uint64_t op1, rflags, rflags2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xF6:
		/*
		 * F6 /0		test r/m8, imm8
		 *
		 * Test mem (ModRM:r/m) with immediate and set status
		 * flags according to the results.  The comparison is
		 * performed by ANDing the immediate with the first
		 * operand and then setting the status flags.
		 */
		if ((vie->reg & 7) != 0)
			return (EINVAL);

		size = 1;	/* override for byte operation */

		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
		if (error)
			return (error);

		rflags2 = getandflags(size, op1, vie->immediate);
		break;
	case 0xF7:
		/*
		 * F7 /0		test r/m16, imm16
		 * F7 /0		test r/m32, imm32
		 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
		 *
		 * Test mem (ModRM:r/m) with immediate and set status
		 * flags according to the results.  The comparison is
		 * performed by ANDing the immediate with the first
		 * operand and then setting the status flags.
		 */
		if ((vie->reg & 7) != 0)
			return (EINVAL);

		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
		if (error)
			return (error);

		rflags2 = getandflags(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
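	 * Clearing RFLAGS_STATUS_BITS and merging back only PSL_PF, PSL_Z
	 * and PSL_N below leaves OF and CF cleared, as required.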
1650 */ 1651 rflags &= ~RFLAGS_STATUS_BITS; 1652 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); 1653 1654 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); 1655 return (error); 1656 } 1657 1658 static int 1659 vie_emulate_bextr(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1660 { 1661 uint64_t src1, src2, dst, rflags; 1662 unsigned start, len; 1663 int error, size; 1664 struct vm_guest_paging *paging; 1665 1666 size = vie->opsize; 1667 error = EINVAL; 1668 paging = &vie->paging; 1669 1670 /* 1671 * VEX.LZ.0F38.W0 F7 /r BEXTR r32a, r/m32, r32b 1672 * VEX.LZ.0F38.W1 F7 /r BEXTR r64a, r/m64, r64b 1673 * 1674 * Destination operand is ModRM:reg. Source operands are ModRM:r/m and 1675 * Vex.vvvv. 1676 * 1677 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored). 1678 */ 1679 if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT) 1680 size = 4; 1681 1682 /* 1683 * Extracts contiguous bits from the first /source/ operand (second 1684 * operand) using an index and length specified in the second /source/ 1685 * operand (third operand). 1686 */ 1687 error = vie_mmio_read(vie, vm, vcpuid, gpa, &src1, size); 1688 if (error) 1689 return (error); 1690 error = vm_get_register(vm, vcpuid, gpr_map[vie->vex_reg], &src2); 1691 if (error) 1692 return (error); 1693 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1694 if (error) 1695 return (error); 1696 1697 start = (src2 & 0xff); 1698 len = (src2 & 0xff00) >> 8; 1699 1700 /* If no bits are extracted, the destination register is cleared. */ 1701 dst = 0; 1702 1703 /* If START exceeds the operand size, no bits are extracted. */ 1704 if (start > size * 8) 1705 goto done; 1706 /* Length is bounded by both the destination size and start offset. */ 1707 if (start + len > size * 8) 1708 len = (size * 8) - start; 1709 if (len == 0) 1710 goto done; 1711 1712 if (start > 0) 1713 src1 = (src1 >> start); 1714 if (len < 64) 1715 src1 = src1 & ((1ull << len) - 1); 1716 dst = src1; 1717 1718 done: 1719 error = vie_update_register(vm, vcpuid, gpr_map[vie->reg], dst, size); 1720 if (error) 1721 return (error); 1722 1723 /* 1724 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result. 1725 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared. 
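	 * Setting only PSL_Z (when the result is zero) and clearing the
	 * remaining status bits below satisfies both definitions.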
1726 */ 1727 rflags &= ~RFLAGS_STATUS_BITS; 1728 if (dst == 0) 1729 rflags |= PSL_Z; 1730 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 1731 8); 1732 return (error); 1733 } 1734 1735 static int 1736 vie_emulate_add(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1737 { 1738 int error, size; 1739 uint64_t nval, rflags, rflags2, val1, val2; 1740 enum vm_reg_name reg; 1741 1742 size = vie->opsize; 1743 error = EINVAL; 1744 1745 switch (vie->op.op_byte) { 1746 case 0x03: 1747 /* 1748 * ADD r/m to r and store the result in r 1749 * 1750 * 03/r ADD r16, r/m16 1751 * 03/r ADD r32, r/m32 1752 * REX.W + 03/r ADD r64, r/m64 1753 */ 1754 1755 /* get the first operand */ 1756 reg = gpr_map[vie->reg]; 1757 error = vm_get_register(vm, vcpuid, reg, &val1); 1758 if (error) 1759 break; 1760 1761 /* get the second operand */ 1762 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); 1763 if (error) 1764 break; 1765 1766 /* perform the operation and write the result */ 1767 nval = val1 + val2; 1768 error = vie_update_register(vm, vcpuid, reg, nval, size); 1769 break; 1770 default: 1771 break; 1772 } 1773 1774 if (!error) { 1775 rflags2 = getaddflags(size, val1, val2); 1776 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 1777 &rflags); 1778 if (error) 1779 return (error); 1780 1781 rflags &= ~RFLAGS_STATUS_BITS; 1782 rflags |= rflags2 & RFLAGS_STATUS_BITS; 1783 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 1784 rflags, 8); 1785 } 1786 1787 return (error); 1788 } 1789 1790 static int 1791 vie_emulate_sub(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1792 { 1793 int error, size; 1794 uint64_t nval, rflags, rflags2, val1, val2; 1795 enum vm_reg_name reg; 1796 1797 size = vie->opsize; 1798 error = EINVAL; 1799 1800 switch (vie->op.op_byte) { 1801 case 0x2B: 1802 /* 1803 * SUB r/m from r and store the result in r 1804 * 1805 * 2B/r SUB r16, r/m16 1806 * 2B/r SUB r32, r/m32 1807 * REX.W + 2B/r SUB r64, r/m64 1808 */ 1809 1810 /* get the first operand */ 1811 reg = gpr_map[vie->reg]; 1812 error = vm_get_register(vm, vcpuid, reg, &val1); 1813 if (error) 1814 break; 1815 1816 /* get the second operand */ 1817 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); 1818 if (error) 1819 break; 1820 1821 /* perform the operation and write the result */ 1822 nval = val1 - val2; 1823 error = vie_update_register(vm, vcpuid, reg, nval, size); 1824 break; 1825 default: 1826 break; 1827 } 1828 1829 if (!error) { 1830 rflags2 = getcc(size, val1, val2); 1831 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 1832 &rflags); 1833 if (error) 1834 return (error); 1835 1836 rflags &= ~RFLAGS_STATUS_BITS; 1837 rflags |= rflags2 & RFLAGS_STATUS_BITS; 1838 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 1839 rflags, 8); 1840 } 1841 1842 return (error); 1843 } 1844 1845 static int 1846 vie_emulate_stack_op(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1847 { 1848 struct vm_copyinfo copyinfo[2]; 1849 struct seg_desc ss_desc; 1850 uint64_t cr0, rflags, rsp, stack_gla, val; 1851 int error, fault, size, stackaddrsize, pushop; 1852 struct vm_guest_paging *paging; 1853 1854 val = 0; 1855 size = vie->opsize; 1856 pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 
1 : 0; 1857 paging = &vie->paging; 1858 1859 /* 1860 * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 1861 */ 1862 if (paging->cpu_mode == CPU_MODE_REAL) { 1863 stackaddrsize = 2; 1864 } else if (paging->cpu_mode == CPU_MODE_64BIT) { 1865 /* 1866 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 1867 * - Stack pointer size is always 64-bits. 1868 * - PUSH/POP of 32-bit values is not possible in 64-bit mode. 1869 * - 16-bit PUSH/POP is supported by using the operand size 1870 * override prefix (66H). 1871 */ 1872 stackaddrsize = 8; 1873 size = vie->opsize_override ? 2 : 8; 1874 } else { 1875 /* 1876 * In protected or compatibility mode the 'B' flag in the 1877 * stack-segment descriptor determines the size of the 1878 * stack pointer. 1879 */ 1880 error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc); 1881 KASSERT(error == 0, ("%s: error %d getting SS descriptor", 1882 __func__, error)); 1883 if (SEG_DESC_DEF32(ss_desc.access)) 1884 stackaddrsize = 4; 1885 else 1886 stackaddrsize = 2; 1887 } 1888 1889 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); 1890 KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); 1891 1892 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1893 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 1894 1895 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); 1896 KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); 1897 if (pushop) { 1898 rsp -= size; 1899 } 1900 1901 if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, 1902 rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ, 1903 &stack_gla)) { 1904 vm_inject_ss(vm, vcpuid, 0); 1905 return (0); 1906 } 1907 1908 if (vie_canonical_check(paging->cpu_mode, stack_gla)) { 1909 vm_inject_ss(vm, vcpuid, 0); 1910 return (0); 1911 } 1912 1913 if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { 1914 vm_inject_ac(vm, vcpuid, 0); 1915 return (0); 1916 } 1917 1918 error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, 1919 pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo), 1920 &fault); 1921 if (error || fault) 1922 return (error); 1923 1924 if (pushop) { 1925 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size); 1926 if (error == 0) 1927 vm_copyout(vm, vcpuid, &val, copyinfo, size); 1928 } else { 1929 vm_copyin(vm, vcpuid, copyinfo, &val, size); 1930 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); 1931 rsp += size; 1932 } 1933 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 1934 1935 if (error == 0) { 1936 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp, 1937 stackaddrsize); 1938 KASSERT(error == 0, ("error %d updating rsp", error)); 1939 } 1940 return (error); 1941 } 1942 1943 static int 1944 vie_emulate_push(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1945 { 1946 int error; 1947 1948 /* 1949 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. 1950 * 1951 * PUSH is part of the group 5 extended opcodes and is identified 1952 * by ModRM:reg = b110. 1953 */ 1954 if ((vie->reg & 7) != 6) 1955 return (EINVAL); 1956 1957 error = vie_emulate_stack_op(vie, vm, vcpuid, gpa); 1958 return (error); 1959 } 1960 1961 static int 1962 vie_emulate_pop(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1963 { 1964 int error; 1965 1966 /* 1967 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. 1968 * 1969 * POP is part of the group 1A extended opcodes and is identified 1970 * by ModRM:reg = b000. 
1971 */ 1972 if ((vie->reg & 7) != 0) 1973 return (EINVAL); 1974 1975 error = vie_emulate_stack_op(vie, vm, vcpuid, gpa); 1976 return (error); 1977 } 1978 1979 static int 1980 vie_emulate_group1(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1981 { 1982 int error; 1983 1984 switch (vie->reg & 7) { 1985 case 0x1: /* OR */ 1986 error = vie_emulate_or(vie, vm, vcpuid, gpa); 1987 break; 1988 case 0x4: /* AND */ 1989 error = vie_emulate_and(vie, vm, vcpuid, gpa); 1990 break; 1991 case 0x7: /* CMP */ 1992 error = vie_emulate_cmp(vie, vm, vcpuid, gpa); 1993 break; 1994 default: 1995 error = EINVAL; 1996 break; 1997 } 1998 1999 return (error); 2000 } 2001 2002 static int 2003 vie_emulate_bittest(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 2004 { 2005 uint64_t val, rflags; 2006 int error, bitmask, bitoff; 2007 2008 /* 2009 * 0F BA is a Group 8 extended opcode. 2010 * 2011 * Currently we only emulate the 'Bit Test' instruction which is 2012 * identified by a ModR/M:reg encoding of 100b. 2013 */ 2014 if ((vie->reg & 7) != 4) 2015 return (EINVAL); 2016 2017 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 2018 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 2019 2020 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, vie->opsize); 2021 if (error) 2022 return (error); 2023 2024 /* 2025 * Intel SDM, Vol 2, Table 3-2: 2026 * "Range of Bit Positions Specified by Bit Offset Operands" 2027 */ 2028 bitmask = vie->opsize * 8 - 1; 2029 bitoff = vie->immediate & bitmask; 2030 2031 /* Copy the bit into the Carry flag in %rflags */ 2032 if (val & (1UL << bitoff)) 2033 rflags |= PSL_C; 2034 else 2035 rflags &= ~PSL_C; 2036 2037 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); 2038 KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error)); 2039 2040 return (0); 2041 } 2042 2043 static int 2044 vie_emulate_twob_group15(struct vie *vie, struct vm *vm, int vcpuid, 2045 uint64_t gpa) 2046 { 2047 int error; 2048 uint64_t buf; 2049 2050 switch (vie->reg & 7) { 2051 case 0x7: /* CLFLUSH, CLFLUSHOPT, and SFENCE */ 2052 if (vie->mod == 0x3) { 2053 /* 2054 * SFENCE. Ignore it, VM exit provides enough 2055 * barriers on its own. 2056 */ 2057 error = 0; 2058 } else { 2059 /* 2060 * CLFLUSH, CLFLUSHOPT. Only check for access 2061 * rights. 
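			 * The 1-byte read below performs that check; the
			 * value read is discarded and the flush is otherwise
			 * a no-op here.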
2062 */ 2063 error = vie_mmio_read(vie, vm, vcpuid, gpa, &buf, 1); 2064 } 2065 break; 2066 default: 2067 error = EINVAL; 2068 break; 2069 } 2070 2071 return (error); 2072 } 2073 2074 static int 2075 vie_emulate_clts(struct vie *vie, struct vm *vm, int vcpuid) 2076 { 2077 uint64_t val; 2078 int error; 2079 2080 if (vie->paging.cpl != 0) { 2081 vm_inject_gp(vm, vcpuid); 2082 vie->num_processed = 0; 2083 return (0); 2084 } 2085 2086 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &val); 2087 ASSERT(error == 0); 2088 2089 /* Clear %cr0.TS */ 2090 val &= ~CR0_TS; 2091 2092 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, val); 2093 ASSERT(error == 0); 2094 2095 return (0); 2096 } 2097 2098 static int 2099 vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa, 2100 uint64_t *rval, int bytes) 2101 { 2102 int err; 2103 2104 if (vie->mmio_req_read.state == VR_DONE) { 2105 ASSERT(vie->mmio_req_read.bytes == bytes); 2106 ASSERT(vie->mmio_req_read.gpa == gpa); 2107 2108 *rval = vie->mmio_req_read.data; 2109 return (0); 2110 } 2111 2112 err = vm_service_mmio_read(vm, cpuid, gpa, rval, bytes); 2113 if (err == 0) { 2114 /* 2115 * A successful read from an in-kernel-emulated device may come 2116 * with side effects, so stash the result in case it's used for 2117 * an instruction which subsequently needs to issue an MMIO 2118 * write to userspace. 2119 */ 2120 ASSERT(vie->mmio_req_read.state == VR_NONE); 2121 2122 vie->mmio_req_read.bytes = bytes; 2123 vie->mmio_req_read.gpa = gpa; 2124 vie->mmio_req_read.data = *rval; 2125 vie->mmio_req_read.state = VR_DONE; 2126 2127 } else if (err == ESRCH) { 2128 /* Hope that userspace emulation can fulfill this read */ 2129 vie->mmio_req_read.bytes = bytes; 2130 vie->mmio_req_read.gpa = gpa; 2131 vie->mmio_req_read.state = VR_PENDING; 2132 vie->status |= VIES_PENDING_MMIO; 2133 } else if (err < 0) { 2134 /* 2135 * The MMIO read failed in such a way that fallback to handling 2136 * in userspace is required. 2137 */ 2138 vie->status |= VIES_USER_FALLBACK; 2139 } 2140 return (err); 2141 } 2142 2143 static int 2144 vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa, 2145 uint64_t wval, int bytes) 2146 { 2147 int err; 2148 2149 if (vie->mmio_req_write.state == VR_DONE) { 2150 ASSERT(vie->mmio_req_write.bytes == bytes); 2151 ASSERT(vie->mmio_req_write.gpa == gpa); 2152 2153 return (0); 2154 } 2155 2156 err = vm_service_mmio_write(vm, cpuid, gpa, wval, bytes); 2157 if (err == 0) { 2158 /* 2159 * A successful write to an in-kernel-emulated device probably 2160 * results in side effects, so stash the fact that such a write 2161 * succeeded in case the operation requires other work. 2162 */ 2163 vie->mmio_req_write.bytes = bytes; 2164 vie->mmio_req_write.gpa = gpa; 2165 vie->mmio_req_write.data = wval; 2166 vie->mmio_req_write.state = VR_DONE; 2167 } else if (err == ESRCH) { 2168 /* Hope that userspace emulation can fulfill this write */ 2169 vie->mmio_req_write.bytes = bytes; 2170 vie->mmio_req_write.gpa = gpa; 2171 vie->mmio_req_write.data = wval; 2172 vie->mmio_req_write.state = VR_PENDING; 2173 vie->status |= VIES_PENDING_MMIO; 2174 } else if (err < 0) { 2175 /* 2176 * The MMIO write failed in such a way that fallback to handling 2177 * in userspace is required. 
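		 * (A negative return distinguishes this case from ESRCH,
		 * which merely defers the access to userspace emulation
		 * above.)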
2178 */ 2179 vie->status |= VIES_USER_FALLBACK; 2180 } 2181 return (err); 2182 } 2183 2184 int 2185 vie_emulate_mmio(struct vie *vie, struct vm *vm, int vcpuid) 2186 { 2187 int error; 2188 uint64_t gpa; 2189 2190 if ((vie->status & (VIES_INST_DECODE | VIES_MMIO)) != 2191 (VIES_INST_DECODE | VIES_MMIO)) { 2192 return (EINVAL); 2193 } 2194 2195 gpa = vie->mmio_gpa; 2196 2197 switch (vie->op.op_type) { 2198 case VIE_OP_TYPE_GROUP1: 2199 error = vie_emulate_group1(vie, vm, vcpuid, gpa); 2200 break; 2201 case VIE_OP_TYPE_POP: 2202 error = vie_emulate_pop(vie, vm, vcpuid, gpa); 2203 break; 2204 case VIE_OP_TYPE_PUSH: 2205 error = vie_emulate_push(vie, vm, vcpuid, gpa); 2206 break; 2207 case VIE_OP_TYPE_CMP: 2208 error = vie_emulate_cmp(vie, vm, vcpuid, gpa); 2209 break; 2210 case VIE_OP_TYPE_MOV: 2211 error = vie_emulate_mov(vie, vm, vcpuid, gpa); 2212 break; 2213 case VIE_OP_TYPE_MOVSX: 2214 case VIE_OP_TYPE_MOVZX: 2215 error = vie_emulate_movx(vie, vm, vcpuid, gpa); 2216 break; 2217 case VIE_OP_TYPE_MOVS: 2218 error = vie_emulate_movs(vie, vm, vcpuid, gpa); 2219 break; 2220 case VIE_OP_TYPE_STOS: 2221 error = vie_emulate_stos(vie, vm, vcpuid, gpa); 2222 break; 2223 case VIE_OP_TYPE_AND: 2224 error = vie_emulate_and(vie, vm, vcpuid, gpa); 2225 break; 2226 case VIE_OP_TYPE_OR: 2227 error = vie_emulate_or(vie, vm, vcpuid, gpa); 2228 break; 2229 case VIE_OP_TYPE_SUB: 2230 error = vie_emulate_sub(vie, vm, vcpuid, gpa); 2231 break; 2232 case VIE_OP_TYPE_BITTEST: 2233 error = vie_emulate_bittest(vie, vm, vcpuid, gpa); 2234 break; 2235 case VIE_OP_TYPE_TWOB_GRP15: 2236 error = vie_emulate_twob_group15(vie, vm, vcpuid, gpa); 2237 break; 2238 case VIE_OP_TYPE_ADD: 2239 error = vie_emulate_add(vie, vm, vcpuid, gpa); 2240 break; 2241 case VIE_OP_TYPE_TEST: 2242 error = vie_emulate_test(vie, vm, vcpuid, gpa); 2243 break; 2244 case VIE_OP_TYPE_BEXTR: 2245 error = vie_emulate_bextr(vie, vm, vcpuid, gpa); 2246 break; 2247 default: 2248 error = EINVAL; 2249 break; 2250 } 2251 2252 if (error == ESRCH) { 2253 /* Return to userspace with the mmio request */ 2254 return (-1); 2255 } 2256 2257 return (error); 2258 } 2259 2260 static int 2261 vie_emulate_inout_port(struct vie *vie, struct vm *vm, int vcpuid, 2262 uint32_t *eax) 2263 { 2264 uint32_t mask, val; 2265 bool in; 2266 int err; 2267 2268 mask = vie_size2mask(vie->inout.bytes); 2269 in = (vie->inout.flags & INOUT_IN) != 0; 2270 2271 if (!in) { 2272 val = *eax & mask; 2273 } 2274 2275 if (vie->inout_req_state != VR_DONE) { 2276 err = vm_ioport_access(vm, vcpuid, in, vie->inout.port, 2277 vie->inout.bytes, &val); 2278 val &= mask; 2279 } else { 2280 /* 2281 * This port access was handled in userspace and the result was 2282 * injected in to be handled now. 
2283 */ 2284 val = vie->inout_req_val & mask; 2285 vie->inout_req_state = VR_NONE; 2286 err = 0; 2287 } 2288 2289 if (err == ESRCH) { 2290 vie->status |= VIES_PENDING_INOUT; 2291 vie->inout_req_state = VR_PENDING; 2292 return (err); 2293 } else if (err != 0) { 2294 return (err); 2295 } 2296 2297 if (in) { 2298 *eax = (*eax & ~mask) | val; 2299 } 2300 return (0); 2301 } 2302 2303 static enum vm_reg_name 2304 vie_inout_segname(const struct vie *vie) 2305 { 2306 uint8_t segidx = vie->inout.segment; 2307 const enum vm_reg_name segmap[] = { 2308 VM_REG_GUEST_ES, 2309 VM_REG_GUEST_CS, 2310 VM_REG_GUEST_SS, 2311 VM_REG_GUEST_DS, 2312 VM_REG_GUEST_FS, 2313 VM_REG_GUEST_GS, 2314 }; 2315 const uint8_t maxidx = (sizeof (segmap) / sizeof (segmap[0])); 2316 2317 if (segidx >= maxidx) { 2318 panic("unexpected segment index %u", segidx); 2319 } 2320 return (segmap[segidx]); 2321 } 2322 2323 static int 2324 vie_emulate_inout_str(struct vie *vie, struct vm *vm, int vcpuid) 2325 { 2326 uint8_t bytes, addrsize; 2327 uint64_t index, count = 0, gla, rflags; 2328 int prot, err, fault; 2329 bool in, repeat; 2330 enum vm_reg_name seg_reg, idx_reg; 2331 struct vm_copyinfo copyinfo[2]; 2332 2333 in = (vie->inout.flags & INOUT_IN) != 0; 2334 bytes = vie->inout.bytes; 2335 addrsize = vie->inout.addrsize; 2336 prot = in ? PROT_WRITE : PROT_READ; 2337 2338 ASSERT(bytes == 1 || bytes == 2 || bytes == 4); 2339 ASSERT(addrsize == 2 || addrsize == 4 || addrsize == 8); 2340 2341 idx_reg = (in) ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; 2342 seg_reg = vie_inout_segname(vie); 2343 err = vm_get_register(vm, vcpuid, idx_reg, &index); 2344 ASSERT(err == 0); 2345 index = index & vie_size2mask(addrsize); 2346 2347 repeat = (vie->inout.flags & INOUT_REP) != 0; 2348 2349 /* Count register */ 2350 if (repeat) { 2351 err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &count); 2352 count &= vie_size2mask(addrsize); 2353 2354 if (count == 0) { 2355 /* 2356 * If we were asked to emulate a REP INS/OUTS when the 2357 * count register is zero, no further work is required. 2358 */ 2359 return (0); 2360 } 2361 } else { 2362 count = 1; 2363 } 2364 2365 gla = 0; 2366 if (vie_get_gla(vie, vm, vcpuid, bytes, addrsize, prot, seg_reg, 2367 idx_reg, &gla) != 0) { 2368 /* vie_get_gla() already injected the appropriate fault */ 2369 return (0); 2370 } 2371 2372 /* 2373 * The INS/OUTS emulate currently assumes that the memory target resides 2374 * within the guest system memory, rather than a device MMIO region. If 2375 * such a case becomes a necessity, that additional handling could be 2376 * put in place. 
2377 */ 2378 err = vm_copy_setup(vm, vcpuid, &vie->paging, gla, bytes, prot, 2379 copyinfo, nitems(copyinfo), &fault); 2380 2381 if (err) { 2382 /* Unrecoverable error */ 2383 return (err); 2384 } else if (fault) { 2385 /* Resume guest to handle fault */ 2386 return (0); 2387 } 2388 2389 if (!in) { 2390 vm_copyin(vm, vcpuid, copyinfo, &vie->inout.eax, bytes); 2391 } 2392 2393 err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax); 2394 2395 if (err == 0 && in) { 2396 vm_copyout(vm, vcpuid, &vie->inout.eax, copyinfo, bytes); 2397 } 2398 2399 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 2400 2401 if (err == 0) { 2402 err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2403 &rflags); 2404 ASSERT(err == 0); 2405 2406 /* Update index */ 2407 if (rflags & PSL_D) { 2408 index -= bytes; 2409 } else { 2410 index += bytes; 2411 } 2412 2413 /* Update index register */ 2414 err = vie_update_register(vm, vcpuid, idx_reg, index, addrsize); 2415 ASSERT(err == 0); 2416 2417 /* 2418 * Update count register only if the instruction had a repeat 2419 * prefix. 2420 */ 2421 if ((vie->inout.flags & INOUT_REP) != 0) { 2422 count--; 2423 err = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, 2424 count, addrsize); 2425 ASSERT(err == 0); 2426 2427 if (count != 0) { 2428 return (vie_repeat(vie)); 2429 } 2430 } 2431 } 2432 2433 return (err); 2434 } 2435 2436 int 2437 vie_emulate_inout(struct vie *vie, struct vm *vm, int vcpuid) 2438 { 2439 int err = 0; 2440 2441 if ((vie->status & VIES_INOUT) == 0) { 2442 return (EINVAL); 2443 } 2444 2445 if ((vie->inout.flags & INOUT_STR) == 0) { 2446 /* 2447 * For now, using the 'rep' prefixes with plain (non-string) 2448 * in/out is not supported. 2449 */ 2450 if ((vie->inout.flags & INOUT_REP) != 0) { 2451 return (EINVAL); 2452 } 2453 2454 err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax); 2455 if (err == 0 && (vie->inout.flags & INOUT_IN) != 0) { 2456 /* 2457 * With the inX access now a success, the result needs 2458 * to be stored in the guest %rax. 2459 */ 2460 err = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, 2461 vie->inout.eax); 2462 VERIFY0(err); 2463 } 2464 } else { 2465 vie->status &= ~VIES_REPEAT; 2466 err = vie_emulate_inout_str(vie, vm, vcpuid); 2467 2468 } 2469 if (err < 0) { 2470 /* 2471 * Access to an I/O port failed in such a way that fallback to 2472 * handling in userspace is required. 
2473 */ 2474 vie->status |= VIES_USER_FALLBACK; 2475 } else if (err == ESRCH) { 2476 ASSERT(vie->status & VIES_PENDING_INOUT); 2477 /* Return to userspace with the in/out request */ 2478 err = -1; 2479 } 2480 2481 return (err); 2482 } 2483 2484 int 2485 vie_emulate_other(struct vie *vie, struct vm *vm, int vcpuid) 2486 { 2487 int error; 2488 2489 if ((vie->status & (VIES_INST_DECODE | VIES_OTHER)) != 2490 (VIES_INST_DECODE | VIES_OTHER)) { 2491 return (EINVAL); 2492 } 2493 2494 switch (vie->op.op_type) { 2495 case VIE_OP_TYPE_CLTS: 2496 error = vie_emulate_clts(vie, vm, vcpuid); 2497 break; 2498 case VIE_OP_TYPE_MOV_CR: 2499 error = vie_emulate_mov_cr(vie, vm, vcpuid); 2500 break; 2501 default: 2502 error = EINVAL; 2503 break; 2504 } 2505 2506 return (error); 2507 } 2508 2509 void 2510 vie_reset(struct vie *vie) 2511 { 2512 vie->status = 0; 2513 vie->num_processed = vie->num_valid = 0; 2514 } 2515 2516 void 2517 vie_advance_pc(struct vie *vie, uint64_t *nextrip) 2518 { 2519 VERIFY((vie->status & VIES_REPEAT) == 0); 2520 2521 *nextrip += vie->num_processed; 2522 vie_reset(vie); 2523 } 2524 2525 void 2526 vie_exitinfo(const struct vie *vie, struct vm_exit *vme) 2527 { 2528 if (vie->status & VIES_USER_FALLBACK) { 2529 /* 2530 * Despite the fact that the instruction was successfully 2531 * decoded, some aspect of the emulation failed in such a way 2532 * that it is left up to userspace to complete the operation. 2533 */ 2534 vie_fallback_exitinfo(vie, vme); 2535 } else if (vie->status & VIES_MMIO) { 2536 vme->exitcode = VM_EXITCODE_MMIO; 2537 if (vie->mmio_req_read.state == VR_PENDING) { 2538 vme->u.mmio.gpa = vie->mmio_req_read.gpa; 2539 vme->u.mmio.data = 0; 2540 vme->u.mmio.bytes = vie->mmio_req_read.bytes; 2541 vme->u.mmio.read = 1; 2542 } else if (vie->mmio_req_write.state == VR_PENDING) { 2543 vme->u.mmio.gpa = vie->mmio_req_write.gpa; 2544 vme->u.mmio.data = vie->mmio_req_write.data & 2545 vie_size2mask(vie->mmio_req_write.bytes); 2546 vme->u.mmio.bytes = vie->mmio_req_write.bytes; 2547 vme->u.mmio.read = 0; 2548 } else { 2549 panic("bad pending MMIO state"); 2550 } 2551 } else if (vie->status & VIES_INOUT) { 2552 vme->exitcode = VM_EXITCODE_INOUT; 2553 vme->u.inout.port = vie->inout.port; 2554 vme->u.inout.bytes = vie->inout.bytes; 2555 if ((vie->inout.flags & INOUT_IN) != 0) { 2556 vme->u.inout.flags = INOUT_IN; 2557 vme->u.inout.eax = 0; 2558 } else { 2559 vme->u.inout.flags = 0; 2560 vme->u.inout.eax = vie->inout.eax & 2561 vie_size2mask(vie->inout.bytes); 2562 } 2563 } else { 2564 panic("no pending operation"); 2565 } 2566 } 2567 2568 /* 2569 * In the case of a decoding or verification failure, bailing out to userspace 2570 * to do the instruction emulation is our only option for now. 
2571 */ 2572 void 2573 vie_fallback_exitinfo(const struct vie *vie, struct vm_exit *vme) 2574 { 2575 if ((vie->status & VIES_INST_FETCH) == 0) { 2576 bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul)); 2577 } else { 2578 ASSERT(sizeof (vie->inst) == sizeof (vme->u.inst_emul.inst)); 2579 2580 bcopy(vie->inst, vme->u.inst_emul.inst, sizeof (vie->inst)); 2581 vme->u.inst_emul.num_valid = vie->num_valid; 2582 } 2583 vme->exitcode = VM_EXITCODE_INST_EMUL; 2584 } 2585 2586 void 2587 vie_cs_info(const struct vie *vie, struct vm *vm, int vcpuid, uint64_t *cs_base, 2588 int *cs_d) 2589 { 2590 struct seg_desc cs_desc; 2591 int error; 2592 2593 error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &cs_desc); 2594 ASSERT(error == 0); 2595 2596 /* Initialization required for the paging info to be populated */ 2597 VERIFY(vie->status & VIES_INIT); 2598 switch (vie->paging.cpu_mode) { 2599 case CPU_MODE_REAL: 2600 *cs_base = cs_desc.base; 2601 *cs_d = 0; 2602 break; 2603 case CPU_MODE_PROTECTED: 2604 case CPU_MODE_COMPATIBILITY: 2605 *cs_base = cs_desc.base; 2606 *cs_d = SEG_DESC_DEF32(cs_desc.access) ? 1 : 0; 2607 break; 2608 default: 2609 *cs_base = 0; 2610 *cs_d = 0; 2611 break; 2612 } 2613 } 2614 2615 bool 2616 vie_pending(const struct vie *vie) 2617 { 2618 /* 2619 * These VIE status bits indicate conditions which must be addressed 2620 * through either device IO fulfillment (with corresponding 2621 * vie_fulfill_*()) or complete userspace emulation (followed by a 2622 * vie_reset()). 2623 */ 2624 const enum vie_status of_interest = 2625 VIES_PENDING_MMIO | VIES_PENDING_INOUT | VIES_USER_FALLBACK; 2626 2627 return ((vie->status & of_interest) != 0); 2628 } 2629 2630 bool 2631 vie_needs_fetch(const struct vie *vie) 2632 { 2633 if (vie->status & VIES_INST_FETCH) { 2634 ASSERT(vie->num_valid != 0); 2635 return (false); 2636 } 2637 return (true); 2638 } 2639 2640 static int 2641 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) 2642 { 2643 KASSERT(size == 1 || size == 2 || size == 4 || size == 8, 2644 ("%s: invalid size %d", __func__, size)); 2645 KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); 2646 2647 if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) 2648 return (0); 2649 2650 return ((gla & (size - 1)) ? 1 : 0); 2651 } 2652 2653 static int 2654 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) 2655 { 2656 uint64_t mask; 2657 2658 if (cpu_mode != CPU_MODE_64BIT) 2659 return (0); 2660 2661 /* 2662 * The value of the bit 47 in the 'gla' should be replicated in the 2663 * most significant 16 bits. 
2664 */ 2665 mask = ~((1UL << 48) - 1); 2666 if (gla & (1UL << 47)) 2667 return ((gla & mask) != mask); 2668 else 2669 return ((gla & mask) != 0); 2670 } 2671 2672 static uint64_t 2673 vie_size2mask(int size) 2674 { 2675 KASSERT(size == 1 || size == 2 || size == 4 || size == 8, 2676 ("vie_size2mask: invalid size %d", size)); 2677 return (size2mask[size]); 2678 } 2679 2680 static int 2681 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, 2682 struct seg_desc *desc, uint64_t offset, int length, int addrsize, 2683 int prot, uint64_t *gla) 2684 { 2685 uint64_t firstoff, low_limit, high_limit, segbase; 2686 int glasize, type; 2687 2688 KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, 2689 ("%s: invalid segment %d", __func__, seg)); 2690 KASSERT(length == 1 || length == 2 || length == 4 || length == 8, 2691 ("%s: invalid operand size %d", __func__, length)); 2692 KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, 2693 ("%s: invalid prot %x", __func__, prot)); 2694 2695 firstoff = offset; 2696 if (cpu_mode == CPU_MODE_64BIT) { 2697 KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " 2698 "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); 2699 glasize = 8; 2700 } else { 2701 KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " 2702 "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); 2703 glasize = 4; 2704 /* 2705 * If the segment selector is loaded with a NULL selector 2706 * then the descriptor is unusable and attempting to use 2707 * it results in a #GP(0). 2708 */ 2709 if (SEG_DESC_UNUSABLE(desc->access)) 2710 return (-1); 2711 2712 /* 2713 * The processor generates a #NP exception when a segment 2714 * register is loaded with a selector that points to a 2715 * descriptor that is not present. If this was the case then 2716 * it would have been checked before the VM-exit. 2717 */ 2718 KASSERT(SEG_DESC_PRESENT(desc->access), 2719 ("segment %d not present: %x", seg, desc->access)); 2720 2721 /* 2722 * The descriptor type must indicate a code/data segment. 2723 */ 2724 type = SEG_DESC_TYPE(desc->access); 2725 KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " 2726 "descriptor type %x", seg, type)); 2727 2728 if (prot & PROT_READ) { 2729 /* #GP on a read access to a exec-only code segment */ 2730 if ((type & 0xA) == 0x8) 2731 return (-1); 2732 } 2733 2734 if (prot & PROT_WRITE) { 2735 /* 2736 * #GP on a write access to a code segment or a 2737 * read-only data segment. 2738 */ 2739 if (type & 0x8) /* code segment */ 2740 return (-1); 2741 2742 if ((type & 0xA) == 0) /* read-only data seg */ 2743 return (-1); 2744 } 2745 2746 /* 2747 * 'desc->limit' is fully expanded taking granularity into 2748 * account. 2749 */ 2750 if ((type & 0xC) == 0x4) { 2751 /* expand-down data segment */ 2752 low_limit = desc->limit + 1; 2753 high_limit = SEG_DESC_DEF32(desc->access) ? 2754 0xffffffff : 0xffff; 2755 } else { 2756 /* code segment or expand-up data segment */ 2757 low_limit = 0; 2758 high_limit = desc->limit; 2759 } 2760 2761 while (length > 0) { 2762 offset &= vie_size2mask(addrsize); 2763 if (offset < low_limit || offset > high_limit) 2764 return (-1); 2765 offset++; 2766 length--; 2767 } 2768 } 2769 2770 /* 2771 * In 64-bit mode all segments except %fs and %gs have a segment 2772 * base address of 0. 
2773 */ 2774 if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && 2775 seg != VM_REG_GUEST_GS) { 2776 segbase = 0; 2777 } else { 2778 segbase = desc->base; 2779 } 2780 2781 /* 2782 * Truncate 'firstoff' to the effective address size before adding 2783 * it to the segment base. 2784 */ 2785 firstoff &= vie_size2mask(addrsize); 2786 *gla = (segbase + firstoff) & vie_size2mask(glasize); 2787 return (0); 2788 } 2789 2790 void 2791 vie_init_mmio(struct vie *vie, const char *inst_bytes, uint8_t inst_length, 2792 const struct vm_guest_paging *paging, uint64_t gpa) 2793 { 2794 KASSERT(inst_length <= VIE_INST_SIZE, 2795 ("%s: invalid instruction length (%d)", __func__, inst_length)); 2796 2797 bzero(vie, sizeof (struct vie)); 2798 2799 vie->base_register = VM_REG_LAST; 2800 vie->index_register = VM_REG_LAST; 2801 vie->segment_register = VM_REG_LAST; 2802 vie->status = VIES_INIT | VIES_MMIO; 2803 2804 if (inst_length != 0) { 2805 bcopy(inst_bytes, vie->inst, inst_length); 2806 vie->num_valid = inst_length; 2807 vie->status |= VIES_INST_FETCH; 2808 } 2809 2810 vie->paging = *paging; 2811 vie->mmio_gpa = gpa; 2812 } 2813 2814 void 2815 vie_init_inout(struct vie *vie, const struct vm_inout *inout, uint8_t inst_len, 2816 const struct vm_guest_paging *paging) 2817 { 2818 bzero(vie, sizeof (struct vie)); 2819 2820 vie->status = VIES_INIT | VIES_INOUT; 2821 2822 vie->inout = *inout; 2823 vie->paging = *paging; 2824 2825 /* 2826 * Since VMX/SVM assists already decoded the nature of the in/out 2827 * instruction, let the status reflect that. 2828 */ 2829 vie->status |= VIES_INST_FETCH | VIES_INST_DECODE; 2830 vie->num_processed = inst_len; 2831 } 2832 2833 void 2834 vie_init_other(struct vie *vie, const struct vm_guest_paging *paging) 2835 { 2836 bzero(vie, sizeof (struct vie)); 2837 2838 vie->base_register = VM_REG_LAST; 2839 vie->index_register = VM_REG_LAST; 2840 vie->segment_register = VM_REG_LAST; 2841 vie->status = VIES_INIT | VIES_OTHER; 2842 2843 vie->paging = *paging; 2844 } 2845 2846 int 2847 vie_fulfill_mmio(struct vie *vie, const struct vm_mmio *result) 2848 { 2849 struct vie_mmio *pending; 2850 2851 if ((vie->status & VIES_MMIO) == 0 || 2852 (vie->status & VIES_PENDING_MMIO) == 0) { 2853 return (EINVAL); 2854 } 2855 2856 if (result->read) { 2857 pending = &vie->mmio_req_read; 2858 } else { 2859 pending = &vie->mmio_req_write; 2860 } 2861 2862 if (pending->state != VR_PENDING || 2863 pending->bytes != result->bytes || pending->gpa != result->gpa) { 2864 return (EINVAL); 2865 } 2866 2867 if (result->read) { 2868 pending->data = result->data & vie_size2mask(pending->bytes); 2869 } 2870 pending->state = VR_DONE; 2871 vie->status &= ~VIES_PENDING_MMIO; 2872 2873 return (0); 2874 } 2875 2876 int 2877 vie_fulfill_inout(struct vie *vie, const struct vm_inout *result) 2878 { 2879 if ((vie->status & VIES_INOUT) == 0 || 2880 (vie->status & VIES_PENDING_INOUT) == 0) { 2881 return (EINVAL); 2882 } 2883 if ((vie->inout.flags & INOUT_IN) != (result->flags & INOUT_IN) || 2884 vie->inout.bytes != result->bytes || 2885 vie->inout.port != result->port) { 2886 return (EINVAL); 2887 } 2888 2889 if (result->flags & INOUT_IN) { 2890 vie->inout_req_val = result->eax & 2891 vie_size2mask(vie->inout.bytes); 2892 } 2893 vie->inout_req_state = VR_DONE; 2894 vie->status &= ~(VIES_PENDING_INOUT); 2895 2896 return (0); 2897 } 2898 2899 uint64_t 2900 vie_mmio_gpa(const struct vie *vie) 2901 { 2902 return (vie->mmio_gpa); 2903 } 2904 2905 static int 2906 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) 2907 
{ 2908 int error_code = 0; 2909 2910 if (pte & PG_V) 2911 error_code |= PGEX_P; 2912 if (prot & PROT_WRITE) 2913 error_code |= PGEX_W; 2914 if (usermode) 2915 error_code |= PGEX_U; 2916 if (rsvd) 2917 error_code |= PGEX_RSV; 2918 if (prot & PROT_EXEC) 2919 error_code |= PGEX_I; 2920 2921 return (error_code); 2922 } 2923 2924 static void 2925 ptp_release(vm_page_t **vmp) 2926 { 2927 if (*vmp != NULL) { 2928 (void) vmp_release(*vmp); 2929 *vmp = NULL; 2930 } 2931 } 2932 2933 static void * 2934 ptp_hold(struct vm *vm, int vcpu, uintptr_t gpa, size_t len, vm_page_t **vmp) 2935 { 2936 vm_client_t *vmc = vm_get_vmclient(vm, vcpu); 2937 const uintptr_t hold_gpa = gpa & PAGEMASK; 2938 2939 /* Hold must not cross a page boundary */ 2940 VERIFY3U(gpa + len, <=, hold_gpa + PAGESIZE); 2941 2942 if (*vmp != NULL) { 2943 (void) vmp_release(*vmp); 2944 } 2945 2946 *vmp = vmc_hold(vmc, hold_gpa, PROT_READ | PROT_WRITE); 2947 if (*vmp == NULL) { 2948 return (NULL); 2949 } 2950 2951 return ((caddr_t)vmp_get_writable(*vmp) + (gpa - hold_gpa)); 2952 } 2953 2954 static int 2955 _vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 2956 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only) 2957 { 2958 int nlevels, pfcode; 2959 int ptpshift = 0, ptpindex = 0; 2960 uint64_t ptpphys; 2961 uint64_t *ptpbase = NULL, pte = 0, pgsize = 0; 2962 vm_page_t *cookie = NULL; 2963 const bool usermode = paging->cpl == 3; 2964 const bool writable = (prot & PROT_WRITE) != 0; 2965 2966 *guest_fault = 0; 2967 restart: 2968 ptpphys = paging->cr3; /* root of the page tables */ 2969 ptp_release(&cookie); 2970 2971 if (vie_canonical_check(paging->cpu_mode, gla)) { 2972 /* 2973 * XXX assuming a non-stack reference otherwise a stack fault 2974 * should be generated. 2975 */ 2976 if (!check_only) 2977 vm_inject_gp(vm, vcpuid); 2978 *guest_fault = 1; 2979 return (0); 2980 } 2981 2982 if (paging->paging_mode == PAGING_MODE_FLAT) { 2983 *gpa = gla; 2984 return (0); 2985 } 2986 2987 if (paging->paging_mode == PAGING_MODE_32) { 2988 uint32_t *ptpbase32, pte32; 2989 2990 nlevels = 2; 2991 while (--nlevels >= 0) { 2992 /* Zero out the lower 12 bits. */ 2993 ptpphys &= ~0xfff; 2994 2995 ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, 2996 &cookie); 2997 2998 if (ptpbase32 == NULL) { 2999 return (EFAULT); 3000 } 3001 3002 ptpshift = PAGE_SHIFT + nlevels * 10; 3003 ptpindex = (gla >> ptpshift) & 0x3FF; 3004 pgsize = 1UL << ptpshift; 3005 3006 pte32 = ptpbase32[ptpindex]; 3007 3008 if ((pte32 & PG_V) == 0 || 3009 (usermode && (pte32 & PG_U) == 0) || 3010 (writable && (pte32 & PG_RW) == 0)) { 3011 if (!check_only) { 3012 pfcode = pf_error_code(usermode, prot, 3013 0, pte32); 3014 vm_inject_pf(vm, vcpuid, pfcode, gla); 3015 } 3016 3017 ptp_release(&cookie); 3018 *guest_fault = 1; 3019 return (0); 3020 } 3021 3022 /* 3023 * Emulate the x86 MMU's management of the accessed 3024 * and dirty flags. While the accessed flag is set 3025 * at every level of the page table, the dirty flag 3026 * is only set at the last level providing the guest 3027 * physical address. 
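 *
 * For this two-level 32-bit walk that means the first iteration
 * (nlevels == 1) indexes the page directory with bits 31:22 of the
 * linear address (4 MB pages) and the second (nlevels == 0) indexes
 * the page table with bits 21:12 (4 KB pages).  If atomic_cmpset_32()
 * loses a race while setting PG_A (the entry changed underneath us),
 * the walk is restarted from %cr3.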
3028 */ 3029 if (!check_only && (pte32 & PG_A) == 0) { 3030 if (atomic_cmpset_32(&ptpbase32[ptpindex], 3031 pte32, pte32 | PG_A) == 0) { 3032 goto restart; 3033 } 3034 } 3035 3036 /* XXX must be ignored if CR4.PSE=0 */ 3037 if (nlevels > 0 && (pte32 & PG_PS) != 0) 3038 break; 3039 3040 ptpphys = pte32; 3041 } 3042 3043 /* Set the dirty bit in the page table entry if necessary */ 3044 if (!check_only && writable && (pte32 & PG_M) == 0) { 3045 if (atomic_cmpset_32(&ptpbase32[ptpindex], 3046 pte32, pte32 | PG_M) == 0) { 3047 goto restart; 3048 } 3049 } 3050 3051 /* Zero out the lower 'ptpshift' bits */ 3052 pte32 >>= ptpshift; pte32 <<= ptpshift; 3053 *gpa = pte32 | (gla & (pgsize - 1)); 3054 ptp_release(&cookie); 3055 return (0); 3056 } 3057 3058 if (paging->paging_mode == PAGING_MODE_PAE) { 3059 /* Zero out the lower 5 bits and the upper 32 bits */ 3060 ptpphys &= 0xffffffe0UL; 3061 3062 ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof (*ptpbase) * 4, 3063 &cookie); 3064 if (ptpbase == NULL) { 3065 return (EFAULT); 3066 } 3067 3068 ptpindex = (gla >> 30) & 0x3; 3069 3070 pte = ptpbase[ptpindex]; 3071 3072 if ((pte & PG_V) == 0) { 3073 if (!check_only) { 3074 pfcode = pf_error_code(usermode, prot, 0, pte); 3075 vm_inject_pf(vm, vcpuid, pfcode, gla); 3076 } 3077 3078 ptp_release(&cookie); 3079 *guest_fault = 1; 3080 return (0); 3081 } 3082 3083 ptpphys = pte; 3084 3085 nlevels = 2; 3086 } else { 3087 nlevels = 4; 3088 } 3089 3090 while (--nlevels >= 0) { 3091 /* Zero out the lower 12 bits and the upper 12 bits */ 3092 ptpphys &= 0x000ffffffffff000UL; 3093 3094 ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie); 3095 if (ptpbase == NULL) { 3096 return (EFAULT); 3097 } 3098 3099 ptpshift = PAGE_SHIFT + nlevels * 9; 3100 ptpindex = (gla >> ptpshift) & 0x1FF; 3101 pgsize = 1UL << ptpshift; 3102 3103 pte = ptpbase[ptpindex]; 3104 3105 if ((pte & PG_V) == 0 || 3106 (usermode && (pte & PG_U) == 0) || 3107 (writable && (pte & PG_RW) == 0)) { 3108 if (!check_only) { 3109 pfcode = pf_error_code(usermode, prot, 0, pte); 3110 vm_inject_pf(vm, vcpuid, pfcode, gla); 3111 } 3112 3113 ptp_release(&cookie); 3114 *guest_fault = 1; 3115 return (0); 3116 } 3117 3118 /* Set the accessed bit in the page table entry */ 3119 if (!check_only && (pte & PG_A) == 0) { 3120 if (atomic_cmpset_64(&ptpbase[ptpindex], 3121 pte, pte | PG_A) == 0) { 3122 goto restart; 3123 } 3124 } 3125 3126 if (nlevels > 0 && (pte & PG_PS) != 0) { 3127 if (pgsize > 1 * GB) { 3128 if (!check_only) { 3129 pfcode = pf_error_code(usermode, prot, 3130 1, pte); 3131 vm_inject_pf(vm, vcpuid, pfcode, gla); 3132 } 3133 3134 ptp_release(&cookie); 3135 *guest_fault = 1; 3136 return (0); 3137 } 3138 break; 3139 } 3140 3141 ptpphys = pte; 3142 } 3143 3144 /* Set the dirty bit in the page table entry if necessary */ 3145 if (!check_only && writable && (pte & PG_M) == 0) { 3146 if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) 3147 goto restart; 3148 } 3149 ptp_release(&cookie); 3150 3151 /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ 3152 pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; 3153 *gpa = pte | (gla & (pgsize - 1)); 3154 return (0); 3155 } 3156 3157 int 3158 vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3159 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) 3160 { 3161 3162 return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, 3163 false)); 3164 } 3165 3166 int 3167 vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3168 uint64_t 
gla, int prot, uint64_t *gpa, int *guest_fault) 3169 { 3170 3171 return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, 3172 true)); 3173 } 3174 3175 int 3176 vie_fetch_instruction(struct vie *vie, struct vm *vm, int vcpuid, uint64_t rip, 3177 int *faultptr) 3178 { 3179 struct vm_copyinfo copyinfo[2]; 3180 int error, prot; 3181 3182 if ((vie->status & VIES_INIT) == 0) { 3183 return (EINVAL); 3184 } 3185 3186 prot = PROT_READ | PROT_EXEC; 3187 error = vm_copy_setup(vm, vcpuid, &vie->paging, rip, VIE_INST_SIZE, 3188 prot, copyinfo, nitems(copyinfo), faultptr); 3189 if (error || *faultptr) 3190 return (error); 3191 3192 vm_copyin(vm, vcpuid, copyinfo, vie->inst, VIE_INST_SIZE); 3193 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 3194 vie->num_valid = VIE_INST_SIZE; 3195 vie->status |= VIES_INST_FETCH; 3196 return (0); 3197 } 3198 3199 static int 3200 vie_peek(struct vie *vie, uint8_t *x) 3201 { 3202 3203 if (vie->num_processed < vie->num_valid) { 3204 *x = vie->inst[vie->num_processed]; 3205 return (0); 3206 } else 3207 return (-1); 3208 } 3209 3210 static void 3211 vie_advance(struct vie *vie) 3212 { 3213 3214 vie->num_processed++; 3215 } 3216 3217 static bool 3218 segment_override(uint8_t x, int *seg) 3219 { 3220 3221 switch (x) { 3222 case 0x2E: 3223 *seg = VM_REG_GUEST_CS; 3224 break; 3225 case 0x36: 3226 *seg = VM_REG_GUEST_SS; 3227 break; 3228 case 0x3E: 3229 *seg = VM_REG_GUEST_DS; 3230 break; 3231 case 0x26: 3232 *seg = VM_REG_GUEST_ES; 3233 break; 3234 case 0x64: 3235 *seg = VM_REG_GUEST_FS; 3236 break; 3237 case 0x65: 3238 *seg = VM_REG_GUEST_GS; 3239 break; 3240 default: 3241 return (false); 3242 } 3243 return (true); 3244 } 3245 3246 static int 3247 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) 3248 { 3249 uint8_t x; 3250 3251 while (1) { 3252 if (vie_peek(vie, &x)) 3253 return (-1); 3254 3255 if (x == 0x66) 3256 vie->opsize_override = 1; 3257 else if (x == 0x67) 3258 vie->addrsize_override = 1; 3259 else if (x == 0xF3) 3260 vie->repz_present = 1; 3261 else if (x == 0xF2) 3262 vie->repnz_present = 1; 3263 else if (segment_override(x, &vie->segment_register)) 3264 vie->segment_override = 1; 3265 else 3266 break; 3267 3268 vie_advance(vie); 3269 } 3270 3271 /* 3272 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: 3273 * - Only one REX prefix is allowed per instruction. 3274 * - The REX prefix must immediately precede the opcode byte or the 3275 * escape opcode byte. 3276 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) 3277 * the mandatory prefix must come before the REX prefix. 3278 */ 3279 if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { 3280 vie->rex_present = 1; 3281 vie->rex_w = x & 0x8 ? 1 : 0; 3282 vie->rex_r = x & 0x4 ? 1 : 0; 3283 vie->rex_x = x & 0x2 ? 1 : 0; 3284 vie->rex_b = x & 0x1 ? 1 : 0; 3285 vie_advance(vie); 3286 } 3287 3288 /* 3289 * § 2.3.5, "The VEX Prefix", SDM Vol 2. 3290 */ 3291 if ((cpu_mode == CPU_MODE_64BIT || 3292 cpu_mode == CPU_MODE_COMPATIBILITY) && x == 0xC4) { 3293 const struct vie_op *optab; 3294 3295 /* 3-byte VEX prefix. */ 3296 vie->vex_present = 1; 3297 3298 vie_advance(vie); 3299 if (vie_peek(vie, &x)) 3300 return (-1); 3301 3302 /* 3303 * 2nd byte: [R', X', B', mmmmm[4:0]]. Bits are inverted 3304 * relative to REX encoding. 3305 */ 3306 vie->rex_r = x & 0x80 ? 0 : 1; 3307 vie->rex_x = x & 0x40 ? 0 : 1; 3308 vie->rex_b = x & 0x20 ? 0 : 1; 3309 3310 switch (x & 0x1F) { 3311 case 0x2: 3312 /* 0F 38. 
*/ 3313 optab = three_byte_opcodes_0f38; 3314 break; 3315 case 0x1: 3316 /* 0F class - nothing handled here yet. */ 3317 /* FALLTHROUGH */ 3318 case 0x3: 3319 /* 0F 3A class - nothing handled here yet. */ 3320 /* FALLTHROUGH */ 3321 default: 3322 /* Reserved (#UD). */ 3323 return (-1); 3324 } 3325 3326 vie_advance(vie); 3327 if (vie_peek(vie, &x)) 3328 return (-1); 3329 3330 /* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */ 3331 vie->rex_w = x & 0x80 ? 1 : 0; 3332 3333 vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3); 3334 vie->vex_l = !!(x & 0x4); 3335 vie->vex_pp = (x & 0x3); 3336 3337 /* PP: 1=66 2=F3 3=F2 prefixes. */ 3338 switch (vie->vex_pp) { 3339 case 0x1: 3340 vie->opsize_override = 1; 3341 break; 3342 case 0x2: 3343 vie->repz_present = 1; 3344 break; 3345 case 0x3: 3346 vie->repnz_present = 1; 3347 break; 3348 } 3349 3350 vie_advance(vie); 3351 3352 /* Opcode, sans literal prefix prefix. */ 3353 if (vie_peek(vie, &x)) 3354 return (-1); 3355 3356 vie->op = optab[x]; 3357 if (vie->op.op_type == VIE_OP_TYPE_NONE) 3358 return (-1); 3359 3360 vie_advance(vie); 3361 } 3362 3363 /* 3364 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 3365 */ 3366 if (cpu_mode == CPU_MODE_64BIT) { 3367 /* 3368 * Default address size is 64-bits and default operand size 3369 * is 32-bits. 3370 */ 3371 vie->addrsize = vie->addrsize_override ? 4 : 8; 3372 if (vie->rex_w) 3373 vie->opsize = 8; 3374 else if (vie->opsize_override) 3375 vie->opsize = 2; 3376 else 3377 vie->opsize = 4; 3378 } else if (cs_d) { 3379 /* Default address and operand sizes are 32-bits */ 3380 vie->addrsize = vie->addrsize_override ? 2 : 4; 3381 vie->opsize = vie->opsize_override ? 2 : 4; 3382 } else { 3383 /* Default address and operand sizes are 16-bits */ 3384 vie->addrsize = vie->addrsize_override ? 4 : 2; 3385 vie->opsize = vie->opsize_override ? 4 : 2; 3386 } 3387 return (0); 3388 } 3389 3390 static int 3391 decode_two_byte_opcode(struct vie *vie) 3392 { 3393 uint8_t x; 3394 3395 if (vie_peek(vie, &x)) 3396 return (-1); 3397 3398 vie->op = two_byte_opcodes[x]; 3399 3400 if (vie->op.op_type == VIE_OP_TYPE_NONE) 3401 return (-1); 3402 3403 vie_advance(vie); 3404 return (0); 3405 } 3406 3407 static int 3408 decode_opcode(struct vie *vie) 3409 { 3410 uint8_t x; 3411 3412 if (vie_peek(vie, &x)) 3413 return (-1); 3414 3415 /* Already did this via VEX prefix. */ 3416 if (vie->op.op_type != VIE_OP_TYPE_NONE) 3417 return (0); 3418 3419 vie->op = one_byte_opcodes[x]; 3420 3421 if (vie->op.op_type == VIE_OP_TYPE_NONE) 3422 return (-1); 3423 3424 vie_advance(vie); 3425 3426 if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) 3427 return (decode_two_byte_opcode(vie)); 3428 3429 return (0); 3430 } 3431 3432 static int 3433 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) 3434 { 3435 uint8_t x; 3436 /* 3437 * Handling mov-to/from-cr is special since it is not issuing 3438 * mmio/pio requests and can be done in real mode. We must bypass some 3439 * of the other existing decoding restrictions for it. 3440 */ 3441 const bool is_movcr = ((vie->op.op_flags & VIE_OP_F_REG_REG) != 0); 3442 3443 if (vie->op.op_flags & VIE_OP_F_NO_MODRM) 3444 return (0); 3445 3446 if (cpu_mode == CPU_MODE_REAL && !is_movcr) 3447 return (-1); 3448 3449 if (vie_peek(vie, &x)) 3450 return (-1); 3451 3452 vie->mod = (x >> 6) & 0x3; 3453 vie->rm = (x >> 0) & 0x7; 3454 vie->reg = (x >> 3) & 0x7; 3455 3456 /* 3457 * A direct addressing mode makes no sense in the context of an EPT 3458 * fault. There has to be a memory access involved to cause the 3459 * EPT fault. 
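 *
 * For reference, a ModR/M byte of 0x88 (binary 10 001 000) decodes
 * to mod = 2, reg = 1 and rm = 0; when a REX prefix is present,
 * REX.R and REX.B each contribute a fourth (high) bit to 'reg' and
 * 'rm' respectively.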
3460 */ 3461 if (vie->mod == VIE_MOD_DIRECT && !is_movcr) 3462 return (-1); 3463 3464 if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || 3465 (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { 3466 /* 3467 * Table 2-5: Special Cases of REX Encodings 3468 * 3469 * mod=0, r/m=5 is used in the compatibility mode to 3470 * indicate a disp32 without a base register. 3471 * 3472 * mod!=3, r/m=4 is used in the compatibility mode to 3473 * indicate that the SIB byte is present. 3474 * 3475 * The 'b' bit in the REX prefix is don't care in 3476 * this case. 3477 */ 3478 } else { 3479 vie->rm |= (vie->rex_b << 3); 3480 } 3481 3482 vie->reg |= (vie->rex_r << 3); 3483 3484 /* SIB */ 3485 if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) 3486 goto done; 3487 3488 vie->base_register = gpr_map[vie->rm]; 3489 3490 switch (vie->mod) { 3491 case VIE_MOD_INDIRECT_DISP8: 3492 vie->disp_bytes = 1; 3493 break; 3494 case VIE_MOD_INDIRECT_DISP32: 3495 vie->disp_bytes = 4; 3496 break; 3497 case VIE_MOD_INDIRECT: 3498 if (vie->rm == VIE_RM_DISP32) { 3499 vie->disp_bytes = 4; 3500 /* 3501 * Table 2-7. RIP-Relative Addressing 3502 * 3503 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32 3504 * whereas in compatibility mode it just implies disp32. 3505 */ 3506 3507 if (cpu_mode == CPU_MODE_64BIT) 3508 vie->base_register = VM_REG_GUEST_RIP; 3509 else 3510 vie->base_register = VM_REG_LAST; 3511 } 3512 break; 3513 } 3514 3515 done: 3516 vie_advance(vie); 3517 3518 return (0); 3519 } 3520 3521 static int 3522 decode_sib(struct vie *vie) 3523 { 3524 uint8_t x; 3525 3526 /* Proceed only if SIB byte is present */ 3527 if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) 3528 return (0); 3529 3530 if (vie_peek(vie, &x)) 3531 return (-1); 3532 3533 /* De-construct the SIB byte */ 3534 vie->ss = (x >> 6) & 0x3; 3535 vie->index = (x >> 3) & 0x7; 3536 vie->base = (x >> 0) & 0x7; 3537 3538 /* Apply the REX prefix modifiers */ 3539 vie->index |= vie->rex_x << 3; 3540 vie->base |= vie->rex_b << 3; 3541 3542 switch (vie->mod) { 3543 case VIE_MOD_INDIRECT_DISP8: 3544 vie->disp_bytes = 1; 3545 break; 3546 case VIE_MOD_INDIRECT_DISP32: 3547 vie->disp_bytes = 4; 3548 break; 3549 } 3550 3551 if (vie->mod == VIE_MOD_INDIRECT && 3552 (vie->base == 5 || vie->base == 13)) { 3553 /* 3554 * Special case when base register is unused if mod = 0 3555 * and base = %rbp or %r13. 3556 * 3557 * Documented in: 3558 * Table 2-3: 32-bit Addressing Forms with the SIB Byte 3559 * Table 2-5: Special Cases of REX Encodings 3560 */ 3561 vie->disp_bytes = 4; 3562 } else { 3563 vie->base_register = gpr_map[vie->base]; 3564 } 3565 3566 /* 3567 * All encodings of 'index' are valid except for %rsp (4). 
3568 * 3569 * Documented in: 3570 * Table 2-3: 32-bit Addressing Forms with the SIB Byte 3571 * Table 2-5: Special Cases of REX Encodings 3572 */ 3573 if (vie->index != 4) 3574 vie->index_register = gpr_map[vie->index]; 3575 3576 /* 'scale' makes sense only in the context of an index register */ 3577 if (vie->index_register < VM_REG_LAST) 3578 vie->scale = 1 << vie->ss; 3579 3580 vie_advance(vie); 3581 3582 return (0); 3583 } 3584 3585 static int 3586 decode_displacement(struct vie *vie) 3587 { 3588 int n, i; 3589 uint8_t x; 3590 3591 union { 3592 char buf[4]; 3593 int8_t signed8; 3594 int32_t signed32; 3595 } u; 3596 3597 if ((n = vie->disp_bytes) == 0) 3598 return (0); 3599 3600 if (n != 1 && n != 4) 3601 panic("decode_displacement: invalid disp_bytes %d", n); 3602 3603 for (i = 0; i < n; i++) { 3604 if (vie_peek(vie, &x)) 3605 return (-1); 3606 3607 u.buf[i] = x; 3608 vie_advance(vie); 3609 } 3610 3611 if (n == 1) 3612 vie->displacement = u.signed8; /* sign-extended */ 3613 else 3614 vie->displacement = u.signed32; /* sign-extended */ 3615 3616 return (0); 3617 } 3618 3619 static int 3620 decode_immediate(struct vie *vie) 3621 { 3622 int i, n; 3623 uint8_t x; 3624 union { 3625 char buf[4]; 3626 int8_t signed8; 3627 int16_t signed16; 3628 int32_t signed32; 3629 } u; 3630 3631 /* Figure out immediate operand size (if any) */ 3632 if (vie->op.op_flags & VIE_OP_F_IMM) { 3633 /* 3634 * Section 2.2.1.5 "Immediates", Intel SDM: 3635 * In 64-bit mode the typical size of immediate operands 3636 * remains 32-bits. When the operand size if 64-bits, the 3637 * processor sign-extends all immediates to 64-bits prior 3638 * to their use. 3639 */ 3640 if (vie->opsize == 4 || vie->opsize == 8) 3641 vie->imm_bytes = 4; 3642 else 3643 vie->imm_bytes = 2; 3644 } else if (vie->op.op_flags & VIE_OP_F_IMM8) { 3645 vie->imm_bytes = 1; 3646 } 3647 3648 if ((n = vie->imm_bytes) == 0) 3649 return (0); 3650 3651 KASSERT(n == 1 || n == 2 || n == 4, 3652 ("%s: invalid number of immediate bytes: %d", __func__, n)); 3653 3654 for (i = 0; i < n; i++) { 3655 if (vie_peek(vie, &x)) 3656 return (-1); 3657 3658 u.buf[i] = x; 3659 vie_advance(vie); 3660 } 3661 3662 /* sign-extend the immediate value before use */ 3663 if (n == 1) 3664 vie->immediate = u.signed8; 3665 else if (n == 2) 3666 vie->immediate = u.signed16; 3667 else 3668 vie->immediate = u.signed32; 3669 3670 return (0); 3671 } 3672 3673 static int 3674 decode_moffset(struct vie *vie) 3675 { 3676 int i, n; 3677 uint8_t x; 3678 union { 3679 char buf[8]; 3680 uint64_t u64; 3681 } u; 3682 3683 if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) 3684 return (0); 3685 3686 /* 3687 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: 3688 * The memory offset size follows the address-size of the instruction. 3689 */ 3690 n = vie->addrsize; 3691 KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); 3692 3693 u.u64 = 0; 3694 for (i = 0; i < n; i++) { 3695 if (vie_peek(vie, &x)) 3696 return (-1); 3697 3698 u.buf[i] = x; 3699 vie_advance(vie); 3700 } 3701 vie->displacement = u.u64; 3702 return (0); 3703 } 3704 3705 /* 3706 * Verify that the 'guest linear address' provided as collateral of the nested 3707 * page table fault matches with our instruction decoding. 
3708 */ 3709 int 3710 vie_verify_gla(struct vie *vie, struct vm *vm, int cpuid, uint64_t gla) 3711 { 3712 int error; 3713 uint64_t base, segbase, idx, gla2; 3714 enum vm_reg_name seg; 3715 struct seg_desc desc; 3716 3717 ASSERT((vie->status & VIES_INST_DECODE) != 0); 3718 3719 /* 3720 * If there was no valid GLA context with the exit, or the decoded 3721 * instruction acts on more than one address, verification is done. 3722 */ 3723 if (gla == VIE_INVALID_GLA || 3724 (vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) != 0) { 3725 return (0); 3726 } 3727 3728 base = 0; 3729 if (vie->base_register != VM_REG_LAST) { 3730 error = vm_get_register(vm, cpuid, vie->base_register, &base); 3731 if (error) { 3732 printf("verify_gla: error %d getting base reg %d\n", 3733 error, vie->base_register); 3734 return (-1); 3735 } 3736 3737 /* 3738 * RIP-relative addressing starts from the following 3739 * instruction 3740 */ 3741 if (vie->base_register == VM_REG_GUEST_RIP) 3742 base += vie->num_processed; 3743 } 3744 3745 idx = 0; 3746 if (vie->index_register != VM_REG_LAST) { 3747 error = vm_get_register(vm, cpuid, vie->index_register, &idx); 3748 if (error) { 3749 printf("verify_gla: error %d getting index reg %d\n", 3750 error, vie->index_register); 3751 return (-1); 3752 } 3753 } 3754 3755 /* 3756 * From "Specifying a Segment Selector", Intel SDM, Vol 1 3757 * 3758 * In 64-bit mode, segmentation is generally (but not 3759 * completely) disabled. The exceptions are the FS and GS 3760 * segments. 3761 * 3762 * In legacy IA-32 mode, when the ESP or EBP register is used 3763 * as the base, the SS segment is the default segment. For 3764 * other data references, except when relative to stack or 3765 * string destination the DS segment is the default. These 3766 * can be overridden to allow other segments to be accessed. 
3767 */ 3768 if (vie->segment_override) { 3769 seg = vie->segment_register; 3770 } else if (vie->base_register == VM_REG_GUEST_RSP || 3771 vie->base_register == VM_REG_GUEST_RBP) { 3772 seg = VM_REG_GUEST_SS; 3773 } else { 3774 seg = VM_REG_GUEST_DS; 3775 } 3776 if (vie->paging.cpu_mode == CPU_MODE_64BIT && 3777 seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) { 3778 segbase = 0; 3779 } else { 3780 error = vm_get_seg_desc(vm, cpuid, seg, &desc); 3781 if (error) { 3782 printf("verify_gla: error %d getting segment" 3783 " descriptor %d", error, vie->segment_register); 3784 return (-1); 3785 } 3786 segbase = desc.base; 3787 } 3788 3789 gla2 = segbase + base + vie->scale * idx + vie->displacement; 3790 gla2 &= size2mask[vie->addrsize]; 3791 if (gla != gla2) { 3792 printf("verify_gla mismatch: segbase(0x%0lx)" 3793 "base(0x%0lx), scale(%d), index(0x%0lx), " 3794 "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", 3795 segbase, base, vie->scale, idx, vie->displacement, 3796 gla, gla2); 3797 return (-1); 3798 } 3799 3800 return (0); 3801 } 3802 3803 int 3804 vie_decode_instruction(struct vie *vie, struct vm *vm, int cpuid, int cs_d) 3805 { 3806 enum vm_cpu_mode cpu_mode; 3807 3808 if ((vie->status & VIES_INST_FETCH) == 0) { 3809 return (EINVAL); 3810 } 3811 3812 cpu_mode = vie->paging.cpu_mode; 3813 3814 if (decode_prefixes(vie, cpu_mode, cs_d)) 3815 return (-1); 3816 3817 if (decode_opcode(vie)) 3818 return (-1); 3819 3820 if (decode_modrm(vie, cpu_mode)) 3821 return (-1); 3822 3823 if (decode_sib(vie)) 3824 return (-1); 3825 3826 if (decode_displacement(vie)) 3827 return (-1); 3828 3829 if (decode_immediate(vie)) 3830 return (-1); 3831 3832 if (decode_moffset(vie)) 3833 return (-1); 3834 3835 vie->status |= VIES_INST_DECODE; 3836 3837 return (0); 3838 } 3839