1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2012 Sandvine, Inc. 5 * Copyright (c) 2012 NetApp, Inc. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * $FreeBSD$ 30 */ 31 /* 32 * This file and its contents are supplied under the terms of the 33 * Common Development and Distribution License ("CDDL"), version 1.0. 34 * You may only use this file in accordance with the terms of version 35 * 1.0 of the CDDL. 36 * 37 * A full copy of the text of the CDDL should have accompanied this 38 * source. A copy of the CDDL is also available via the Internet at 39 * http://www.illumos.org/license/CDDL. 40 * 41 * Copyright 2015 Pluribus Networks Inc. 42 * Copyright 2018 Joyent, Inc. 43 * Copyright 2021 Oxide Computer Company 44 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association. 45 */ 46 47 #include <sys/cdefs.h> 48 __FBSDID("$FreeBSD$"); 49 50 #include <sys/param.h> 51 #include <sys/pcpu.h> 52 #include <sys/systm.h> 53 #include <sys/proc.h> 54 55 #include <machine/vmparam.h> 56 #include <machine/vmm.h> 57 #include <sys/vmm_kernel.h> 58 #include <sys/vmm_vm.h> 59 60 #include <sys/vmm_instruction_emul.h> 61 #include <x86/psl.h> 62 #include <x86/specialreg.h> 63 64 #include "vmm_ioport.h" 65 #include "vmm_ktr.h" 66 67 enum vie_status { 68 VIES_INIT = (1U << 0), 69 VIES_MMIO = (1U << 1), 70 VIES_INOUT = (1U << 2), 71 VIES_OTHER = (1U << 3), 72 VIES_INST_FETCH = (1U << 4), 73 VIES_INST_DECODE = (1U << 5), 74 VIES_PENDING_MMIO = (1U << 6), 75 VIES_PENDING_INOUT = (1U << 7), 76 VIES_REPEAT = (1U << 8), 77 VIES_USER_FALLBACK = (1U << 9), 78 VIES_COMPLETE = (1U << 10), 79 }; 80 81 /* State of request to perform emulated access (inout or MMIO) */ 82 enum vie_req { 83 VR_NONE, 84 VR_PENDING, 85 VR_DONE, 86 }; 87 88 struct vie_mmio { 89 uint64_t data; 90 uint64_t gpa; 91 uint8_t bytes; 92 enum vie_req state; 93 }; 94 95 struct vie_op { 96 uint8_t op_byte; /* actual opcode byte */ 97 uint8_t op_type; /* type of operation (e.g. 
MOV) */ 98 uint16_t op_flags; 99 }; 100 101 #define VIE_INST_SIZE 15 102 struct vie { 103 uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ 104 uint8_t num_valid; /* size of the instruction */ 105 uint8_t num_processed; 106 107 uint8_t addrsize:4, opsize:4; /* address and operand sizes */ 108 uint8_t rex_w:1, /* REX prefix */ 109 rex_r:1, 110 rex_x:1, 111 rex_b:1, 112 rex_present:1, 113 repz_present:1, /* REP/REPE/REPZ prefix */ 114 repnz_present:1, /* REPNE/REPNZ prefix */ 115 opsize_override:1, /* Operand size override */ 116 addrsize_override:1, /* Address size override */ 117 segment_override:1; /* Segment override */ 118 119 uint8_t mod:2, /* ModRM byte */ 120 reg:4, 121 rm:4; 122 123 uint8_t ss:2, /* SIB byte */ 124 vex_present:1, /* VEX prefixed */ 125 vex_l:1, /* L bit */ 126 index:4, /* SIB byte */ 127 base:4; /* SIB byte */ 128 129 uint8_t disp_bytes; 130 uint8_t imm_bytes; 131 132 uint8_t scale; 133 134 uint8_t vex_reg:4, /* vvvv: first source reg specifier */ 135 vex_pp:2, /* pp */ 136 _sparebits:2; 137 138 uint8_t _sparebytes[2]; 139 140 int base_register; /* VM_REG_GUEST_xyz */ 141 int index_register; /* VM_REG_GUEST_xyz */ 142 int segment_register; /* VM_REG_GUEST_xyz */ 143 144 int64_t displacement; /* optional addr displacement */ 145 int64_t immediate; /* optional immediate operand */ 146 147 struct vie_op op; /* opcode description */ 148 149 enum vie_status status; 150 151 struct vm_guest_paging paging; /* guest paging state */ 152 153 uint64_t mmio_gpa; /* faulting GPA */ 154 struct vie_mmio mmio_req_read; 155 struct vie_mmio mmio_req_write; 156 157 struct vm_inout inout; /* active in/out op */ 158 enum vie_req inout_req_state; 159 uint32_t inout_req_val; /* value from userspace */ 160 }; 161 162 163 /* struct vie_op.op_type */ 164 enum { 165 VIE_OP_TYPE_NONE = 0, 166 VIE_OP_TYPE_MOV, 167 VIE_OP_TYPE_MOVSX, 168 VIE_OP_TYPE_MOVZX, 169 VIE_OP_TYPE_MOV_CR, 170 VIE_OP_TYPE_AND, 171 VIE_OP_TYPE_OR, 172 VIE_OP_TYPE_SUB, 173 VIE_OP_TYPE_TWO_BYTE, 174 VIE_OP_TYPE_PUSH, 175 VIE_OP_TYPE_CMP, 176 VIE_OP_TYPE_POP, 177 VIE_OP_TYPE_MOVS, 178 VIE_OP_TYPE_GROUP1, 179 VIE_OP_TYPE_STOS, 180 VIE_OP_TYPE_BITTEST, 181 VIE_OP_TYPE_TWOB_GRP15, 182 VIE_OP_TYPE_ADD, 183 VIE_OP_TYPE_TEST, 184 VIE_OP_TYPE_BEXTR, 185 VIE_OP_TYPE_CLTS, 186 VIE_OP_TYPE_LAST 187 }; 188 189 /* struct vie_op.op_flags */ 190 #define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */ 191 #define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ 192 #define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */ 193 #define VIE_OP_F_NO_MODRM (1 << 3) 194 #define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) 195 #define VIE_OP_F_REG_REG (1 << 5) /* special-case for mov-cr */ 196 197 static const struct vie_op three_byte_opcodes_0f38[256] = { 198 [0xF7] = { 199 .op_byte = 0xF7, 200 .op_type = VIE_OP_TYPE_BEXTR, 201 }, 202 }; 203 204 static const struct vie_op two_byte_opcodes[256] = { 205 [0x06] = { 206 .op_byte = 0x06, 207 .op_type = VIE_OP_TYPE_CLTS, 208 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 209 }, 210 [0x20] = { 211 .op_byte = 0x20, 212 .op_type = VIE_OP_TYPE_MOV_CR, 213 .op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION 214 }, 215 [0x22] = { 216 .op_byte = 0x22, 217 .op_type = VIE_OP_TYPE_MOV_CR, 218 .op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION 219 }, 220 [0xAE] = { 221 .op_byte = 0xAE, 222 .op_type = VIE_OP_TYPE_TWOB_GRP15, 223 }, 224 [0xB6] = { 225 .op_byte = 0xB6, 226 .op_type = VIE_OP_TYPE_MOVZX, 227 }, 228 [0xB7] = { 229 .op_byte = 0xB7, 230 .op_type = 
VIE_OP_TYPE_MOVZX, 231 }, 232 [0xBA] = { 233 .op_byte = 0xBA, 234 .op_type = VIE_OP_TYPE_BITTEST, 235 .op_flags = VIE_OP_F_IMM8, 236 }, 237 [0xBE] = { 238 .op_byte = 0xBE, 239 .op_type = VIE_OP_TYPE_MOVSX, 240 }, 241 }; 242 243 static const struct vie_op one_byte_opcodes[256] = { 244 [0x03] = { 245 .op_byte = 0x03, 246 .op_type = VIE_OP_TYPE_ADD, 247 }, 248 [0x0F] = { 249 .op_byte = 0x0F, 250 .op_type = VIE_OP_TYPE_TWO_BYTE 251 }, 252 [0x0B] = { 253 .op_byte = 0x0B, 254 .op_type = VIE_OP_TYPE_OR, 255 }, 256 [0x2B] = { 257 .op_byte = 0x2B, 258 .op_type = VIE_OP_TYPE_SUB, 259 }, 260 [0x39] = { 261 .op_byte = 0x39, 262 .op_type = VIE_OP_TYPE_CMP, 263 }, 264 [0x3B] = { 265 .op_byte = 0x3B, 266 .op_type = VIE_OP_TYPE_CMP, 267 }, 268 [0x88] = { 269 .op_byte = 0x88, 270 .op_type = VIE_OP_TYPE_MOV, 271 }, 272 [0x89] = { 273 .op_byte = 0x89, 274 .op_type = VIE_OP_TYPE_MOV, 275 }, 276 [0x8A] = { 277 .op_byte = 0x8A, 278 .op_type = VIE_OP_TYPE_MOV, 279 }, 280 [0x8B] = { 281 .op_byte = 0x8B, 282 .op_type = VIE_OP_TYPE_MOV, 283 }, 284 [0xA1] = { 285 .op_byte = 0xA1, 286 .op_type = VIE_OP_TYPE_MOV, 287 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, 288 }, 289 [0xA3] = { 290 .op_byte = 0xA3, 291 .op_type = VIE_OP_TYPE_MOV, 292 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, 293 }, 294 [0xA4] = { 295 .op_byte = 0xA4, 296 .op_type = VIE_OP_TYPE_MOVS, 297 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 298 }, 299 [0xA5] = { 300 .op_byte = 0xA5, 301 .op_type = VIE_OP_TYPE_MOVS, 302 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 303 }, 304 [0xAA] = { 305 .op_byte = 0xAA, 306 .op_type = VIE_OP_TYPE_STOS, 307 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 308 }, 309 [0xAB] = { 310 .op_byte = 0xAB, 311 .op_type = VIE_OP_TYPE_STOS, 312 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 313 }, 314 [0xC6] = { 315 /* XXX Group 11 extended opcode - not just MOV */ 316 .op_byte = 0xC6, 317 .op_type = VIE_OP_TYPE_MOV, 318 .op_flags = VIE_OP_F_IMM8, 319 }, 320 [0xC7] = { 321 .op_byte = 0xC7, 322 .op_type = VIE_OP_TYPE_MOV, 323 .op_flags = VIE_OP_F_IMM, 324 }, 325 [0x23] = { 326 .op_byte = 0x23, 327 .op_type = VIE_OP_TYPE_AND, 328 }, 329 [0x80] = { 330 /* Group 1 extended opcode */ 331 .op_byte = 0x80, 332 .op_type = VIE_OP_TYPE_GROUP1, 333 .op_flags = VIE_OP_F_IMM8, 334 }, 335 [0x81] = { 336 /* Group 1 extended opcode */ 337 .op_byte = 0x81, 338 .op_type = VIE_OP_TYPE_GROUP1, 339 .op_flags = VIE_OP_F_IMM, 340 }, 341 [0x83] = { 342 /* Group 1 extended opcode */ 343 .op_byte = 0x83, 344 .op_type = VIE_OP_TYPE_GROUP1, 345 .op_flags = VIE_OP_F_IMM8, 346 }, 347 [0x8F] = { 348 /* XXX Group 1A extended opcode - not just POP */ 349 .op_byte = 0x8F, 350 .op_type = VIE_OP_TYPE_POP, 351 }, 352 [0xF6] = { 353 /* XXX Group 3 extended opcode - not just TEST */ 354 .op_byte = 0xF6, 355 .op_type = VIE_OP_TYPE_TEST, 356 .op_flags = VIE_OP_F_IMM8, 357 }, 358 [0xF7] = { 359 /* XXX Group 3 extended opcode - not just TEST */ 360 .op_byte = 0xF7, 361 .op_type = VIE_OP_TYPE_TEST, 362 .op_flags = VIE_OP_F_IMM, 363 }, 364 [0xFF] = { 365 /* XXX Group 5 extended opcode - not just PUSH */ 366 .op_byte = 0xFF, 367 .op_type = VIE_OP_TYPE_PUSH, 368 } 369 }; 370 371 /* struct vie.mod */ 372 #define VIE_MOD_INDIRECT 0 373 #define VIE_MOD_INDIRECT_DISP8 1 374 #define VIE_MOD_INDIRECT_DISP32 2 375 #define VIE_MOD_DIRECT 3 376 377 /* struct vie.rm */ 378 #define VIE_RM_SIB 4 379 #define VIE_RM_DISP32 5 380 381 #define GB (1024 * 1024 * 1024) 382 383 384 /* 385 * Paging defines, previously pulled in 
from machine/pmap.h 386 */ 387 #define PG_V (1 << 0) /* Present */ 388 #define PG_RW (1 << 1) /* Read/Write */ 389 #define PG_U (1 << 2) /* User/Supervisor */ 390 #define PG_A (1 << 5) /* Accessed */ 391 #define PG_M (1 << 6) /* Dirty */ 392 #define PG_PS (1 << 7) /* Largepage */ 393 394 /* 395 * Paging except defines, previously pulled in from machine/pmap.h 396 */ 397 #define PGEX_P (1 << 0) /* Non-present/Protection */ 398 #define PGEX_W (1 << 1) /* Read/Write */ 399 #define PGEX_U (1 << 2) /* User/Supervisor */ 400 #define PGEX_RSV (1 << 3) /* (Non-)Reserved */ 401 #define PGEX_I (1 << 4) /* Instruction */ 402 403 404 static enum vm_reg_name gpr_map[16] = { 405 VM_REG_GUEST_RAX, 406 VM_REG_GUEST_RCX, 407 VM_REG_GUEST_RDX, 408 VM_REG_GUEST_RBX, 409 VM_REG_GUEST_RSP, 410 VM_REG_GUEST_RBP, 411 VM_REG_GUEST_RSI, 412 VM_REG_GUEST_RDI, 413 VM_REG_GUEST_R8, 414 VM_REG_GUEST_R9, 415 VM_REG_GUEST_R10, 416 VM_REG_GUEST_R11, 417 VM_REG_GUEST_R12, 418 VM_REG_GUEST_R13, 419 VM_REG_GUEST_R14, 420 VM_REG_GUEST_R15 421 }; 422 423 static enum vm_reg_name cr_map[16] = { 424 VM_REG_GUEST_CR0, 425 VM_REG_LAST, 426 VM_REG_GUEST_CR2, 427 VM_REG_GUEST_CR3, 428 VM_REG_GUEST_CR4, 429 VM_REG_LAST, 430 VM_REG_LAST, 431 VM_REG_LAST, 432 VM_REG_LAST, 433 VM_REG_LAST, 434 VM_REG_LAST, 435 VM_REG_LAST, 436 VM_REG_LAST, 437 VM_REG_LAST, 438 VM_REG_LAST, 439 VM_REG_LAST 440 }; 441 442 static uint64_t size2mask[] = { 443 [1] = 0xff, 444 [2] = 0xffff, 445 [4] = 0xffffffff, 446 [8] = 0xffffffffffffffff, 447 }; 448 449 450 static int vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid, 451 uint64_t gpa, uint64_t *rval, int bytes); 452 static int vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid, 453 uint64_t gpa, uint64_t wval, int bytes); 454 static int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, 455 struct seg_desc *desc, uint64_t offset, int length, int addrsize, 456 int prot, uint64_t *gla); 457 static int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla); 458 static int vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, 459 uint64_t gla); 460 static uint64_t vie_size2mask(int size); 461 462 struct vie * 463 vie_alloc() 464 { 465 return (kmem_zalloc(sizeof (struct vie), KM_SLEEP)); 466 } 467 468 void 469 vie_free(struct vie *vie) 470 { 471 kmem_free(vie, sizeof (struct vie)); 472 } 473 474 enum vm_reg_name 475 vie_regnum_map(uint8_t regnum) 476 { 477 VERIFY3U(regnum, <, 16); 478 return (gpr_map[regnum]); 479 } 480 481 static void 482 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) 483 { 484 *lhbr = 0; 485 *reg = gpr_map[vie->reg]; 486 487 /* 488 * 64-bit mode imposes limitations on accessing legacy high byte 489 * registers (lhbr). 490 * 491 * The legacy high-byte registers cannot be addressed if the REX 492 * prefix is present. In this case the values 4, 5, 6 and 7 of the 493 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. 494 * 495 * If the REX prefix is not present then the values 4, 5, 6 and 7 496 * of the 'ModRM:reg' field address the legacy high-byte registers, 497 * %ah, %ch, %dh and %bh respectively. 
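	 *
	 * For example, with no REX prefix present, ModRM:reg = 5 selects
	 * %ch (bits 15:8 of %rcx via gpr_map[5 & 0x3]), while the same
	 * encoding with a REX prefix selects %bpl.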
	 */
	if (!vie->rex_present) {
		if (vie->reg & 0x4) {
			*lhbr = 1;
			*reg = gpr_map[vie->reg & 0x3];
		}
	}
}

static int
vie_read_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t *rval)
{
	uint64_t val;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &val);

	/*
	 * To obtain the value of a legacy high byte register shift the
	 * base register right by 8 bits (%ah = %rax >> 8).
	 */
	if (lhbr)
		*rval = val >> 8;
	else
		*rval = val;
	return (error);
}

static int
vie_write_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t byte)
{
	uint64_t origval, val, mask;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &origval);
	if (error == 0) {
		val = byte;
		mask = 0xff;
		if (lhbr) {
			/*
			 * Shift left by 8 to store 'byte' in a legacy high
			 * byte register.
			 */
			val <<= 8;
			mask <<= 8;
		}
		val |= origval & ~mask;
		error = vm_set_register(vm, vcpuid, reg, val);
	}
	return (error);
}

static int
vie_update_register(struct vm *vm, int vcpuid, enum vm_reg_name reg,
    uint64_t val, int size)
{
	int error;
	uint64_t origval;

	switch (size) {
	case 1:
	case 2:
		error = vm_get_register(vm, vcpuid, reg, &origval);
		if (error)
			return (error);
		val &= size2mask[size];
		val |= origval & ~size2mask[size];
		break;
	case 4:
		val &= 0xffffffffUL;
		break;
	case 8:
		break;
	default:
		return (EINVAL);
	}

	error = vm_set_register(vm, vcpuid, reg, val);
	return (error);
}

static int
vie_repeat(struct vie *vie)
{
	vie->status |= VIES_REPEAT;

	/*
	 * Clear out any cached operation values so the repeated instruction
	 * can begin without using that stale state.  Other state, such as the
	 * decoding results, is kept around since it will not vary between
	 * iterations of a rep-prefixed instruction.
	 */
	if ((vie->status & VIES_MMIO) != 0) {
		vie->mmio_req_read.state = VR_NONE;
		vie->mmio_req_write.state = VR_NONE;
	} else if ((vie->status & VIES_INOUT) != 0) {
		vie->inout_req_state = VR_NONE;
	} else {
		panic("unexpected emulation state");
	}

	return (EAGAIN);
}

#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)

/*
 * Return the status flags that would result from doing (x - y).
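 *
 * For example, getcc(4, 1, 1) computes 1 - 1 on 32-bit operands and returns
 * RFLAGS with ZF set and CF clear, while getcc(1, 0, 1) computes 0 - 1 and
 * returns RFLAGS with CF and SF set.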
610 */ 611 /* BEGIN CSTYLED */ 612 #define GETCC(sz) \ 613 static ulong_t \ 614 getcc##sz(uint##sz##_t x, uint##sz##_t y) \ 615 { \ 616 ulong_t rflags; \ 617 \ 618 __asm __volatile("sub %2,%1; pushfq; popq %0" : \ 619 "=r" (rflags), "+r" (x) : "m" (y)); \ 620 return (rflags); \ 621 } struct __hack 622 /* END CSTYLED */ 623 624 GETCC(8); 625 GETCC(16); 626 GETCC(32); 627 GETCC(64); 628 629 static ulong_t 630 getcc(int opsize, uint64_t x, uint64_t y) 631 { 632 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, 633 ("getcc: invalid operand size %d", opsize)); 634 635 if (opsize == 1) 636 return (getcc8(x, y)); 637 else if (opsize == 2) 638 return (getcc16(x, y)); 639 else if (opsize == 4) 640 return (getcc32(x, y)); 641 else 642 return (getcc64(x, y)); 643 } 644 645 /* 646 * Macro creation of functions getaddflags{8,16,32,64} 647 */ 648 /* BEGIN CSTYLED */ 649 #define GETADDFLAGS(sz) \ 650 static ulong_t \ 651 getaddflags##sz(uint##sz##_t x, uint##sz##_t y) \ 652 { \ 653 ulong_t rflags; \ 654 \ 655 __asm __volatile("add %2,%1; pushfq; popq %0" : \ 656 "=r" (rflags), "+r" (x) : "m" (y)); \ 657 return (rflags); \ 658 } struct __hack 659 /* END CSTYLED */ 660 661 GETADDFLAGS(8); 662 GETADDFLAGS(16); 663 GETADDFLAGS(32); 664 GETADDFLAGS(64); 665 666 static ulong_t 667 getaddflags(int opsize, uint64_t x, uint64_t y) 668 { 669 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, 670 ("getaddflags: invalid operand size %d", opsize)); 671 672 if (opsize == 1) 673 return (getaddflags8(x, y)); 674 else if (opsize == 2) 675 return (getaddflags16(x, y)); 676 else if (opsize == 4) 677 return (getaddflags32(x, y)); 678 else 679 return (getaddflags64(x, y)); 680 } 681 682 /* 683 * Return the status flags that would result from doing (x & y). 
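 *
 * For example, getandflags(1, 0xf0, 0x0f) computes 0xf0 & 0x0f == 0, so the
 * returned RFLAGS has ZF set and both CF and OF clear.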
684 */ 685 /* BEGIN CSTYLED */ 686 #define GETANDFLAGS(sz) \ 687 static ulong_t \ 688 getandflags##sz(uint##sz##_t x, uint##sz##_t y) \ 689 { \ 690 ulong_t rflags; \ 691 \ 692 __asm __volatile("and %2,%1; pushfq; popq %0" : \ 693 "=r" (rflags), "+r" (x) : "m" (y)); \ 694 return (rflags); \ 695 } struct __hack 696 /* END CSTYLED */ 697 698 GETANDFLAGS(8); 699 GETANDFLAGS(16); 700 GETANDFLAGS(32); 701 GETANDFLAGS(64); 702 703 static ulong_t 704 getandflags(int opsize, uint64_t x, uint64_t y) 705 { 706 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, 707 ("getandflags: invalid operand size %d", opsize)); 708 709 if (opsize == 1) 710 return (getandflags8(x, y)); 711 else if (opsize == 2) 712 return (getandflags16(x, y)); 713 else if (opsize == 4) 714 return (getandflags32(x, y)); 715 else 716 return (getandflags64(x, y)); 717 } 718 719 static int 720 vie_emulate_mov_cr(struct vie *vie, struct vm *vm, int vcpuid) 721 { 722 uint64_t val; 723 int err; 724 enum vm_reg_name gpr = gpr_map[vie->rm]; 725 enum vm_reg_name cr = cr_map[vie->reg]; 726 727 uint_t size = 4; 728 if (vie->paging.cpu_mode == CPU_MODE_64BIT) { 729 size = 8; 730 } 731 732 switch (vie->op.op_byte) { 733 case 0x20: 734 /* 735 * MOV control register (ModRM:reg) to reg (ModRM:r/m) 736 * 20/r: mov r32, CR0-CR7 737 * 20/r: mov r64, CR0-CR7 738 * REX.R + 20/0: mov r64, CR8 739 */ 740 if (vie->paging.cpl != 0) { 741 vm_inject_gp(vm, vcpuid); 742 vie->num_processed = 0; 743 return (0); 744 } 745 err = vm_get_register(vm, vcpuid, cr, &val); 746 if (err != 0) { 747 /* #UD for access to non-existent CRs */ 748 vm_inject_ud(vm, vcpuid); 749 vie->num_processed = 0; 750 return (0); 751 } 752 err = vie_update_register(vm, vcpuid, gpr, val, size); 753 break; 754 case 0x22: { 755 /* 756 * MOV reg (ModRM:r/m) to control register (ModRM:reg) 757 * 22/r: mov CR0-CR7, r32 758 * 22/r: mov CR0-CR7, r64 759 * REX.R + 22/0: mov CR8, r64 760 */ 761 uint64_t old, diff; 762 763 if (vie->paging.cpl != 0) { 764 vm_inject_gp(vm, vcpuid); 765 vie->num_processed = 0; 766 return (0); 767 } 768 err = vm_get_register(vm, vcpuid, cr, &old); 769 if (err != 0) { 770 /* #UD for access to non-existent CRs */ 771 vm_inject_ud(vm, vcpuid); 772 vie->num_processed = 0; 773 return (0); 774 } 775 err = vm_get_register(vm, vcpuid, gpr, &val); 776 VERIFY0(err); 777 val &= size2mask[size]; 778 diff = old ^ val; 779 780 switch (cr) { 781 case VM_REG_GUEST_CR0: 782 if ((diff & CR0_PG) != 0) { 783 uint64_t efer; 784 785 err = vm_get_register(vm, vcpuid, 786 VM_REG_GUEST_EFER, &efer); 787 VERIFY0(err); 788 789 /* Keep the long-mode state in EFER in sync */ 790 if ((val & CR0_PG) != 0 && 791 (efer & EFER_LME) != 0) { 792 efer |= EFER_LMA; 793 } 794 if ((val & CR0_PG) == 0 && 795 (efer & EFER_LME) != 0) { 796 efer &= ~EFER_LMA; 797 } 798 799 err = vm_set_register(vm, vcpuid, 800 VM_REG_GUEST_EFER, efer); 801 VERIFY0(err); 802 } 803 /* TODO: enforce more of the #GP checks */ 804 err = vm_set_register(vm, vcpuid, cr, val); 805 VERIFY0(err); 806 break; 807 case VM_REG_GUEST_CR2: 808 case VM_REG_GUEST_CR3: 809 case VM_REG_GUEST_CR4: 810 /* TODO: enforce more of the #GP checks */ 811 err = vm_set_register(vm, vcpuid, cr, val); 812 break; 813 default: 814 /* The cr_map mapping should prevent this */ 815 panic("invalid cr %d", cr); 816 } 817 break; 818 } 819 default: 820 return (EINVAL); 821 } 822 return (err); 823 } 824 825 static int 826 vie_emulate_mov(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 827 { 828 int error, size; 829 enum vm_reg_name reg; 830 
uint8_t byte; 831 uint64_t val; 832 833 size = vie->opsize; 834 error = EINVAL; 835 836 switch (vie->op.op_byte) { 837 case 0x88: 838 /* 839 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) 840 * 88/r: mov r/m8, r8 841 * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) 842 */ 843 size = 1; /* override for byte operation */ 844 error = vie_read_bytereg(vie, vm, vcpuid, &byte); 845 if (error == 0) { 846 error = vie_mmio_write(vie, vm, vcpuid, gpa, byte, 847 size); 848 } 849 break; 850 case 0x89: 851 /* 852 * MOV from reg (ModRM:reg) to mem (ModRM:r/m) 853 * 89/r: mov r/m16, r16 854 * 89/r: mov r/m32, r32 855 * REX.W + 89/r mov r/m64, r64 856 */ 857 reg = gpr_map[vie->reg]; 858 error = vm_get_register(vm, vcpuid, reg, &val); 859 if (error == 0) { 860 val &= size2mask[size]; 861 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); 862 } 863 break; 864 case 0x8A: 865 /* 866 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) 867 * 8A/r: mov r8, r/m8 868 * REX + 8A/r: mov r8, r/m8 869 */ 870 size = 1; /* override for byte operation */ 871 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size); 872 if (error == 0) 873 error = vie_write_bytereg(vie, vm, vcpuid, val); 874 break; 875 case 0x8B: 876 /* 877 * MOV from mem (ModRM:r/m) to reg (ModRM:reg) 878 * 8B/r: mov r16, r/m16 879 * 8B/r: mov r32, r/m32 880 * REX.W 8B/r: mov r64, r/m64 881 */ 882 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size); 883 if (error == 0) { 884 reg = gpr_map[vie->reg]; 885 error = vie_update_register(vm, vcpuid, reg, val, size); 886 } 887 break; 888 case 0xA1: 889 /* 890 * MOV from seg:moffset to AX/EAX/RAX 891 * A1: mov AX, moffs16 892 * A1: mov EAX, moffs32 893 * REX.W + A1: mov RAX, moffs64 894 */ 895 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size); 896 if (error == 0) { 897 reg = VM_REG_GUEST_RAX; 898 error = vie_update_register(vm, vcpuid, reg, val, size); 899 } 900 break; 901 case 0xA3: 902 /* 903 * MOV from AX/EAX/RAX to seg:moffset 904 * A3: mov moffs16, AX 905 * A3: mov moffs32, EAX 906 * REX.W + A3: mov moffs64, RAX 907 */ 908 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); 909 if (error == 0) { 910 val &= size2mask[size]; 911 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); 912 } 913 break; 914 case 0xC6: 915 /* 916 * MOV from imm8 to mem (ModRM:r/m) 917 * C6/0 mov r/m8, imm8 918 * REX + C6/0 mov r/m8, imm8 919 */ 920 size = 1; /* override for byte operation */ 921 val = vie->immediate; 922 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); 923 break; 924 case 0xC7: 925 /* 926 * MOV from imm16/imm32 to mem (ModRM:r/m) 927 * C7/0 mov r/m16, imm16 928 * C7/0 mov r/m32, imm32 929 * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) 930 */ 931 val = vie->immediate & size2mask[size]; 932 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); 933 break; 934 default: 935 break; 936 } 937 938 return (error); 939 } 940 941 static int 942 vie_emulate_movx(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 943 { 944 int error, size; 945 enum vm_reg_name reg; 946 uint64_t val; 947 948 size = vie->opsize; 949 error = EINVAL; 950 951 switch (vie->op.op_byte) { 952 case 0xB6: 953 /* 954 * MOV and zero extend byte from mem (ModRM:r/m) to 955 * reg (ModRM:reg). 
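		 *
		 * For example, a 1-byte MMIO read returning 0x80 is
		 * zero-extended, so a 32-bit destination ends up holding
		 * 0x00000080: val is narrowed to uint8_t below and then
		 * widened with zeros by vie_update_register().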
956 * 957 * 0F B6/r movzx r16, r/m8 958 * 0F B6/r movzx r32, r/m8 959 * REX.W + 0F B6/r movzx r64, r/m8 960 */ 961 962 /* get the first operand */ 963 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1); 964 if (error) 965 break; 966 967 /* get the second operand */ 968 reg = gpr_map[vie->reg]; 969 970 /* zero-extend byte */ 971 val = (uint8_t)val; 972 973 /* write the result */ 974 error = vie_update_register(vm, vcpuid, reg, val, size); 975 break; 976 case 0xB7: 977 /* 978 * MOV and zero extend word from mem (ModRM:r/m) to 979 * reg (ModRM:reg). 980 * 981 * 0F B7/r movzx r32, r/m16 982 * REX.W + 0F B7/r movzx r64, r/m16 983 */ 984 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 2); 985 if (error) 986 return (error); 987 988 reg = gpr_map[vie->reg]; 989 990 /* zero-extend word */ 991 val = (uint16_t)val; 992 993 error = vie_update_register(vm, vcpuid, reg, val, size); 994 break; 995 case 0xBE: 996 /* 997 * MOV and sign extend byte from mem (ModRM:r/m) to 998 * reg (ModRM:reg). 999 * 1000 * 0F BE/r movsx r16, r/m8 1001 * 0F BE/r movsx r32, r/m8 1002 * REX.W + 0F BE/r movsx r64, r/m8 1003 */ 1004 1005 /* get the first operand */ 1006 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1); 1007 if (error) 1008 break; 1009 1010 /* get the second operand */ 1011 reg = gpr_map[vie->reg]; 1012 1013 /* sign extend byte */ 1014 val = (int8_t)val; 1015 1016 /* write the result */ 1017 error = vie_update_register(vm, vcpuid, reg, val, size); 1018 break; 1019 default: 1020 break; 1021 } 1022 return (error); 1023 } 1024 1025 /* 1026 * Helper function to calculate and validate a linear address. 1027 */ 1028 static int 1029 vie_get_gla(struct vie *vie, struct vm *vm, int vcpuid, int opsize, 1030 int addrsize, int prot, enum vm_reg_name seg, enum vm_reg_name gpr, 1031 uint64_t *gla) 1032 { 1033 struct seg_desc desc; 1034 uint64_t cr0, val, rflags; 1035 int error; 1036 struct vm_guest_paging *paging; 1037 1038 paging = &vie->paging; 1039 1040 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); 1041 KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); 1042 1043 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1044 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 1045 1046 error = vm_get_seg_desc(vm, vcpuid, seg, &desc); 1047 KASSERT(error == 0, ("%s: error %d getting segment descriptor %d", 1048 __func__, error, seg)); 1049 1050 error = vm_get_register(vm, vcpuid, gpr, &val); 1051 KASSERT(error == 0, ("%s: error %d getting register %d", __func__, 1052 error, gpr)); 1053 1054 if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize, 1055 addrsize, prot, gla)) { 1056 if (seg == VM_REG_GUEST_SS) 1057 vm_inject_ss(vm, vcpuid, 0); 1058 else 1059 vm_inject_gp(vm, vcpuid); 1060 return (-1); 1061 } 1062 1063 if (vie_canonical_check(paging->cpu_mode, *gla)) { 1064 if (seg == VM_REG_GUEST_SS) 1065 vm_inject_ss(vm, vcpuid, 0); 1066 else 1067 vm_inject_gp(vm, vcpuid); 1068 return (-1); 1069 } 1070 1071 if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { 1072 vm_inject_ac(vm, vcpuid, 0); 1073 return (-1); 1074 } 1075 1076 return (0); 1077 } 1078 1079 static int 1080 vie_emulate_movs(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1081 { 1082 struct vm_copyinfo copyinfo[2]; 1083 uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; 1084 uint64_t rcx, rdi, rsi, rflags; 1085 int error, fault, opsize, seg, repeat; 1086 struct vm_guest_paging *paging; 1087 1088 opsize = (vie->op.op_byte == 0xA4) ? 
1 : vie->opsize; 1089 val = 0; 1090 error = 0; 1091 paging = &vie->paging; 1092 1093 /* 1094 * XXX although the MOVS instruction is only supposed to be used with 1095 * the "rep" prefix some guests like FreeBSD will use "repnz" instead. 1096 * 1097 * Empirically the "repnz" prefix has identical behavior to "rep" 1098 * and the zero flag does not make a difference. 1099 */ 1100 repeat = vie->repz_present | vie->repnz_present; 1101 1102 if (repeat) { 1103 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); 1104 KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); 1105 1106 /* 1107 * The count register is %rcx, %ecx or %cx depending on the 1108 * address size of the instruction. 1109 */ 1110 if ((rcx & vie_size2mask(vie->addrsize)) == 0) { 1111 error = 0; 1112 goto done; 1113 } 1114 } 1115 1116 /* 1117 * Source Destination Comments 1118 * -------------------------------------------- 1119 * (1) memory memory n/a 1120 * (2) memory mmio emulated 1121 * (3) mmio memory emulated 1122 * (4) mmio mmio emulated 1123 * 1124 * At this point we don't have sufficient information to distinguish 1125 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this 1126 * out because it will succeed only when operating on regular memory. 1127 * 1128 * XXX the emulation doesn't properly handle the case where 'gpa' 1129 * is straddling the boundary between the normal memory and MMIO. 1130 */ 1131 1132 seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS; 1133 if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize, PROT_READ, seg, 1134 VM_REG_GUEST_RSI, &srcaddr) != 0) { 1135 goto done; 1136 } 1137 1138 error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ, 1139 copyinfo, nitems(copyinfo), &fault); 1140 if (error == 0) { 1141 if (fault) 1142 goto done; /* Resume guest to handle fault */ 1143 1144 /* 1145 * case (2): read from system memory and write to mmio. 1146 */ 1147 vm_copyin(vm, vcpuid, copyinfo, &val, opsize); 1148 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 1149 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize); 1150 if (error) 1151 goto done; 1152 } else { 1153 /* 1154 * 'vm_copy_setup()' is expected to fail for cases (3) and (4) 1155 * if 'srcaddr' is in the mmio space. 1156 */ 1157 1158 if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize, 1159 PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, 1160 &dstaddr) != 0) { 1161 goto done; 1162 } 1163 1164 error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, 1165 PROT_WRITE, copyinfo, nitems(copyinfo), &fault); 1166 if (error == 0) { 1167 if (fault) 1168 goto done; /* Resume guest to handle fault */ 1169 1170 /* 1171 * case (3): read from MMIO and write to system memory. 1172 * 1173 * A MMIO read can have side-effects so we 1174 * commit to it only after vm_copy_setup() is 1175 * successful. If a page-fault needs to be 1176 * injected into the guest then it will happen 1177 * before the MMIO read is attempted. 1178 */ 1179 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1180 opsize); 1181 1182 if (error == 0) { 1183 vm_copyout(vm, vcpuid, &val, copyinfo, opsize); 1184 } 1185 /* 1186 * Regardless of whether the MMIO read was successful or 1187 * not, the copy resources must be cleaned up. 1188 */ 1189 vm_copy_teardown(vm, vcpuid, copyinfo, 1190 nitems(copyinfo)); 1191 if (error != 0) { 1192 goto done; 1193 } 1194 } else { 1195 /* 1196 * Case (4): read from and write to mmio. 
1197 * 1198 * Commit to the MMIO read/write (with potential 1199 * side-effects) only after we are sure that the 1200 * instruction is not going to be restarted due 1201 * to address translation faults. 1202 */ 1203 error = vm_gla2gpa(vm, vcpuid, paging, srcaddr, 1204 PROT_READ, &srcgpa, &fault); 1205 if (error || fault) 1206 goto done; 1207 1208 error = vm_gla2gpa(vm, vcpuid, paging, dstaddr, 1209 PROT_WRITE, &dstgpa, &fault); 1210 if (error || fault) 1211 goto done; 1212 1213 error = vie_mmio_read(vie, vm, vcpuid, srcgpa, &val, 1214 opsize); 1215 if (error) 1216 goto done; 1217 1218 error = vie_mmio_write(vie, vm, vcpuid, dstgpa, val, 1219 opsize); 1220 if (error) 1221 goto done; 1222 } 1223 } 1224 1225 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi); 1226 KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); 1227 1228 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); 1229 KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); 1230 1231 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1232 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 1233 1234 if (rflags & PSL_D) { 1235 rsi -= opsize; 1236 rdi -= opsize; 1237 } else { 1238 rsi += opsize; 1239 rdi += opsize; 1240 } 1241 1242 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi, 1243 vie->addrsize); 1244 KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error)); 1245 1246 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, 1247 vie->addrsize); 1248 KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); 1249 1250 if (repeat) { 1251 rcx = rcx - 1; 1252 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, 1253 rcx, vie->addrsize); 1254 KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); 1255 1256 /* 1257 * Repeat the instruction if the count register is not zero. 1258 */ 1259 if ((rcx & vie_size2mask(vie->addrsize)) != 0) 1260 return (vie_repeat(vie)); 1261 } 1262 done: 1263 return (error); 1264 } 1265 1266 static int 1267 vie_emulate_stos(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1268 { 1269 int error, opsize, repeat; 1270 uint64_t val; 1271 uint64_t rcx, rdi, rflags; 1272 1273 opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize; 1274 repeat = vie->repz_present | vie->repnz_present; 1275 1276 if (repeat) { 1277 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); 1278 KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); 1279 1280 /* 1281 * The count register is %rcx, %ecx or %cx depending on the 1282 * address size of the instruction. 
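		 *
		 * For example, with a 16-bit address size only %cx is
		 * consulted, so a REP STOS with %rcx == 0x10000 terminates
		 * immediately because (rcx & 0xffff) == 0.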
1283 */ 1284 if ((rcx & vie_size2mask(vie->addrsize)) == 0) 1285 return (0); 1286 } 1287 1288 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); 1289 KASSERT(!error, ("%s: error %d getting rax", __func__, error)); 1290 1291 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize); 1292 if (error) 1293 return (error); 1294 1295 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); 1296 KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); 1297 1298 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1299 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 1300 1301 if (rflags & PSL_D) 1302 rdi -= opsize; 1303 else 1304 rdi += opsize; 1305 1306 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, 1307 vie->addrsize); 1308 KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); 1309 1310 if (repeat) { 1311 rcx = rcx - 1; 1312 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, 1313 rcx, vie->addrsize); 1314 KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); 1315 1316 /* 1317 * Repeat the instruction if the count register is not zero. 1318 */ 1319 if ((rcx & vie_size2mask(vie->addrsize)) != 0) 1320 return (vie_repeat(vie)); 1321 } 1322 1323 return (0); 1324 } 1325 1326 static int 1327 vie_emulate_and(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1328 { 1329 int error, size; 1330 enum vm_reg_name reg; 1331 uint64_t result, rflags, rflags2, val1, val2; 1332 1333 size = vie->opsize; 1334 error = EINVAL; 1335 1336 switch (vie->op.op_byte) { 1337 case 0x23: 1338 /* 1339 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the 1340 * result in reg. 1341 * 1342 * 23/r and r16, r/m16 1343 * 23/r and r32, r/m32 1344 * REX.W + 23/r and r64, r/m64 1345 */ 1346 1347 /* get the first operand */ 1348 reg = gpr_map[vie->reg]; 1349 error = vm_get_register(vm, vcpuid, reg, &val1); 1350 if (error) 1351 break; 1352 1353 /* get the second operand */ 1354 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); 1355 if (error) 1356 break; 1357 1358 /* perform the operation and write the result */ 1359 result = val1 & val2; 1360 error = vie_update_register(vm, vcpuid, reg, result, size); 1361 break; 1362 case 0x81: 1363 case 0x83: 1364 /* 1365 * AND mem (ModRM:r/m) with immediate and store the 1366 * result in mem. 1367 * 1368 * 81 /4 and r/m16, imm16 1369 * 81 /4 and r/m32, imm32 1370 * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 1371 * 1372 * 83 /4 and r/m16, imm8 sign-extended to 16 1373 * 83 /4 and r/m32, imm8 sign-extended to 32 1374 * REX.W + 83/4 and r/m64, imm8 sign-extended to 64 1375 */ 1376 1377 /* get the first operand */ 1378 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size); 1379 if (error) 1380 break; 1381 1382 /* 1383 * perform the operation with the pre-fetched immediate 1384 * operand and write the result 1385 */ 1386 result = val1 & vie->immediate; 1387 error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size); 1388 break; 1389 default: 1390 break; 1391 } 1392 if (error) 1393 return (error); 1394 1395 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1396 if (error) 1397 return (error); 1398 1399 /* 1400 * OF and CF are cleared; the SF, ZF and PF flags are set according 1401 * to the result; AF is undefined. 1402 * 1403 * The updated status flags are obtained by subtracting 0 from 'result'. 
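	 *
	 * For example, result == 0 makes getcc(size, 0, 0) report ZF and PF
	 * set; only PSL_PF, PSL_Z and PSL_N are then merged back, so CF and
	 * OF end up cleared as the architecture requires.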
1404 */ 1405 rflags2 = getcc(size, result, 0); 1406 rflags &= ~RFLAGS_STATUS_BITS; 1407 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); 1408 1409 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); 1410 return (error); 1411 } 1412 1413 static int 1414 vie_emulate_or(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1415 { 1416 int error, size; 1417 enum vm_reg_name reg; 1418 uint64_t result, rflags, rflags2, val1, val2; 1419 1420 size = vie->opsize; 1421 error = EINVAL; 1422 1423 switch (vie->op.op_byte) { 1424 case 0x0B: 1425 /* 1426 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the 1427 * result in reg. 1428 * 1429 * 0b/r or r16, r/m16 1430 * 0b/r or r32, r/m32 1431 * REX.W + 0b/r or r64, r/m64 1432 */ 1433 1434 /* get the first operand */ 1435 reg = gpr_map[vie->reg]; 1436 error = vm_get_register(vm, vcpuid, reg, &val1); 1437 if (error) 1438 break; 1439 1440 /* get the second operand */ 1441 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); 1442 if (error) 1443 break; 1444 1445 /* perform the operation and write the result */ 1446 result = val1 | val2; 1447 error = vie_update_register(vm, vcpuid, reg, result, size); 1448 break; 1449 case 0x81: 1450 case 0x83: 1451 /* 1452 * OR mem (ModRM:r/m) with immediate and store the 1453 * result in mem. 1454 * 1455 * 81 /1 or r/m16, imm16 1456 * 81 /1 or r/m32, imm32 1457 * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64 1458 * 1459 * 83 /1 or r/m16, imm8 sign-extended to 16 1460 * 83 /1 or r/m32, imm8 sign-extended to 32 1461 * REX.W + 83/1 or r/m64, imm8 sign-extended to 64 1462 */ 1463 1464 /* get the first operand */ 1465 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size); 1466 if (error) 1467 break; 1468 1469 /* 1470 * perform the operation with the pre-fetched immediate 1471 * operand and write the result 1472 */ 1473 result = val1 | vie->immediate; 1474 error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size); 1475 break; 1476 default: 1477 break; 1478 } 1479 if (error) 1480 return (error); 1481 1482 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1483 if (error) 1484 return (error); 1485 1486 /* 1487 * OF and CF are cleared; the SF, ZF and PF flags are set according 1488 * to the result; AF is undefined. 1489 * 1490 * The updated status flags are obtained by subtracting 0 from 'result'. 1491 */ 1492 rflags2 = getcc(size, result, 0); 1493 rflags &= ~RFLAGS_STATUS_BITS; 1494 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); 1495 1496 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); 1497 return (error); 1498 } 1499 1500 static int 1501 vie_emulate_cmp(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1502 { 1503 int error, size; 1504 uint64_t regop, memop, op1, op2, rflags, rflags2; 1505 enum vm_reg_name reg; 1506 1507 size = vie->opsize; 1508 switch (vie->op.op_byte) { 1509 case 0x39: 1510 case 0x3B: 1511 /* 1512 * 39/r CMP r/m16, r16 1513 * 39/r CMP r/m32, r32 1514 * REX.W 39/r CMP r/m64, r64 1515 * 1516 * 3B/r CMP r16, r/m16 1517 * 3B/r CMP r32, r/m32 1518 * REX.W + 3B/r CMP r64, r/m64 1519 * 1520 * Compare the first operand with the second operand and 1521 * set status flags in EFLAGS register. The comparison is 1522 * performed by subtracting the second operand from the first 1523 * operand and then setting the status flags. 
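		 *
		 * For example, 3B/r with the register operand equal to 2 and
		 * the memory operand equal to 1 computes 2 - 1, leaving ZF
		 * and CF clear; neither operand is modified.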
		 */

		/* Get the register operand */
		reg = gpr_map[vie->reg];
		error = vm_get_register(vm, vcpuid, reg, &regop);
		if (error)
			return (error);

		/* Get the memory operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &memop, size);
		if (error)
			return (error);

		if (vie->op.op_byte == 0x3B) {
			op1 = regop;
			op2 = memop;
		} else {
			op1 = memop;
			op2 = regop;
		}
		rflags2 = getcc(size, op1, op2);
		break;
	case 0x80:
	case 0x81:
	case 0x83:
		/*
		 * 80 /7		cmp r/m8, imm8
		 * REX + 80 /7		cmp r/m8, imm8
		 *
		 * 81 /7		cmp r/m16, imm16
		 * 81 /7		cmp r/m32, imm32
		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
		 *
		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
		 *
		 * Compare mem (ModRM:r/m) with immediate and set
		 * status flags according to the results.  The
		 * comparison is performed by subtracting the
		 * immediate from the first operand and then setting
		 * the status flags.
		 */
		if (vie->op.op_byte == 0x80)
			size = 1;

		/* get the first operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
		if (error)
			return (error);

		rflags2 = getcc(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & RFLAGS_STATUS_BITS;

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
vie_emulate_test(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	uint64_t op1, rflags, rflags2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xF6:
		/*
		 * F6 /0		test r/m8, imm8
		 *
		 * Test mem (ModRM:r/m) with immediate and set status
		 * flags according to the results.  The comparison is
		 * performed by ANDing the immediate with the first
		 * operand and then setting the status flags.
		 */
		if ((vie->reg & 7) != 0)
			return (EINVAL);

		size = 1;	/* override for byte operation */

		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
		if (error)
			return (error);

		rflags2 = getandflags(size, op1, vie->immediate);
		break;
	case 0xF7:
		/*
		 * F7 /0		test r/m16, imm16
		 * F7 /0		test r/m32, imm32
		 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
		 *
		 * Test mem (ModRM:r/m) with immediate and set status
		 * flags according to the results.  The comparison is
		 * performed by ANDing the immediate with the first
		 * operand and then setting the status flags.
		 */
		if ((vie->reg & 7) != 0)
			return (EINVAL);

		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
		if (error)
			return (error);

		rflags2 = getandflags(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
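	 *
	 * For example, F6 /0 with imm8 == 0x01 against a memory byte of 0x02
	 * yields 0x02 & 0x01 == 0, so ZF is set while CF and OF are cleared;
	 * the memory operand itself is left untouched.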
1651 */ 1652 rflags &= ~RFLAGS_STATUS_BITS; 1653 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); 1654 1655 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); 1656 return (error); 1657 } 1658 1659 static int 1660 vie_emulate_bextr(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1661 { 1662 uint64_t src1, src2, dst, rflags; 1663 unsigned start, len; 1664 int error, size; 1665 struct vm_guest_paging *paging; 1666 1667 size = vie->opsize; 1668 error = EINVAL; 1669 paging = &vie->paging; 1670 1671 /* 1672 * VEX.LZ.0F38.W0 F7 /r BEXTR r32a, r/m32, r32b 1673 * VEX.LZ.0F38.W1 F7 /r BEXTR r64a, r/m64, r64b 1674 * 1675 * Destination operand is ModRM:reg. Source operands are ModRM:r/m and 1676 * Vex.vvvv. 1677 * 1678 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored). 1679 */ 1680 if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT) 1681 size = 4; 1682 1683 /* 1684 * Extracts contiguous bits from the first /source/ operand (second 1685 * operand) using an index and length specified in the second /source/ 1686 * operand (third operand). 1687 */ 1688 error = vie_mmio_read(vie, vm, vcpuid, gpa, &src1, size); 1689 if (error) 1690 return (error); 1691 error = vm_get_register(vm, vcpuid, gpr_map[vie->vex_reg], &src2); 1692 if (error) 1693 return (error); 1694 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1695 if (error) 1696 return (error); 1697 1698 start = (src2 & 0xff); 1699 len = (src2 & 0xff00) >> 8; 1700 1701 /* If no bits are extracted, the destination register is cleared. */ 1702 dst = 0; 1703 1704 /* If START exceeds the operand size, no bits are extracted. */ 1705 if (start > size * 8) 1706 goto done; 1707 /* Length is bounded by both the destination size and start offset. */ 1708 if (start + len > size * 8) 1709 len = (size * 8) - start; 1710 if (len == 0) 1711 goto done; 1712 1713 if (start > 0) 1714 src1 = (src1 >> start); 1715 if (len < 64) 1716 src1 = src1 & ((1ull << len) - 1); 1717 dst = src1; 1718 1719 done: 1720 error = vie_update_register(vm, vcpuid, gpr_map[vie->reg], dst, size); 1721 if (error) 1722 return (error); 1723 1724 /* 1725 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result. 1726 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared. 
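	 *
	 * For example, a control operand (VEX.vvvv register) of 0x0804
	 * selects start 4 and length 8, so a source of 0xabcd extracts
	 * 0xbc and leaves ZF clear.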
1727 */ 1728 rflags &= ~RFLAGS_STATUS_BITS; 1729 if (dst == 0) 1730 rflags |= PSL_Z; 1731 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 1732 8); 1733 return (error); 1734 } 1735 1736 static int 1737 vie_emulate_add(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1738 { 1739 int error, size; 1740 uint64_t nval, rflags, rflags2, val1, val2; 1741 enum vm_reg_name reg; 1742 1743 size = vie->opsize; 1744 error = EINVAL; 1745 1746 switch (vie->op.op_byte) { 1747 case 0x03: 1748 /* 1749 * ADD r/m to r and store the result in r 1750 * 1751 * 03/r ADD r16, r/m16 1752 * 03/r ADD r32, r/m32 1753 * REX.W + 03/r ADD r64, r/m64 1754 */ 1755 1756 /* get the first operand */ 1757 reg = gpr_map[vie->reg]; 1758 error = vm_get_register(vm, vcpuid, reg, &val1); 1759 if (error) 1760 break; 1761 1762 /* get the second operand */ 1763 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); 1764 if (error) 1765 break; 1766 1767 /* perform the operation and write the result */ 1768 nval = val1 + val2; 1769 error = vie_update_register(vm, vcpuid, reg, nval, size); 1770 break; 1771 default: 1772 break; 1773 } 1774 1775 if (!error) { 1776 rflags2 = getaddflags(size, val1, val2); 1777 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 1778 &rflags); 1779 if (error) 1780 return (error); 1781 1782 rflags &= ~RFLAGS_STATUS_BITS; 1783 rflags |= rflags2 & RFLAGS_STATUS_BITS; 1784 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 1785 rflags, 8); 1786 } 1787 1788 return (error); 1789 } 1790 1791 static int 1792 vie_emulate_sub(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1793 { 1794 int error, size; 1795 uint64_t nval, rflags, rflags2, val1, val2; 1796 enum vm_reg_name reg; 1797 1798 size = vie->opsize; 1799 error = EINVAL; 1800 1801 switch (vie->op.op_byte) { 1802 case 0x2B: 1803 /* 1804 * SUB r/m from r and store the result in r 1805 * 1806 * 2B/r SUB r16, r/m16 1807 * 2B/r SUB r32, r/m32 1808 * REX.W + 2B/r SUB r64, r/m64 1809 */ 1810 1811 /* get the first operand */ 1812 reg = gpr_map[vie->reg]; 1813 error = vm_get_register(vm, vcpuid, reg, &val1); 1814 if (error) 1815 break; 1816 1817 /* get the second operand */ 1818 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); 1819 if (error) 1820 break; 1821 1822 /* perform the operation and write the result */ 1823 nval = val1 - val2; 1824 error = vie_update_register(vm, vcpuid, reg, nval, size); 1825 break; 1826 default: 1827 break; 1828 } 1829 1830 if (!error) { 1831 rflags2 = getcc(size, val1, val2); 1832 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 1833 &rflags); 1834 if (error) 1835 return (error); 1836 1837 rflags &= ~RFLAGS_STATUS_BITS; 1838 rflags |= rflags2 & RFLAGS_STATUS_BITS; 1839 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 1840 rflags, 8); 1841 } 1842 1843 return (error); 1844 } 1845 1846 static int 1847 vie_emulate_stack_op(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1848 { 1849 struct vm_copyinfo copyinfo[2]; 1850 struct seg_desc ss_desc; 1851 uint64_t cr0, rflags, rsp, stack_gla, val; 1852 int error, fault, size, stackaddrsize, pushop; 1853 struct vm_guest_paging *paging; 1854 1855 val = 0; 1856 size = vie->opsize; 1857 pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 
1 : 0; 1858 paging = &vie->paging; 1859 1860 /* 1861 * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 1862 */ 1863 if (paging->cpu_mode == CPU_MODE_REAL) { 1864 stackaddrsize = 2; 1865 } else if (paging->cpu_mode == CPU_MODE_64BIT) { 1866 /* 1867 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 1868 * - Stack pointer size is always 64-bits. 1869 * - PUSH/POP of 32-bit values is not possible in 64-bit mode. 1870 * - 16-bit PUSH/POP is supported by using the operand size 1871 * override prefix (66H). 1872 */ 1873 stackaddrsize = 8; 1874 size = vie->opsize_override ? 2 : 8; 1875 } else { 1876 /* 1877 * In protected or compatibility mode the 'B' flag in the 1878 * stack-segment descriptor determines the size of the 1879 * stack pointer. 1880 */ 1881 error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc); 1882 KASSERT(error == 0, ("%s: error %d getting SS descriptor", 1883 __func__, error)); 1884 if (SEG_DESC_DEF32(ss_desc.access)) 1885 stackaddrsize = 4; 1886 else 1887 stackaddrsize = 2; 1888 } 1889 1890 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); 1891 KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); 1892 1893 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1894 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 1895 1896 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); 1897 KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); 1898 if (pushop) { 1899 rsp -= size; 1900 } 1901 1902 if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, 1903 rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ, 1904 &stack_gla)) { 1905 vm_inject_ss(vm, vcpuid, 0); 1906 return (0); 1907 } 1908 1909 if (vie_canonical_check(paging->cpu_mode, stack_gla)) { 1910 vm_inject_ss(vm, vcpuid, 0); 1911 return (0); 1912 } 1913 1914 if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { 1915 vm_inject_ac(vm, vcpuid, 0); 1916 return (0); 1917 } 1918 1919 error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, 1920 pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo), 1921 &fault); 1922 if (error || fault) 1923 return (error); 1924 1925 if (pushop) { 1926 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size); 1927 if (error == 0) 1928 vm_copyout(vm, vcpuid, &val, copyinfo, size); 1929 } else { 1930 vm_copyin(vm, vcpuid, copyinfo, &val, size); 1931 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); 1932 rsp += size; 1933 } 1934 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 1935 1936 if (error == 0) { 1937 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp, 1938 stackaddrsize); 1939 KASSERT(error == 0, ("error %d updating rsp", error)); 1940 } 1941 return (error); 1942 } 1943 1944 static int 1945 vie_emulate_push(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1946 { 1947 int error; 1948 1949 /* 1950 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. 1951 * 1952 * PUSH is part of the group 5 extended opcodes and is identified 1953 * by ModRM:reg = b110. 1954 */ 1955 if ((vie->reg & 7) != 6) 1956 return (EINVAL); 1957 1958 error = vie_emulate_stack_op(vie, vm, vcpuid, gpa); 1959 return (error); 1960 } 1961 1962 static int 1963 vie_emulate_pop(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1964 { 1965 int error; 1966 1967 /* 1968 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. 1969 * 1970 * POP is part of the group 1A extended opcodes and is identified 1971 * by ModRM:reg = b000. 
1972 */ 1973 if ((vie->reg & 7) != 0) 1974 return (EINVAL); 1975 1976 error = vie_emulate_stack_op(vie, vm, vcpuid, gpa); 1977 return (error); 1978 } 1979 1980 static int 1981 vie_emulate_group1(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1982 { 1983 int error; 1984 1985 switch (vie->reg & 7) { 1986 case 0x1: /* OR */ 1987 error = vie_emulate_or(vie, vm, vcpuid, gpa); 1988 break; 1989 case 0x4: /* AND */ 1990 error = vie_emulate_and(vie, vm, vcpuid, gpa); 1991 break; 1992 case 0x7: /* CMP */ 1993 error = vie_emulate_cmp(vie, vm, vcpuid, gpa); 1994 break; 1995 default: 1996 error = EINVAL; 1997 break; 1998 } 1999 2000 return (error); 2001 } 2002 2003 static int 2004 vie_emulate_bittest(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 2005 { 2006 uint64_t val, rflags; 2007 int error, bitmask, bitoff; 2008 2009 /* 2010 * 0F BA is a Group 8 extended opcode. 2011 * 2012 * Currently we only emulate the 'Bit Test' instruction which is 2013 * identified by a ModR/M:reg encoding of 100b. 2014 */ 2015 if ((vie->reg & 7) != 4) 2016 return (EINVAL); 2017 2018 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 2019 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 2020 2021 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, vie->opsize); 2022 if (error) 2023 return (error); 2024 2025 /* 2026 * Intel SDM, Vol 2, Table 3-2: 2027 * "Range of Bit Positions Specified by Bit Offset Operands" 2028 */ 2029 bitmask = vie->opsize * 8 - 1; 2030 bitoff = vie->immediate & bitmask; 2031 2032 /* Copy the bit into the Carry flag in %rflags */ 2033 if (val & (1UL << bitoff)) 2034 rflags |= PSL_C; 2035 else 2036 rflags &= ~PSL_C; 2037 2038 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); 2039 KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error)); 2040 2041 return (0); 2042 } 2043 2044 static int 2045 vie_emulate_twob_group15(struct vie *vie, struct vm *vm, int vcpuid, 2046 uint64_t gpa) 2047 { 2048 int error; 2049 uint64_t buf; 2050 2051 switch (vie->reg & 7) { 2052 case 0x7: /* CLFLUSH, CLFLUSHOPT, and SFENCE */ 2053 if (vie->mod == 0x3) { 2054 /* 2055 * SFENCE. Ignore it, VM exit provides enough 2056 * barriers on its own. 2057 */ 2058 error = 0; 2059 } else { 2060 /* 2061 * CLFLUSH, CLFLUSHOPT. Only check for access 2062 * rights. 
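			 * The 1-byte vie_mmio_read() below is issued only to
			 * surface any access fault; the value read into 'buf'
			 * is discarded and no cache flush is performed on the
			 * emulated range.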
2063 */ 2064 error = vie_mmio_read(vie, vm, vcpuid, gpa, &buf, 1); 2065 } 2066 break; 2067 default: 2068 error = EINVAL; 2069 break; 2070 } 2071 2072 return (error); 2073 } 2074 2075 static int 2076 vie_emulate_clts(struct vie *vie, struct vm *vm, int vcpuid) 2077 { 2078 uint64_t val; 2079 int error; 2080 2081 if (vie->paging.cpl != 0) { 2082 vm_inject_gp(vm, vcpuid); 2083 vie->num_processed = 0; 2084 return (0); 2085 } 2086 2087 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &val); 2088 ASSERT(error == 0); 2089 2090 /* Clear %cr0.TS */ 2091 val &= ~CR0_TS; 2092 2093 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, val); 2094 ASSERT(error == 0); 2095 2096 return (0); 2097 } 2098 2099 static int 2100 vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa, 2101 uint64_t *rval, int bytes) 2102 { 2103 int err; 2104 2105 if (vie->mmio_req_read.state == VR_DONE) { 2106 ASSERT(vie->mmio_req_read.bytes == bytes); 2107 ASSERT(vie->mmio_req_read.gpa == gpa); 2108 2109 *rval = vie->mmio_req_read.data; 2110 return (0); 2111 } 2112 2113 err = vm_service_mmio_read(vm, cpuid, gpa, rval, bytes); 2114 if (err == 0) { 2115 /* 2116 * A successful read from an in-kernel-emulated device may come 2117 * with side effects, so stash the result in case it's used for 2118 * an instruction which subsequently needs to issue an MMIO 2119 * write to userspace. 2120 */ 2121 ASSERT(vie->mmio_req_read.state == VR_NONE); 2122 2123 vie->mmio_req_read.bytes = bytes; 2124 vie->mmio_req_read.gpa = gpa; 2125 vie->mmio_req_read.data = *rval; 2126 vie->mmio_req_read.state = VR_DONE; 2127 2128 } else if (err == ESRCH) { 2129 /* Hope that userspace emulation can fulfill this read */ 2130 vie->mmio_req_read.bytes = bytes; 2131 vie->mmio_req_read.gpa = gpa; 2132 vie->mmio_req_read.state = VR_PENDING; 2133 vie->status |= VIES_PENDING_MMIO; 2134 } else if (err < 0) { 2135 /* 2136 * The MMIO read failed in such a way that fallback to handling 2137 * in userspace is required. 2138 */ 2139 vie->status |= VIES_USER_FALLBACK; 2140 } 2141 return (err); 2142 } 2143 2144 static int 2145 vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa, 2146 uint64_t wval, int bytes) 2147 { 2148 int err; 2149 2150 if (vie->mmio_req_write.state == VR_DONE) { 2151 ASSERT(vie->mmio_req_write.bytes == bytes); 2152 ASSERT(vie->mmio_req_write.gpa == gpa); 2153 2154 return (0); 2155 } 2156 2157 err = vm_service_mmio_write(vm, cpuid, gpa, wval, bytes); 2158 if (err == 0) { 2159 /* 2160 * A successful write to an in-kernel-emulated device probably 2161 * results in side effects, so stash the fact that such a write 2162 * succeeded in case the operation requires other work. 2163 */ 2164 vie->mmio_req_write.bytes = bytes; 2165 vie->mmio_req_write.gpa = gpa; 2166 vie->mmio_req_write.data = wval; 2167 vie->mmio_req_write.state = VR_DONE; 2168 } else if (err == ESRCH) { 2169 /* Hope that userspace emulation can fulfill this write */ 2170 vie->mmio_req_write.bytes = bytes; 2171 vie->mmio_req_write.gpa = gpa; 2172 vie->mmio_req_write.data = wval; 2173 vie->mmio_req_write.state = VR_PENDING; 2174 vie->status |= VIES_PENDING_MMIO; 2175 } else if (err < 0) { 2176 /* 2177 * The MMIO write failed in such a way that fallback to handling 2178 * in userspace is required. 
2179 */ 2180 vie->status |= VIES_USER_FALLBACK; 2181 } 2182 return (err); 2183 } 2184 2185 int 2186 vie_emulate_mmio(struct vie *vie, struct vm *vm, int vcpuid) 2187 { 2188 int error; 2189 uint64_t gpa; 2190 2191 if ((vie->status & (VIES_INST_DECODE | VIES_MMIO)) != 2192 (VIES_INST_DECODE | VIES_MMIO)) { 2193 return (EINVAL); 2194 } 2195 2196 gpa = vie->mmio_gpa; 2197 2198 switch (vie->op.op_type) { 2199 case VIE_OP_TYPE_GROUP1: 2200 error = vie_emulate_group1(vie, vm, vcpuid, gpa); 2201 break; 2202 case VIE_OP_TYPE_POP: 2203 error = vie_emulate_pop(vie, vm, vcpuid, gpa); 2204 break; 2205 case VIE_OP_TYPE_PUSH: 2206 error = vie_emulate_push(vie, vm, vcpuid, gpa); 2207 break; 2208 case VIE_OP_TYPE_CMP: 2209 error = vie_emulate_cmp(vie, vm, vcpuid, gpa); 2210 break; 2211 case VIE_OP_TYPE_MOV: 2212 error = vie_emulate_mov(vie, vm, vcpuid, gpa); 2213 break; 2214 case VIE_OP_TYPE_MOVSX: 2215 case VIE_OP_TYPE_MOVZX: 2216 error = vie_emulate_movx(vie, vm, vcpuid, gpa); 2217 break; 2218 case VIE_OP_TYPE_MOVS: 2219 error = vie_emulate_movs(vie, vm, vcpuid, gpa); 2220 break; 2221 case VIE_OP_TYPE_STOS: 2222 error = vie_emulate_stos(vie, vm, vcpuid, gpa); 2223 break; 2224 case VIE_OP_TYPE_AND: 2225 error = vie_emulate_and(vie, vm, vcpuid, gpa); 2226 break; 2227 case VIE_OP_TYPE_OR: 2228 error = vie_emulate_or(vie, vm, vcpuid, gpa); 2229 break; 2230 case VIE_OP_TYPE_SUB: 2231 error = vie_emulate_sub(vie, vm, vcpuid, gpa); 2232 break; 2233 case VIE_OP_TYPE_BITTEST: 2234 error = vie_emulate_bittest(vie, vm, vcpuid, gpa); 2235 break; 2236 case VIE_OP_TYPE_TWOB_GRP15: 2237 error = vie_emulate_twob_group15(vie, vm, vcpuid, gpa); 2238 break; 2239 case VIE_OP_TYPE_ADD: 2240 error = vie_emulate_add(vie, vm, vcpuid, gpa); 2241 break; 2242 case VIE_OP_TYPE_TEST: 2243 error = vie_emulate_test(vie, vm, vcpuid, gpa); 2244 break; 2245 case VIE_OP_TYPE_BEXTR: 2246 error = vie_emulate_bextr(vie, vm, vcpuid, gpa); 2247 break; 2248 default: 2249 error = EINVAL; 2250 break; 2251 } 2252 2253 if (error == ESRCH) { 2254 /* Return to userspace with the mmio request */ 2255 return (-1); 2256 } 2257 2258 return (error); 2259 } 2260 2261 static int 2262 vie_emulate_inout_port(struct vie *vie, struct vm *vm, int vcpuid, 2263 uint32_t *eax) 2264 { 2265 uint32_t mask, val; 2266 bool in; 2267 int err; 2268 2269 mask = vie_size2mask(vie->inout.bytes); 2270 in = (vie->inout.flags & INOUT_IN) != 0; 2271 2272 if (!in) { 2273 val = *eax & mask; 2274 } 2275 2276 if (vie->inout_req_state != VR_DONE) { 2277 err = vm_ioport_access(vm, vcpuid, in, vie->inout.port, 2278 vie->inout.bytes, &val); 2279 val &= mask; 2280 } else { 2281 /* 2282 * This port access was handled in userspace and the result was 2283 * injected in to be handled now. 
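 *
 * The userspace result arrives via vie_fulfill_inout(), which stores
 * it in 'inout_req_val' and marks 'inout_req_state' as VR_DONE.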
2284 */ 2285 val = vie->inout_req_val & mask; 2286 vie->inout_req_state = VR_NONE; 2287 err = 0; 2288 } 2289 2290 if (err == ESRCH) { 2291 vie->status |= VIES_PENDING_INOUT; 2292 vie->inout_req_state = VR_PENDING; 2293 return (err); 2294 } else if (err != 0) { 2295 return (err); 2296 } 2297 2298 if (in) { 2299 *eax = (*eax & ~mask) | val; 2300 } 2301 return (0); 2302 } 2303 2304 static enum vm_reg_name 2305 vie_inout_segname(const struct vie *vie) 2306 { 2307 uint8_t segidx = vie->inout.segment; 2308 const enum vm_reg_name segmap[] = { 2309 VM_REG_GUEST_ES, 2310 VM_REG_GUEST_CS, 2311 VM_REG_GUEST_SS, 2312 VM_REG_GUEST_DS, 2313 VM_REG_GUEST_FS, 2314 VM_REG_GUEST_GS, 2315 }; 2316 const uint8_t maxidx = (sizeof (segmap) / sizeof (segmap[0])); 2317 2318 if (segidx >= maxidx) { 2319 panic("unexpected segment index %u", segidx); 2320 } 2321 return (segmap[segidx]); 2322 } 2323 2324 static int 2325 vie_emulate_inout_str(struct vie *vie, struct vm *vm, int vcpuid) 2326 { 2327 uint8_t bytes, addrsize; 2328 uint64_t index, count = 0, gla, rflags; 2329 int prot, err, fault; 2330 bool in, repeat; 2331 enum vm_reg_name seg_reg, idx_reg; 2332 struct vm_copyinfo copyinfo[2]; 2333 2334 in = (vie->inout.flags & INOUT_IN) != 0; 2335 bytes = vie->inout.bytes; 2336 addrsize = vie->inout.addrsize; 2337 prot = in ? PROT_WRITE : PROT_READ; 2338 2339 ASSERT(bytes == 1 || bytes == 2 || bytes == 4); 2340 ASSERT(addrsize == 2 || addrsize == 4 || addrsize == 8); 2341 2342 idx_reg = (in) ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; 2343 seg_reg = vie_inout_segname(vie); 2344 err = vm_get_register(vm, vcpuid, idx_reg, &index); 2345 ASSERT(err == 0); 2346 index = index & vie_size2mask(addrsize); 2347 2348 repeat = (vie->inout.flags & INOUT_REP) != 0; 2349 2350 /* Count register */ 2351 if (repeat) { 2352 err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &count); 2353 count &= vie_size2mask(addrsize); 2354 2355 if (count == 0) { 2356 /* 2357 * If we were asked to emulate a REP INS/OUTS when the 2358 * count register is zero, no further work is required. 2359 */ 2360 return (0); 2361 } 2362 } else { 2363 count = 1; 2364 } 2365 2366 gla = 0; 2367 if (vie_get_gla(vie, vm, vcpuid, bytes, addrsize, prot, seg_reg, 2368 idx_reg, &gla) != 0) { 2369 /* vie_get_gla() already injected the appropriate fault */ 2370 return (0); 2371 } 2372 2373 /* 2374 * The INS/OUTS emulation currently assumes that the memory target resides 2375 * within the guest system memory, rather than a device MMIO region. If 2376 * such a case becomes a necessity, that additional handling could be 2377 * put in place.
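 *
 * The data transfer itself therefore goes through guest memory below:
 * the target is mapped with vm_copy_setup() and the bytes are moved
 * with vm_copyin()/vm_copyout() around the port access.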
2378 */ 2379 err = vm_copy_setup(vm, vcpuid, &vie->paging, gla, bytes, prot, 2380 copyinfo, nitems(copyinfo), &fault); 2381 2382 if (err) { 2383 /* Unrecoverable error */ 2384 return (err); 2385 } else if (fault) { 2386 /* Resume guest to handle fault */ 2387 return (0); 2388 } 2389 2390 if (!in) { 2391 vm_copyin(vm, vcpuid, copyinfo, &vie->inout.eax, bytes); 2392 } 2393 2394 err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax); 2395 2396 if (err == 0 && in) { 2397 vm_copyout(vm, vcpuid, &vie->inout.eax, copyinfo, bytes); 2398 } 2399 2400 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 2401 2402 if (err == 0) { 2403 err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2404 &rflags); 2405 ASSERT(err == 0); 2406 2407 /* Update index */ 2408 if (rflags & PSL_D) { 2409 index -= bytes; 2410 } else { 2411 index += bytes; 2412 } 2413 2414 /* Update index register */ 2415 err = vie_update_register(vm, vcpuid, idx_reg, index, addrsize); 2416 ASSERT(err == 0); 2417 2418 /* 2419 * Update count register only if the instruction had a repeat 2420 * prefix. 2421 */ 2422 if ((vie->inout.flags & INOUT_REP) != 0) { 2423 count--; 2424 err = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, 2425 count, addrsize); 2426 ASSERT(err == 0); 2427 2428 if (count != 0) { 2429 return (vie_repeat(vie)); 2430 } 2431 } 2432 } 2433 2434 return (err); 2435 } 2436 2437 int 2438 vie_emulate_inout(struct vie *vie, struct vm *vm, int vcpuid) 2439 { 2440 int err = 0; 2441 2442 if ((vie->status & VIES_INOUT) == 0) { 2443 return (EINVAL); 2444 } 2445 2446 if ((vie->inout.flags & INOUT_STR) == 0) { 2447 /* 2448 * For now, using the 'rep' prefixes with plain (non-string) 2449 * in/out is not supported. 2450 */ 2451 if ((vie->inout.flags & INOUT_REP) != 0) { 2452 return (EINVAL); 2453 } 2454 2455 err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax); 2456 if (err == 0 && (vie->inout.flags & INOUT_IN) != 0) { 2457 /* 2458 * With the inX access now a success, the result needs 2459 * to be stored in the guest %rax. 2460 */ 2461 err = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, 2462 vie->inout.eax); 2463 VERIFY0(err); 2464 } 2465 } else { 2466 vie->status &= ~VIES_REPEAT; 2467 err = vie_emulate_inout_str(vie, vm, vcpuid); 2468 2469 } 2470 if (err < 0) { 2471 /* 2472 * Access to an I/O port failed in such a way that fallback to 2473 * handling in userspace is required. 
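 *
 * This mirrors the MMIO convention: a negative error selects full
 * userspace fallback, while ESRCH (checked below) means the in/out
 * request itself has been punted to userspace for fulfillment.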
2474 */ 2475 vie->status |= VIES_USER_FALLBACK; 2476 } else if (err == ESRCH) { 2477 ASSERT(vie->status & VIES_PENDING_INOUT); 2478 /* Return to userspace with the in/out request */ 2479 err = -1; 2480 } 2481 2482 return (err); 2483 } 2484 2485 int 2486 vie_emulate_other(struct vie *vie, struct vm *vm, int vcpuid) 2487 { 2488 int error; 2489 2490 if ((vie->status & (VIES_INST_DECODE | VIES_OTHER)) != 2491 (VIES_INST_DECODE | VIES_OTHER)) { 2492 return (EINVAL); 2493 } 2494 2495 switch (vie->op.op_type) { 2496 case VIE_OP_TYPE_CLTS: 2497 error = vie_emulate_clts(vie, vm, vcpuid); 2498 break; 2499 case VIE_OP_TYPE_MOV_CR: 2500 error = vie_emulate_mov_cr(vie, vm, vcpuid); 2501 break; 2502 default: 2503 error = EINVAL; 2504 break; 2505 } 2506 2507 return (error); 2508 } 2509 2510 void 2511 vie_reset(struct vie *vie) 2512 { 2513 vie->status = 0; 2514 vie->num_processed = vie->num_valid = 0; 2515 } 2516 2517 void 2518 vie_advance_pc(struct vie *vie, uint64_t *nextrip) 2519 { 2520 VERIFY((vie->status & VIES_REPEAT) == 0); 2521 2522 *nextrip += vie->num_processed; 2523 vie_reset(vie); 2524 } 2525 2526 void 2527 vie_exitinfo(const struct vie *vie, struct vm_exit *vme) 2528 { 2529 if (vie->status & VIES_USER_FALLBACK) { 2530 /* 2531 * Despite the fact that the instruction was successfully 2532 * decoded, some aspect of the emulation failed in such a way 2533 * that it is left up to userspace to complete the operation. 2534 */ 2535 vie_fallback_exitinfo(vie, vme); 2536 } else if (vie->status & VIES_MMIO) { 2537 vme->exitcode = VM_EXITCODE_MMIO; 2538 if (vie->mmio_req_read.state == VR_PENDING) { 2539 vme->u.mmio.gpa = vie->mmio_req_read.gpa; 2540 vme->u.mmio.data = 0; 2541 vme->u.mmio.bytes = vie->mmio_req_read.bytes; 2542 vme->u.mmio.read = 1; 2543 } else if (vie->mmio_req_write.state == VR_PENDING) { 2544 vme->u.mmio.gpa = vie->mmio_req_write.gpa; 2545 vme->u.mmio.data = vie->mmio_req_write.data & 2546 vie_size2mask(vie->mmio_req_write.bytes); 2547 vme->u.mmio.bytes = vie->mmio_req_write.bytes; 2548 vme->u.mmio.read = 0; 2549 } else { 2550 panic("bad pending MMIO state"); 2551 } 2552 } else if (vie->status & VIES_INOUT) { 2553 vme->exitcode = VM_EXITCODE_INOUT; 2554 vme->u.inout.port = vie->inout.port; 2555 vme->u.inout.bytes = vie->inout.bytes; 2556 if ((vie->inout.flags & INOUT_IN) != 0) { 2557 vme->u.inout.flags = INOUT_IN; 2558 vme->u.inout.eax = 0; 2559 } else { 2560 vme->u.inout.flags = 0; 2561 vme->u.inout.eax = vie->inout.eax & 2562 vie_size2mask(vie->inout.bytes); 2563 } 2564 } else { 2565 panic("no pending operation"); 2566 } 2567 } 2568 2569 /* 2570 * In the case of a decoding or verification failure, bailing out to userspace 2571 * to do the instruction emulation is our only option for now. 
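 *
 * Any instruction bytes which were already fetched are passed along
 * in the VM_EXITCODE_INST_EMUL exit so that userspace does not have
 * to re-fetch them.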
2572 */ 2573 void 2574 vie_fallback_exitinfo(const struct vie *vie, struct vm_exit *vme) 2575 { 2576 if ((vie->status & VIES_INST_FETCH) == 0) { 2577 bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul)); 2578 } else { 2579 ASSERT(sizeof (vie->inst) == sizeof (vme->u.inst_emul.inst)); 2580 2581 bcopy(vie->inst, vme->u.inst_emul.inst, sizeof (vie->inst)); 2582 vme->u.inst_emul.num_valid = vie->num_valid; 2583 } 2584 vme->exitcode = VM_EXITCODE_INST_EMUL; 2585 } 2586 2587 void 2588 vie_cs_info(const struct vie *vie, struct vm *vm, int vcpuid, uint64_t *cs_base, 2589 int *cs_d) 2590 { 2591 struct seg_desc cs_desc; 2592 int error; 2593 2594 error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &cs_desc); 2595 ASSERT(error == 0); 2596 2597 /* Initialization required for the paging info to be populated */ 2598 VERIFY(vie->status & VIES_INIT); 2599 switch (vie->paging.cpu_mode) { 2600 case CPU_MODE_REAL: 2601 *cs_base = cs_desc.base; 2602 *cs_d = 0; 2603 break; 2604 case CPU_MODE_PROTECTED: 2605 case CPU_MODE_COMPATIBILITY: 2606 *cs_base = cs_desc.base; 2607 *cs_d = SEG_DESC_DEF32(cs_desc.access) ? 1 : 0; 2608 break; 2609 default: 2610 *cs_base = 0; 2611 *cs_d = 0; 2612 break; 2613 } 2614 } 2615 2616 bool 2617 vie_pending(const struct vie *vie) 2618 { 2619 /* 2620 * These VIE status bits indicate conditions which must be addressed 2621 * through either device IO fulfillment (with corresponding 2622 * vie_fulfill_*()) or complete userspace emulation (followed by a 2623 * vie_reset()). 2624 */ 2625 const enum vie_status of_interest = 2626 VIES_PENDING_MMIO | VIES_PENDING_INOUT | VIES_USER_FALLBACK; 2627 2628 return ((vie->status & of_interest) != 0); 2629 } 2630 2631 bool 2632 vie_needs_fetch(const struct vie *vie) 2633 { 2634 if (vie->status & VIES_INST_FETCH) { 2635 ASSERT(vie->num_valid != 0); 2636 return (false); 2637 } 2638 return (true); 2639 } 2640 2641 static int 2642 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) 2643 { 2644 KASSERT(size == 1 || size == 2 || size == 4 || size == 8, 2645 ("%s: invalid size %d", __func__, size)); 2646 KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); 2647 2648 if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) 2649 return (0); 2650 2651 return ((gla & (size - 1)) ? 1 : 0); 2652 } 2653 2654 static int 2655 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) 2656 { 2657 uint64_t mask; 2658 2659 if (cpu_mode != CPU_MODE_64BIT) 2660 return (0); 2661 2662 /* 2663 * The value of the bit 47 in the 'gla' should be replicated in the 2664 * most significant 16 bits. 
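 *
 * For example, 0xffff800000000000 is canonical (bits 63:48 match
 * bit 47) while 0x0000800000000000 is not.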
2665 */ 2666 mask = ~((1UL << 48) - 1); 2667 if (gla & (1UL << 47)) 2668 return ((gla & mask) != mask); 2669 else 2670 return ((gla & mask) != 0); 2671 } 2672 2673 static uint64_t 2674 vie_size2mask(int size) 2675 { 2676 KASSERT(size == 1 || size == 2 || size == 4 || size == 8, 2677 ("vie_size2mask: invalid size %d", size)); 2678 return (size2mask[size]); 2679 } 2680 2681 static int 2682 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, 2683 struct seg_desc *desc, uint64_t offset, int length, int addrsize, 2684 int prot, uint64_t *gla) 2685 { 2686 uint64_t firstoff, low_limit, high_limit, segbase; 2687 int glasize, type; 2688 2689 KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, 2690 ("%s: invalid segment %d", __func__, seg)); 2691 KASSERT(length == 1 || length == 2 || length == 4 || length == 8, 2692 ("%s: invalid operand size %d", __func__, length)); 2693 KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, 2694 ("%s: invalid prot %x", __func__, prot)); 2695 2696 firstoff = offset; 2697 if (cpu_mode == CPU_MODE_64BIT) { 2698 KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " 2699 "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); 2700 glasize = 8; 2701 } else { 2702 KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " 2703 "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); 2704 glasize = 4; 2705 /* 2706 * If the segment selector is loaded with a NULL selector 2707 * then the descriptor is unusable and attempting to use 2708 * it results in a #GP(0). 2709 */ 2710 if (SEG_DESC_UNUSABLE(desc->access)) 2711 return (-1); 2712 2713 /* 2714 * The processor generates a #NP exception when a segment 2715 * register is loaded with a selector that points to a 2716 * descriptor that is not present. If this was the case then 2717 * it would have been checked before the VM-exit. 2718 */ 2719 KASSERT(SEG_DESC_PRESENT(desc->access), 2720 ("segment %d not present: %x", seg, desc->access)); 2721 2722 /* 2723 * The descriptor type must indicate a code/data segment. 2724 */ 2725 type = SEG_DESC_TYPE(desc->access); 2726 KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " 2727 "descriptor type %x", seg, type)); 2728 2729 if (prot & PROT_READ) { 2730 /* #GP on a read access to an exec-only code segment */ 2731 if ((type & 0xA) == 0x8) 2732 return (-1); 2733 } 2734 2735 if (prot & PROT_WRITE) { 2736 /* 2737 * #GP on a write access to a code segment or a 2738 * read-only data segment. 2739 */ 2740 if (type & 0x8) /* code segment */ 2741 return (-1); 2742 2743 if ((type & 0xA) == 0) /* read-only data seg */ 2744 return (-1); 2745 } 2746 2747 /* 2748 * 'desc->limit' is fully expanded taking granularity into 2749 * account. 2750 */ 2751 if ((type & 0xC) == 0x4) { 2752 /* expand-down data segment */ 2753 low_limit = desc->limit + 1; 2754 high_limit = SEG_DESC_DEF32(desc->access) ? 2755 0xffffffff : 0xffff; 2756 } else { 2757 /* code segment or expand-up data segment */ 2758 low_limit = 0; 2759 high_limit = desc->limit; 2760 } 2761 2762 while (length > 0) { 2763 offset &= vie_size2mask(addrsize); 2764 if (offset < low_limit || offset > high_limit) 2765 return (-1); 2766 offset++; 2767 length--; 2768 } 2769 } 2770 2771 /* 2772 * In 64-bit mode all segments except %fs and %gs have a segment 2773 * base address of 0.
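 *
 * For %fs and %gs the base taken from the segment descriptor is still
 * added to the effective address below.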
2774 */ 2775 if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && 2776 seg != VM_REG_GUEST_GS) { 2777 segbase = 0; 2778 } else { 2779 segbase = desc->base; 2780 } 2781 2782 /* 2783 * Truncate 'firstoff' to the effective address size before adding 2784 * it to the segment base. 2785 */ 2786 firstoff &= vie_size2mask(addrsize); 2787 *gla = (segbase + firstoff) & vie_size2mask(glasize); 2788 return (0); 2789 } 2790 2791 void 2792 vie_init_mmio(struct vie *vie, const char *inst_bytes, uint8_t inst_length, 2793 const struct vm_guest_paging *paging, uint64_t gpa) 2794 { 2795 KASSERT(inst_length <= VIE_INST_SIZE, 2796 ("%s: invalid instruction length (%d)", __func__, inst_length)); 2797 2798 bzero(vie, sizeof (struct vie)); 2799 2800 vie->base_register = VM_REG_LAST; 2801 vie->index_register = VM_REG_LAST; 2802 vie->segment_register = VM_REG_LAST; 2803 vie->status = VIES_INIT | VIES_MMIO; 2804 2805 if (inst_length != 0) { 2806 bcopy(inst_bytes, vie->inst, inst_length); 2807 vie->num_valid = inst_length; 2808 vie->status |= VIES_INST_FETCH; 2809 } 2810 2811 vie->paging = *paging; 2812 vie->mmio_gpa = gpa; 2813 } 2814 2815 void 2816 vie_init_inout(struct vie *vie, const struct vm_inout *inout, uint8_t inst_len, 2817 const struct vm_guest_paging *paging) 2818 { 2819 bzero(vie, sizeof (struct vie)); 2820 2821 vie->status = VIES_INIT | VIES_INOUT; 2822 2823 vie->inout = *inout; 2824 vie->paging = *paging; 2825 2826 /* 2827 * Since VMX/SVM assists already decoded the nature of the in/out 2828 * instruction, let the status reflect that. 2829 */ 2830 vie->status |= VIES_INST_FETCH | VIES_INST_DECODE; 2831 vie->num_processed = inst_len; 2832 } 2833 2834 void 2835 vie_init_other(struct vie *vie, const struct vm_guest_paging *paging) 2836 { 2837 bzero(vie, sizeof (struct vie)); 2838 2839 vie->base_register = VM_REG_LAST; 2840 vie->index_register = VM_REG_LAST; 2841 vie->segment_register = VM_REG_LAST; 2842 vie->status = VIES_INIT | VIES_OTHER; 2843 2844 vie->paging = *paging; 2845 } 2846 2847 int 2848 vie_fulfill_mmio(struct vie *vie, const struct vm_mmio *result) 2849 { 2850 struct vie_mmio *pending; 2851 2852 if ((vie->status & VIES_MMIO) == 0 || 2853 (vie->status & VIES_PENDING_MMIO) == 0) { 2854 return (EINVAL); 2855 } 2856 2857 if (result->read) { 2858 pending = &vie->mmio_req_read; 2859 } else { 2860 pending = &vie->mmio_req_write; 2861 } 2862 2863 if (pending->state != VR_PENDING || 2864 pending->bytes != result->bytes || pending->gpa != result->gpa) { 2865 return (EINVAL); 2866 } 2867 2868 if (result->read) { 2869 pending->data = result->data & vie_size2mask(pending->bytes); 2870 } 2871 pending->state = VR_DONE; 2872 vie->status &= ~VIES_PENDING_MMIO; 2873 2874 return (0); 2875 } 2876 2877 int 2878 vie_fulfill_inout(struct vie *vie, const struct vm_inout *result) 2879 { 2880 if ((vie->status & VIES_INOUT) == 0 || 2881 (vie->status & VIES_PENDING_INOUT) == 0) { 2882 return (EINVAL); 2883 } 2884 if ((vie->inout.flags & INOUT_IN) != (result->flags & INOUT_IN) || 2885 vie->inout.bytes != result->bytes || 2886 vie->inout.port != result->port) { 2887 return (EINVAL); 2888 } 2889 2890 if (result->flags & INOUT_IN) { 2891 vie->inout_req_val = result->eax & 2892 vie_size2mask(vie->inout.bytes); 2893 } 2894 vie->inout_req_state = VR_DONE; 2895 vie->status &= ~(VIES_PENDING_INOUT); 2896 2897 return (0); 2898 } 2899 2900 uint64_t 2901 vie_mmio_gpa(const struct vie *vie) 2902 { 2903 return (vie->mmio_gpa); 2904 } 2905 2906 static int 2907 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) 2908 
{ 2909 int error_code = 0; 2910 2911 if (pte & PG_V) 2912 error_code |= PGEX_P; 2913 if (prot & PROT_WRITE) 2914 error_code |= PGEX_W; 2915 if (usermode) 2916 error_code |= PGEX_U; 2917 if (rsvd) 2918 error_code |= PGEX_RSV; 2919 if (prot & PROT_EXEC) 2920 error_code |= PGEX_I; 2921 2922 return (error_code); 2923 } 2924 2925 static void 2926 ptp_release(vm_page_t **vmp) 2927 { 2928 if (*vmp != NULL) { 2929 (void) vmp_release(*vmp); 2930 *vmp = NULL; 2931 } 2932 } 2933 2934 static void * 2935 ptp_hold(struct vm *vm, int vcpu, uintptr_t gpa, size_t len, vm_page_t **vmp) 2936 { 2937 vm_client_t *vmc = vm_get_vmclient(vm, vcpu); 2938 const uintptr_t hold_gpa = gpa & PAGEMASK; 2939 2940 /* Hold must not cross a page boundary */ 2941 VERIFY3U(gpa + len, <=, hold_gpa + PAGESIZE); 2942 2943 if (*vmp != NULL) { 2944 (void) vmp_release(*vmp); 2945 } 2946 2947 *vmp = vmc_hold(vmc, hold_gpa, PROT_READ | PROT_WRITE); 2948 if (*vmp == NULL) { 2949 return (NULL); 2950 } 2951 2952 return ((caddr_t)vmp_get_writable(*vmp) + (gpa - hold_gpa)); 2953 } 2954 2955 static int 2956 _vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 2957 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only) 2958 { 2959 int nlevels, pfcode; 2960 int ptpshift = 0, ptpindex = 0; 2961 uint64_t ptpphys; 2962 uint64_t *ptpbase = NULL, pte = 0, pgsize = 0; 2963 vm_page_t *cookie = NULL; 2964 const bool usermode = paging->cpl == 3; 2965 const bool writable = (prot & PROT_WRITE) != 0; 2966 2967 *guest_fault = 0; 2968 restart: 2969 ptpphys = paging->cr3; /* root of the page tables */ 2970 ptp_release(&cookie); 2971 2972 if (vie_canonical_check(paging->cpu_mode, gla)) { 2973 /* 2974 * XXX assuming a non-stack reference otherwise a stack fault 2975 * should be generated. 2976 */ 2977 if (!check_only) 2978 vm_inject_gp(vm, vcpuid); 2979 *guest_fault = 1; 2980 return (0); 2981 } 2982 2983 if (paging->paging_mode == PAGING_MODE_FLAT) { 2984 *gpa = gla; 2985 return (0); 2986 } 2987 2988 if (paging->paging_mode == PAGING_MODE_32) { 2989 uint32_t *ptpbase32, pte32; 2990 2991 nlevels = 2; 2992 while (--nlevels >= 0) { 2993 /* Zero out the lower 12 bits. */ 2994 ptpphys &= ~0xfff; 2995 2996 ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, 2997 &cookie); 2998 2999 if (ptpbase32 == NULL) { 3000 return (EFAULT); 3001 } 3002 3003 ptpshift = PAGE_SHIFT + nlevels * 10; 3004 ptpindex = (gla >> ptpshift) & 0x3FF; 3005 pgsize = 1UL << ptpshift; 3006 3007 pte32 = ptpbase32[ptpindex]; 3008 3009 if ((pte32 & PG_V) == 0 || 3010 (usermode && (pte32 & PG_U) == 0) || 3011 (writable && (pte32 & PG_RW) == 0)) { 3012 if (!check_only) { 3013 pfcode = pf_error_code(usermode, prot, 3014 0, pte32); 3015 vm_inject_pf(vm, vcpuid, pfcode, gla); 3016 } 3017 3018 ptp_release(&cookie); 3019 *guest_fault = 1; 3020 return (0); 3021 } 3022 3023 /* 3024 * Emulate the x86 MMU's management of the accessed 3025 * and dirty flags. While the accessed flag is set 3026 * at every level of the page table, the dirty flag 3027 * is only set at the last level providing the guest 3028 * physical address. 
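 *
 * The updates are performed with compare-and-swap and the walk is
 * restarted on failure, since the guest (or another vCPU) may be
 * modifying the same PTE concurrently.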
3029 */ 3030 if (!check_only && (pte32 & PG_A) == 0) { 3031 if (atomic_cmpset_32(&ptpbase32[ptpindex], 3032 pte32, pte32 | PG_A) == 0) { 3033 goto restart; 3034 } 3035 } 3036 3037 /* XXX must be ignored if CR4.PSE=0 */ 3038 if (nlevels > 0 && (pte32 & PG_PS) != 0) 3039 break; 3040 3041 ptpphys = pte32; 3042 } 3043 3044 /* Set the dirty bit in the page table entry if necessary */ 3045 if (!check_only && writable && (pte32 & PG_M) == 0) { 3046 if (atomic_cmpset_32(&ptpbase32[ptpindex], 3047 pte32, pte32 | PG_M) == 0) { 3048 goto restart; 3049 } 3050 } 3051 3052 /* Zero out the lower 'ptpshift' bits */ 3053 pte32 >>= ptpshift; pte32 <<= ptpshift; 3054 *gpa = pte32 | (gla & (pgsize - 1)); 3055 ptp_release(&cookie); 3056 return (0); 3057 } 3058 3059 if (paging->paging_mode == PAGING_MODE_PAE) { 3060 /* Zero out the lower 5 bits and the upper 32 bits */ 3061 ptpphys &= 0xffffffe0UL; 3062 3063 ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof (*ptpbase) * 4, 3064 &cookie); 3065 if (ptpbase == NULL) { 3066 return (EFAULT); 3067 } 3068 3069 ptpindex = (gla >> 30) & 0x3; 3070 3071 pte = ptpbase[ptpindex]; 3072 3073 if ((pte & PG_V) == 0) { 3074 if (!check_only) { 3075 pfcode = pf_error_code(usermode, prot, 0, pte); 3076 vm_inject_pf(vm, vcpuid, pfcode, gla); 3077 } 3078 3079 ptp_release(&cookie); 3080 *guest_fault = 1; 3081 return (0); 3082 } 3083 3084 ptpphys = pte; 3085 3086 nlevels = 2; 3087 } else { 3088 nlevels = 4; 3089 } 3090 3091 while (--nlevels >= 0) { 3092 /* Zero out the lower 12 bits and the upper 12 bits */ 3093 ptpphys &= 0x000ffffffffff000UL; 3094 3095 ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie); 3096 if (ptpbase == NULL) { 3097 return (EFAULT); 3098 } 3099 3100 ptpshift = PAGE_SHIFT + nlevels * 9; 3101 ptpindex = (gla >> ptpshift) & 0x1FF; 3102 pgsize = 1UL << ptpshift; 3103 3104 pte = ptpbase[ptpindex]; 3105 3106 if ((pte & PG_V) == 0 || 3107 (usermode && (pte & PG_U) == 0) || 3108 (writable && (pte & PG_RW) == 0)) { 3109 if (!check_only) { 3110 pfcode = pf_error_code(usermode, prot, 0, pte); 3111 vm_inject_pf(vm, vcpuid, pfcode, gla); 3112 } 3113 3114 ptp_release(&cookie); 3115 *guest_fault = 1; 3116 return (0); 3117 } 3118 3119 /* Set the accessed bit in the page table entry */ 3120 if (!check_only && (pte & PG_A) == 0) { 3121 if (atomic_cmpset_64(&ptpbase[ptpindex], 3122 pte, pte | PG_A) == 0) { 3123 goto restart; 3124 } 3125 } 3126 3127 if (nlevels > 0 && (pte & PG_PS) != 0) { 3128 if (pgsize > 1 * GB) { 3129 if (!check_only) { 3130 pfcode = pf_error_code(usermode, prot, 3131 1, pte); 3132 vm_inject_pf(vm, vcpuid, pfcode, gla); 3133 } 3134 3135 ptp_release(&cookie); 3136 *guest_fault = 1; 3137 return (0); 3138 } 3139 break; 3140 } 3141 3142 ptpphys = pte; 3143 } 3144 3145 /* Set the dirty bit in the page table entry if necessary */ 3146 if (!check_only && writable && (pte & PG_M) == 0) { 3147 if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) 3148 goto restart; 3149 } 3150 ptp_release(&cookie); 3151 3152 /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ 3153 pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; 3154 *gpa = pte | (gla & (pgsize - 1)); 3155 return (0); 3156 } 3157 3158 int 3159 vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3160 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) 3161 { 3162 3163 return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, 3164 false)); 3165 } 3166 3167 int 3168 vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3169 uint64_t 
gla, int prot, uint64_t *gpa, int *guest_fault) 3170 { 3171 3172 return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, 3173 true)); 3174 } 3175 3176 int 3177 vie_fetch_instruction(struct vie *vie, struct vm *vm, int vcpuid, uint64_t rip, 3178 int *faultptr) 3179 { 3180 struct vm_copyinfo copyinfo[2]; 3181 int error, prot; 3182 3183 if ((vie->status & VIES_INIT) == 0) { 3184 return (EINVAL); 3185 } 3186 3187 prot = PROT_READ | PROT_EXEC; 3188 error = vm_copy_setup(vm, vcpuid, &vie->paging, rip, VIE_INST_SIZE, 3189 prot, copyinfo, nitems(copyinfo), faultptr); 3190 if (error || *faultptr) 3191 return (error); 3192 3193 vm_copyin(vm, vcpuid, copyinfo, vie->inst, VIE_INST_SIZE); 3194 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 3195 vie->num_valid = VIE_INST_SIZE; 3196 vie->status |= VIES_INST_FETCH; 3197 return (0); 3198 } 3199 3200 static int 3201 vie_peek(struct vie *vie, uint8_t *x) 3202 { 3203 3204 if (vie->num_processed < vie->num_valid) { 3205 *x = vie->inst[vie->num_processed]; 3206 return (0); 3207 } else 3208 return (-1); 3209 } 3210 3211 static void 3212 vie_advance(struct vie *vie) 3213 { 3214 3215 vie->num_processed++; 3216 } 3217 3218 static bool 3219 segment_override(uint8_t x, int *seg) 3220 { 3221 3222 switch (x) { 3223 case 0x2E: 3224 *seg = VM_REG_GUEST_CS; 3225 break; 3226 case 0x36: 3227 *seg = VM_REG_GUEST_SS; 3228 break; 3229 case 0x3E: 3230 *seg = VM_REG_GUEST_DS; 3231 break; 3232 case 0x26: 3233 *seg = VM_REG_GUEST_ES; 3234 break; 3235 case 0x64: 3236 *seg = VM_REG_GUEST_FS; 3237 break; 3238 case 0x65: 3239 *seg = VM_REG_GUEST_GS; 3240 break; 3241 default: 3242 return (false); 3243 } 3244 return (true); 3245 } 3246 3247 static int 3248 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) 3249 { 3250 uint8_t x; 3251 3252 while (1) { 3253 if (vie_peek(vie, &x)) 3254 return (-1); 3255 3256 if (x == 0x66) 3257 vie->opsize_override = 1; 3258 else if (x == 0x67) 3259 vie->addrsize_override = 1; 3260 else if (x == 0xF3) 3261 vie->repz_present = 1; 3262 else if (x == 0xF2) 3263 vie->repnz_present = 1; 3264 else if (segment_override(x, &vie->segment_register)) 3265 vie->segment_override = 1; 3266 else 3267 break; 3268 3269 vie_advance(vie); 3270 } 3271 3272 /* 3273 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: 3274 * - Only one REX prefix is allowed per instruction. 3275 * - The REX prefix must immediately precede the opcode byte or the 3276 * escape opcode byte. 3277 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) 3278 * the mandatory prefix must come before the REX prefix. 3279 */ 3280 if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { 3281 vie->rex_present = 1; 3282 vie->rex_w = x & 0x8 ? 1 : 0; 3283 vie->rex_r = x & 0x4 ? 1 : 0; 3284 vie->rex_x = x & 0x2 ? 1 : 0; 3285 vie->rex_b = x & 0x1 ? 1 : 0; 3286 vie_advance(vie); 3287 } 3288 3289 /* 3290 * § 2.3.5, "The VEX Prefix", SDM Vol 2. 3291 */ 3292 if ((cpu_mode == CPU_MODE_64BIT || 3293 cpu_mode == CPU_MODE_COMPATIBILITY) && x == 0xC4) { 3294 const struct vie_op *optab; 3295 3296 /* 3-byte VEX prefix. */ 3297 vie->vex_present = 1; 3298 3299 vie_advance(vie); 3300 if (vie_peek(vie, &x)) 3301 return (-1); 3302 3303 /* 3304 * 2nd byte: [R', X', B', mmmmm[4:0]]. Bits are inverted 3305 * relative to REX encoding. 3306 */ 3307 vie->rex_r = x & 0x80 ? 0 : 1; 3308 vie->rex_x = x & 0x40 ? 0 : 1; 3309 vie->rex_b = x & 0x20 ? 0 : 1; 3310 3311 switch (x & 0x1F) { 3312 case 0x2: 3313 /* 0F 38. 
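 * VEX.mmmmm == 2 selects the 0F 38 opcode map; the other map
 * values are not decoded here.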
*/ 3314 optab = three_byte_opcodes_0f38; 3315 break; 3316 case 0x1: 3317 /* 0F class - nothing handled here yet. */ 3318 /* FALLTHROUGH */ 3319 case 0x3: 3320 /* 0F 3A class - nothing handled here yet. */ 3321 /* FALLTHROUGH */ 3322 default: 3323 /* Reserved (#UD). */ 3324 return (-1); 3325 } 3326 3327 vie_advance(vie); 3328 if (vie_peek(vie, &x)) 3329 return (-1); 3330 3331 /* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */ 3332 vie->rex_w = x & 0x80 ? 1 : 0; 3333 3334 vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3); 3335 vie->vex_l = !!(x & 0x4); 3336 vie->vex_pp = (x & 0x3); 3337 3338 /* PP: 1=66 2=F3 3=F2 prefixes. */ 3339 switch (vie->vex_pp) { 3340 case 0x1: 3341 vie->opsize_override = 1; 3342 break; 3343 case 0x2: 3344 vie->repz_present = 1; 3345 break; 3346 case 0x3: 3347 vie->repnz_present = 1; 3348 break; 3349 } 3350 3351 vie_advance(vie); 3352 3353 /* Opcode, sans literal prefix prefix. */ 3354 if (vie_peek(vie, &x)) 3355 return (-1); 3356 3357 vie->op = optab[x]; 3358 if (vie->op.op_type == VIE_OP_TYPE_NONE) 3359 return (-1); 3360 3361 vie_advance(vie); 3362 } 3363 3364 /* 3365 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 3366 */ 3367 if (cpu_mode == CPU_MODE_64BIT) { 3368 /* 3369 * Default address size is 64-bits and default operand size 3370 * is 32-bits. 3371 */ 3372 vie->addrsize = vie->addrsize_override ? 4 : 8; 3373 if (vie->rex_w) 3374 vie->opsize = 8; 3375 else if (vie->opsize_override) 3376 vie->opsize = 2; 3377 else 3378 vie->opsize = 4; 3379 } else if (cs_d) { 3380 /* Default address and operand sizes are 32-bits */ 3381 vie->addrsize = vie->addrsize_override ? 2 : 4; 3382 vie->opsize = vie->opsize_override ? 2 : 4; 3383 } else { 3384 /* Default address and operand sizes are 16-bits */ 3385 vie->addrsize = vie->addrsize_override ? 4 : 2; 3386 vie->opsize = vie->opsize_override ? 4 : 2; 3387 } 3388 return (0); 3389 } 3390 3391 static int 3392 decode_two_byte_opcode(struct vie *vie) 3393 { 3394 uint8_t x; 3395 3396 if (vie_peek(vie, &x)) 3397 return (-1); 3398 3399 vie->op = two_byte_opcodes[x]; 3400 3401 if (vie->op.op_type == VIE_OP_TYPE_NONE) 3402 return (-1); 3403 3404 vie_advance(vie); 3405 return (0); 3406 } 3407 3408 static int 3409 decode_opcode(struct vie *vie) 3410 { 3411 uint8_t x; 3412 3413 if (vie_peek(vie, &x)) 3414 return (-1); 3415 3416 /* Already did this via VEX prefix. */ 3417 if (vie->op.op_type != VIE_OP_TYPE_NONE) 3418 return (0); 3419 3420 vie->op = one_byte_opcodes[x]; 3421 3422 if (vie->op.op_type == VIE_OP_TYPE_NONE) 3423 return (-1); 3424 3425 vie_advance(vie); 3426 3427 if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) 3428 return (decode_two_byte_opcode(vie)); 3429 3430 return (0); 3431 } 3432 3433 static int 3434 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) 3435 { 3436 uint8_t x; 3437 /* 3438 * Handling mov-to/from-cr is special since it is not issuing 3439 * mmio/pio requests and can be done in real mode. We must bypass some 3440 * of the other existing decoding restrictions for it. 3441 */ 3442 const bool is_movcr = ((vie->op.op_flags & VIE_OP_F_REG_REG) != 0); 3443 3444 if (vie->op.op_flags & VIE_OP_F_NO_MODRM) 3445 return (0); 3446 3447 if (cpu_mode == CPU_MODE_REAL && !is_movcr) 3448 return (-1); 3449 3450 if (vie_peek(vie, &x)) 3451 return (-1); 3452 3453 vie->mod = (x >> 6) & 0x3; 3454 vie->rm = (x >> 0) & 0x7; 3455 vie->reg = (x >> 3) & 0x7; 3456 3457 /* 3458 * A direct addressing mode makes no sense in the context of an EPT 3459 * fault. There has to be a memory access involved to cause the 3460 * EPT fault. 
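 *
 * The mov-to/from-%cr forms are the exception: they are register-to-
 * register, which is why the VIE_OP_F_REG_REG special case above lets
 * them through despite using the direct addressing mode.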
3461 */ 3462 if (vie->mod == VIE_MOD_DIRECT && !is_movcr) 3463 return (-1); 3464 3465 if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || 3466 (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { 3467 /* 3468 * Table 2-5: Special Cases of REX Encodings 3469 * 3470 * mod=0, r/m=5 is used in the compatibility mode to 3471 * indicate a disp32 without a base register. 3472 * 3473 * mod!=3, r/m=4 is used in the compatibility mode to 3474 * indicate that the SIB byte is present. 3475 * 3476 * The 'b' bit in the REX prefix is don't care in 3477 * this case. 3478 */ 3479 } else { 3480 vie->rm |= (vie->rex_b << 3); 3481 } 3482 3483 vie->reg |= (vie->rex_r << 3); 3484 3485 /* SIB */ 3486 if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) 3487 goto done; 3488 3489 vie->base_register = gpr_map[vie->rm]; 3490 3491 switch (vie->mod) { 3492 case VIE_MOD_INDIRECT_DISP8: 3493 vie->disp_bytes = 1; 3494 break; 3495 case VIE_MOD_INDIRECT_DISP32: 3496 vie->disp_bytes = 4; 3497 break; 3498 case VIE_MOD_INDIRECT: 3499 if (vie->rm == VIE_RM_DISP32) { 3500 vie->disp_bytes = 4; 3501 /* 3502 * Table 2-7. RIP-Relative Addressing 3503 * 3504 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32 3505 * whereas in compatibility mode it just implies disp32. 3506 */ 3507 3508 if (cpu_mode == CPU_MODE_64BIT) 3509 vie->base_register = VM_REG_GUEST_RIP; 3510 else 3511 vie->base_register = VM_REG_LAST; 3512 } 3513 break; 3514 } 3515 3516 done: 3517 vie_advance(vie); 3518 3519 return (0); 3520 } 3521 3522 static int 3523 decode_sib(struct vie *vie) 3524 { 3525 uint8_t x; 3526 3527 /* Proceed only if SIB byte is present */ 3528 if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) 3529 return (0); 3530 3531 if (vie_peek(vie, &x)) 3532 return (-1); 3533 3534 /* De-construct the SIB byte */ 3535 vie->ss = (x >> 6) & 0x3; 3536 vie->index = (x >> 3) & 0x7; 3537 vie->base = (x >> 0) & 0x7; 3538 3539 /* Apply the REX prefix modifiers */ 3540 vie->index |= vie->rex_x << 3; 3541 vie->base |= vie->rex_b << 3; 3542 3543 switch (vie->mod) { 3544 case VIE_MOD_INDIRECT_DISP8: 3545 vie->disp_bytes = 1; 3546 break; 3547 case VIE_MOD_INDIRECT_DISP32: 3548 vie->disp_bytes = 4; 3549 break; 3550 } 3551 3552 if (vie->mod == VIE_MOD_INDIRECT && 3553 (vie->base == 5 || vie->base == 13)) { 3554 /* 3555 * Special case when base register is unused if mod = 0 3556 * and base = %rbp or %r13. 3557 * 3558 * Documented in: 3559 * Table 2-3: 32-bit Addressing Forms with the SIB Byte 3560 * Table 2-5: Special Cases of REX Encodings 3561 */ 3562 vie->disp_bytes = 4; 3563 } else { 3564 vie->base_register = gpr_map[vie->base]; 3565 } 3566 3567 /* 3568 * All encodings of 'index' are valid except for %rsp (4). 
3569 * 3570 * Documented in: 3571 * Table 2-3: 32-bit Addressing Forms with the SIB Byte 3572 * Table 2-5: Special Cases of REX Encodings 3573 */ 3574 if (vie->index != 4) 3575 vie->index_register = gpr_map[vie->index]; 3576 3577 /* 'scale' makes sense only in the context of an index register */ 3578 if (vie->index_register < VM_REG_LAST) 3579 vie->scale = 1 << vie->ss; 3580 3581 vie_advance(vie); 3582 3583 return (0); 3584 } 3585 3586 static int 3587 decode_displacement(struct vie *vie) 3588 { 3589 int n, i; 3590 uint8_t x; 3591 3592 union { 3593 char buf[4]; 3594 int8_t signed8; 3595 int32_t signed32; 3596 } u; 3597 3598 if ((n = vie->disp_bytes) == 0) 3599 return (0); 3600 3601 if (n != 1 && n != 4) 3602 panic("decode_displacement: invalid disp_bytes %d", n); 3603 3604 for (i = 0; i < n; i++) { 3605 if (vie_peek(vie, &x)) 3606 return (-1); 3607 3608 u.buf[i] = x; 3609 vie_advance(vie); 3610 } 3611 3612 if (n == 1) 3613 vie->displacement = u.signed8; /* sign-extended */ 3614 else 3615 vie->displacement = u.signed32; /* sign-extended */ 3616 3617 return (0); 3618 } 3619 3620 static int 3621 decode_immediate(struct vie *vie) 3622 { 3623 int i, n; 3624 uint8_t x; 3625 union { 3626 char buf[4]; 3627 int8_t signed8; 3628 int16_t signed16; 3629 int32_t signed32; 3630 } u; 3631 3632 /* Figure out immediate operand size (if any) */ 3633 if (vie->op.op_flags & VIE_OP_F_IMM) { 3634 /* 3635 * Section 2.2.1.5 "Immediates", Intel SDM: 3636 * In 64-bit mode the typical size of immediate operands 3637 * remains 32-bits. When the operand size if 64-bits, the 3638 * processor sign-extends all immediates to 64-bits prior 3639 * to their use. 3640 */ 3641 if (vie->opsize == 4 || vie->opsize == 8) 3642 vie->imm_bytes = 4; 3643 else 3644 vie->imm_bytes = 2; 3645 } else if (vie->op.op_flags & VIE_OP_F_IMM8) { 3646 vie->imm_bytes = 1; 3647 } 3648 3649 if ((n = vie->imm_bytes) == 0) 3650 return (0); 3651 3652 KASSERT(n == 1 || n == 2 || n == 4, 3653 ("%s: invalid number of immediate bytes: %d", __func__, n)); 3654 3655 for (i = 0; i < n; i++) { 3656 if (vie_peek(vie, &x)) 3657 return (-1); 3658 3659 u.buf[i] = x; 3660 vie_advance(vie); 3661 } 3662 3663 /* sign-extend the immediate value before use */ 3664 if (n == 1) 3665 vie->immediate = u.signed8; 3666 else if (n == 2) 3667 vie->immediate = u.signed16; 3668 else 3669 vie->immediate = u.signed32; 3670 3671 return (0); 3672 } 3673 3674 static int 3675 decode_moffset(struct vie *vie) 3676 { 3677 int i, n; 3678 uint8_t x; 3679 union { 3680 char buf[8]; 3681 uint64_t u64; 3682 } u; 3683 3684 if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) 3685 return (0); 3686 3687 /* 3688 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: 3689 * The memory offset size follows the address-size of the instruction. 3690 */ 3691 n = vie->addrsize; 3692 KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); 3693 3694 u.u64 = 0; 3695 for (i = 0; i < n; i++) { 3696 if (vie_peek(vie, &x)) 3697 return (-1); 3698 3699 u.buf[i] = x; 3700 vie_advance(vie); 3701 } 3702 vie->displacement = u.u64; 3703 return (0); 3704 } 3705 3706 /* 3707 * Verify that the 'guest linear address' provided as collateral of the nested 3708 * page table fault matches with our instruction decoding. 
3709 */ 3710 int 3711 vie_verify_gla(struct vie *vie, struct vm *vm, int cpuid, uint64_t gla) 3712 { 3713 int error; 3714 uint64_t base, segbase, idx, gla2; 3715 enum vm_reg_name seg; 3716 struct seg_desc desc; 3717 3718 ASSERT((vie->status & VIES_INST_DECODE) != 0); 3719 3720 /* 3721 * If there was no valid GLA context with the exit, or the decoded 3722 * instruction acts on more than one address, verification is done. 3723 */ 3724 if (gla == VIE_INVALID_GLA || 3725 (vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) != 0) { 3726 return (0); 3727 } 3728 3729 base = 0; 3730 if (vie->base_register != VM_REG_LAST) { 3731 error = vm_get_register(vm, cpuid, vie->base_register, &base); 3732 if (error) { 3733 printf("verify_gla: error %d getting base reg %d\n", 3734 error, vie->base_register); 3735 return (-1); 3736 } 3737 3738 /* 3739 * RIP-relative addressing starts from the following 3740 * instruction 3741 */ 3742 if (vie->base_register == VM_REG_GUEST_RIP) 3743 base += vie->num_processed; 3744 } 3745 3746 idx = 0; 3747 if (vie->index_register != VM_REG_LAST) { 3748 error = vm_get_register(vm, cpuid, vie->index_register, &idx); 3749 if (error) { 3750 printf("verify_gla: error %d getting index reg %d\n", 3751 error, vie->index_register); 3752 return (-1); 3753 } 3754 } 3755 3756 /* 3757 * From "Specifying a Segment Selector", Intel SDM, Vol 1 3758 * 3759 * In 64-bit mode, segmentation is generally (but not 3760 * completely) disabled. The exceptions are the FS and GS 3761 * segments. 3762 * 3763 * In legacy IA-32 mode, when the ESP or EBP register is used 3764 * as the base, the SS segment is the default segment. For 3765 * other data references, except when relative to stack or 3766 * string destination the DS segment is the default. These 3767 * can be overridden to allow other segments to be accessed. 
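 *
 * For example, a 'mov %eax,(%ebp)' defaults to %ss while a
 * 'mov %eax,(%ebx)' defaults to %ds, unless a segment override prefix
 * was decoded.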
3768 */ 3769 if (vie->segment_override) { 3770 seg = vie->segment_register; 3771 } else if (vie->base_register == VM_REG_GUEST_RSP || 3772 vie->base_register == VM_REG_GUEST_RBP) { 3773 seg = VM_REG_GUEST_SS; 3774 } else { 3775 seg = VM_REG_GUEST_DS; 3776 } 3777 if (vie->paging.cpu_mode == CPU_MODE_64BIT && 3778 seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) { 3779 segbase = 0; 3780 } else { 3781 error = vm_get_seg_desc(vm, cpuid, seg, &desc); 3782 if (error) { 3783 printf("verify_gla: error %d getting segment" 3784 " descriptor %d\n", error, vie->segment_register); 3785 return (-1); 3786 } 3787 segbase = desc.base; 3788 } 3789 3790 gla2 = segbase + base + vie->scale * idx + vie->displacement; 3791 gla2 &= size2mask[vie->addrsize]; 3792 if (gla != gla2) { 3793 printf("verify_gla mismatch: segbase(0x%0lx), " 3794 "base(0x%0lx), scale(%d), index(0x%0lx), " 3795 "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", 3796 segbase, base, vie->scale, idx, vie->displacement, 3797 gla, gla2); 3798 return (-1); 3799 } 3800 3801 return (0); 3802 } 3803 3804 int 3805 vie_decode_instruction(struct vie *vie, struct vm *vm, int cpuid, int cs_d) 3806 { 3807 enum vm_cpu_mode cpu_mode; 3808 3809 if ((vie->status & VIES_INST_FETCH) == 0) { 3810 return (EINVAL); 3811 } 3812 3813 cpu_mode = vie->paging.cpu_mode; 3814 3815 if (decode_prefixes(vie, cpu_mode, cs_d)) 3816 return (-1); 3817 3818 if (decode_opcode(vie)) 3819 return (-1); 3820 3821 if (decode_modrm(vie, cpu_mode)) 3822 return (-1); 3823 3824 if (decode_sib(vie)) 3825 return (-1); 3826 3827 if (decode_displacement(vie)) 3828 return (-1); 3829 3830 if (decode_immediate(vie)) 3831 return (-1); 3832 3833 if (decode_moffset(vie)) 3834 return (-1); 3835 3836 vie->status |= VIES_INST_DECODE; 3837 3838 return (0); 3839 } 3840
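
/*
 * Illustrative sketch (not invoked anywhere in this file) of how the pieces
 * above fit together for an MMIO exit; the real caller-side logic lives
 * outside this file and the variable names here are only placeholders:
 *
 *	vie_init_mmio(vie, NULL, 0, &paging, gpa);
 *	if (vie_needs_fetch(vie) &&
 *	    (vie_fetch_instruction(vie, vm, vcpuid, rip, &fault) != 0 || fault))
 *		return;
 *	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0 ||
 *	    vie_verify_gla(vie, vm, vcpuid, gla) != 0)
 *		vie_fallback_exitinfo(vie, vme);
 *	else if (vie_emulate_mmio(vie, vm, vcpuid) == -1)
 *		vie_exitinfo(vie, vme);
 *	else
 *		vie_advance_pc(vie, &nextrip);
 *
 * When vie_pending() reports an outstanding request, userspace completes it
 * through vie_fulfill_mmio() or vie_fulfill_inout() and emulation is retried.
 */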