/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2012 Sandvine, Inc.
 * Copyright (c) 2012 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2021 Oxide Computer Company
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
#include <sys/proc.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <sys/vmm_kernel.h>
#include <sys/vmm_vm.h>

#include <sys/vmm_instruction_emul.h>
#include <x86/psl.h>
#include <x86/specialreg.h>

#include "vmm_ioport.h"

enum vie_status {
	VIES_INIT		= (1U << 0),
	VIES_MMIO		= (1U << 1),
	VIES_INOUT		= (1U << 2),
	VIES_OTHER		= (1U << 3),
	VIES_INST_FETCH		= (1U << 4),
	VIES_INST_DECODE	= (1U << 5),
	VIES_PENDING_MMIO	= (1U << 6),
	VIES_PENDING_INOUT	= (1U << 7),
	VIES_REPEAT		= (1U << 8),
	VIES_USER_FALLBACK	= (1U << 9),
	VIES_COMPLETE		= (1U << 10),
};

/* State of request to perform emulated access (inout or MMIO) */
enum vie_req {
	VR_NONE,
	VR_PENDING,
	VR_DONE,
};

struct vie_mmio {
	uint64_t	data;
	uint64_t	gpa;
	uint8_t		bytes;
	enum vie_req	state;
};

struct vie_op {
	uint8_t		op_byte;	/* actual opcode byte */
	uint8_t		op_type;	/* type of operation (e.g. MOV) */
	uint16_t	op_flags;
};

#define	VIE_INST_SIZE	15
struct vie {
	uint8_t		inst[VIE_INST_SIZE];	/* instruction bytes */
	uint8_t		num_valid;		/* size of the instruction */
	uint8_t		num_processed;

	uint8_t		addrsize:4, opsize:4;	/* address and operand sizes */
	uint8_t		rex_w:1,		/* REX prefix */
			rex_r:1,
			rex_x:1,
			rex_b:1,
			rex_present:1,
			repz_present:1,		/* REP/REPE/REPZ prefix */
			repnz_present:1,	/* REPNE/REPNZ prefix */
			opsize_override:1,	/* Operand size override */
			addrsize_override:1,	/* Address size override */
			segment_override:1;	/* Segment override */

	uint8_t		mod:2,			/* ModRM byte */
			reg:4,
			rm:4;

	uint8_t		ss:2,			/* SIB byte */
			vex_present:1,		/* VEX prefixed */
			vex_l:1,		/* L bit */
			index:4,		/* SIB byte */
			base:4;			/* SIB byte */

	uint8_t		disp_bytes;
	uint8_t		imm_bytes;

	uint8_t		scale;

	uint8_t		vex_reg:4,	/* vvvv: first source reg specifier */
			vex_pp:2,	/* pp */
			_sparebits:2;

	uint8_t		_sparebytes[2];

	int		base_register;		/* VM_REG_GUEST_xyz */
	int		index_register;		/* VM_REG_GUEST_xyz */
	int		segment_register;	/* VM_REG_GUEST_xyz */

	int64_t		displacement;		/* optional addr displacement */
	int64_t		immediate;		/* optional immediate operand */

	struct vie_op	op;			/* opcode description */

	enum vie_status	status;

	struct vm_guest_paging paging;		/* guest paging state */

	uint64_t	mmio_gpa;		/* faulting GPA */
	struct vie_mmio	mmio_req_read;
	struct vie_mmio	mmio_req_write;

	struct vm_inout	inout;			/* active in/out op */
	enum vie_req	inout_req_state;
	uint32_t	inout_req_val;		/* value from userspace */
};


/* struct vie_op.op_type */
enum {
	VIE_OP_TYPE_NONE = 0,
	VIE_OP_TYPE_MOV,
	VIE_OP_TYPE_MOVSX,
	VIE_OP_TYPE_MOVZX,
	VIE_OP_TYPE_MOV_CR,
	VIE_OP_TYPE_AND,
	VIE_OP_TYPE_OR,
	VIE_OP_TYPE_SUB,
	VIE_OP_TYPE_TWO_BYTE,
	VIE_OP_TYPE_PUSH,
	VIE_OP_TYPE_CMP,
	VIE_OP_TYPE_POP,
	VIE_OP_TYPE_MOVS,
	VIE_OP_TYPE_GROUP1,
	VIE_OP_TYPE_STOS,
	VIE_OP_TYPE_BITTEST,
	VIE_OP_TYPE_TWOB_GRP15,
	VIE_OP_TYPE_ADD,
	VIE_OP_TYPE_TEST,
	VIE_OP_TYPE_BEXTR,
	VIE_OP_TYPE_CLTS,
	VIE_OP_TYPE_MUL,
	VIE_OP_TYPE_LAST
};

/* struct vie_op.op_flags */
#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
#define	VIE_OP_F_NO_MODRM	(1 << 3)
#define	VIE_OP_F_NO_GLA_VERIFICATION	(1 << 4)
#define	VIE_OP_F_REG_REG	(1 << 5)  /* special-case for mov-cr */

static const struct vie_op three_byte_opcodes_0f38[256] = {
	[0xF7] = {
		.op_byte = 0xF7,
		.op_type = VIE_OP_TYPE_BEXTR,
	},
};

static const struct vie_op two_byte_opcodes[256] = {
	[0x06] = {
		.op_byte = 0x06,
		.op_type = VIE_OP_TYPE_CLTS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0x20] = {
		.op_byte = 0x20,
		.op_type = VIE_OP_TYPE_MOV_CR,
		.op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0x22] = {
		.op_byte = 0x22,
		.op_type = VIE_OP_TYPE_MOV_CR,
		.op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAE] = {
		.op_byte = 0xAE,
		.op_type = VIE_OP_TYPE_TWOB_GRP15,
	},
	[0xAF] = {
		.op_byte = 0xAF,
		.op_type = VIE_OP_TYPE_MUL,
	},
	[0xB6] = {
		.op_byte = 0xB6,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xB7] = {
		.op_byte = 0xB7,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xBA] = {
		.op_byte = 0xBA,
		.op_type = VIE_OP_TYPE_BITTEST,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xBE] = {
		.op_byte = 0xBE,
		.op_type = VIE_OP_TYPE_MOVSX,
	},
};

static const struct vie_op one_byte_opcodes[256] = {
	[0x03] = {
		.op_byte = 0x03,
		.op_type = VIE_OP_TYPE_ADD,
	},
	[0x0F] = {
		.op_byte = 0x0F,
		.op_type = VIE_OP_TYPE_TWO_BYTE
	},
	[0x0B] = {
		.op_byte = 0x0B,
		.op_type = VIE_OP_TYPE_OR,
	},
	[0x2B] = {
		.op_byte = 0x2B,
		.op_type = VIE_OP_TYPE_SUB,
	},
	[0x39] = {
		.op_byte = 0x39,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x3B] = {
		.op_byte = 0x3B,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x88] = {
		.op_byte = 0x88,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x89] = {
		.op_byte = 0x89,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8A] = {
		.op_byte = 0x8A,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8B] = {
		.op_byte = 0x8B,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0xA1] = {
		.op_byte = 0xA1,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA3] = {
		.op_byte = 0xA3,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA4] = {
		.op_byte = 0xA4,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xA5] = {
		.op_byte = 0xA5,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAA] = {
		.op_byte = 0xAA,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAB] = {
		.op_byte = 0xAB,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xC6] = {
		/* XXX Group 11 extended opcode - not just MOV */
		.op_byte = 0xC6,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xC7] = {
		.op_byte = 0xC7,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x23] = {
		.op_byte = 0x23,
		.op_type = VIE_OP_TYPE_AND,
	},
	[0x80] = {
		/* Group 1 extended opcode */
		.op_byte = 0x80,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x81] = {
		/* Group 1 extended opcode */
		.op_byte = 0x81,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x83] = {
		/* Group 1 extended opcode */
		.op_byte = 0x83,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x8F] = {
		/* XXX Group 1A extended opcode - not just POP */
		.op_byte = 0x8F,
		.op_type = VIE_OP_TYPE_POP,
	},
	[0xF6] = {
		/* XXX Group 3 extended opcode - not just TEST */
		.op_byte = 0xF6,
		.op_type = VIE_OP_TYPE_TEST,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xF7] = {
		/* XXX Group 3 extended opcode - not just TEST */
		.op_byte = 0xF7,
		.op_type = VIE_OP_TYPE_TEST,
		.op_flags = VIE_OP_F_IMM,
	},
	[0xFF] = {
		/* XXX Group 5 extended opcode - not just PUSH */
		.op_byte = 0xFF,
		.op_type = VIE_OP_TYPE_PUSH,
	}
};

/* struct vie.mod */
#define	VIE_MOD_INDIRECT	0
#define	VIE_MOD_INDIRECT_DISP8	1
#define	VIE_MOD_INDIRECT_DISP32	2
#define	VIE_MOD_DIRECT		3

/* struct vie.rm */
#define	VIE_RM_SIB		4
#define	VIE_RM_DISP32		5
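/*
 * With mod != 3, an r/m value of VIE_RM_SIB indicates that a SIB byte
 * follows the ModRM byte, while VIE_RM_DISP32 with mod == 0 selects a
 * 32-bit displacement (RIP-relative in 64-bit mode) rather than a
 * register base.
 */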
#define	GB	(1024 * 1024 * 1024)


/*
 * Paging defines, previously pulled in from machine/pmap.h
 */
#define	PG_V	(1 << 0) /* Present */
#define	PG_RW	(1 << 1) /* Read/Write */
#define	PG_U	(1 << 2) /* User/Supervisor */
#define	PG_A	(1 << 5) /* Accessed */
#define	PG_M	(1 << 6) /* Dirty */
#define	PG_PS	(1 << 7) /* Largepage */

/*
 * Paging exception defines, previously pulled in from machine/pmap.h
 */
#define	PGEX_P		(1 << 0) /* Non-present/Protection */
#define	PGEX_W		(1 << 1) /* Read/Write */
#define	PGEX_U		(1 << 2) /* User/Supervisor */
#define	PGEX_RSV	(1 << 3) /* (Non-)Reserved */
#define	PGEX_I		(1 << 4) /* Instruction */


static enum vm_reg_name gpr_map[16] = {
	VM_REG_GUEST_RAX,
	VM_REG_GUEST_RCX,
	VM_REG_GUEST_RDX,
	VM_REG_GUEST_RBX,
	VM_REG_GUEST_RSP,
	VM_REG_GUEST_RBP,
	VM_REG_GUEST_RSI,
	VM_REG_GUEST_RDI,
	VM_REG_GUEST_R8,
	VM_REG_GUEST_R9,
	VM_REG_GUEST_R10,
	VM_REG_GUEST_R11,
	VM_REG_GUEST_R12,
	VM_REG_GUEST_R13,
	VM_REG_GUEST_R14,
	VM_REG_GUEST_R15
};

static const char *gpr_name_map[][16] = {
	[1] = {
		"a[hl]", "c[hl]", "d[hl]", "b[hl]", "spl", "bpl", "sil", "dil",
		"r8b", "r9b", "r10b", "r11b", "r12b", "r13b", "r14b", "r15b",
	},
	[2] = {
		"ax", "cx", "dx", "bx", "sp", "bp", "si", "di",
		"r8w", "r9w", "r10w", "r11w", "r12w", "r13w", "r14w", "r15w",
	},
	[4] = {
		"eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi",
		"r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d",
	},
	[8] = {
		"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
		"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	},
};

static enum vm_reg_name cr_map[16] = {
	VM_REG_GUEST_CR0,
	VM_REG_LAST,
	VM_REG_GUEST_CR2,
	VM_REG_GUEST_CR3,
	VM_REG_GUEST_CR4,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST
};

static uint64_t size2mask[] = {
	[1] = 0xff,
	[2] = 0xffff,
	[4] = 0xffffffff,
	[8] = 0xffffffffffffffff,
};


static int vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid,
    uint64_t gpa, uint64_t *rval, int bytes);
static int vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid,
    uint64_t gpa, uint64_t wval, int bytes);
static int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla);
static int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla);
static int vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf,
    uint64_t gla);
static uint64_t vie_size2mask(int size);

struct vie *
vie_alloc()
{
	return (kmem_zalloc(sizeof (struct vie), KM_SLEEP));
}

void
vie_free(struct vie *vie)
{
	kmem_free(vie, sizeof (struct vie));
}

enum vm_reg_name
vie_regnum_map(uint8_t regnum)
{
	VERIFY3U(regnum, <, 16);
	return (gpr_map[regnum]);
}

const char *
vie_regnum_name(uint8_t regnum, uint8_t size)
{
	VERIFY3U(regnum, <, 16);
	VERIFY(size == 1 || size == 2 || size == 4 || size == 8);
	return (gpr_name_map[size][regnum]);
}

static void
vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
{
	*lhbr = 0;
	*reg = gpr_map[vie->reg];

	/*
	 * 64-bit mode imposes limitations on accessing legacy high byte
	 * registers (lhbr).
	 *
	 * The legacy high-byte registers cannot be addressed if the REX
	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
	 *
	 * If the REX prefix is not present then the values 4, 5, 6 and 7
	 * of the 'ModRM:reg' field address the legacy high-byte registers,
	 * %ah, %ch, %dh and %bh respectively.
	 */
	if (!vie->rex_present) {
		if (vie->reg & 0x4) {
			*lhbr = 1;
			*reg = gpr_map[vie->reg & 0x3];
		}
	}
}

static int
vie_read_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t *rval)
{
	uint64_t val;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &val);

	/*
	 * To obtain the value of a legacy high byte register shift the
	 * base register right by 8 bits (%ah = %rax >> 8).
	 */
	if (lhbr)
		*rval = val >> 8;
	else
		*rval = val;
	return (error);
}

static int
vie_write_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t byte)
{
	uint64_t origval, val, mask;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &origval);
	if (error == 0) {
		val = byte;
		mask = 0xff;
		if (lhbr) {
			/*
			 * Shift left by 8 to store 'byte' in a legacy high
			 * byte register.
			 */
			val <<= 8;
			mask <<= 8;
		}
		val |= origval & ~mask;
		error = vm_set_register(vm, vcpuid, reg, val);
	}
	return (error);
}

static int
vie_update_register(struct vm *vm, int vcpuid, enum vm_reg_name reg,
    uint64_t val, int size)
{
	int error;
	uint64_t origval;

	switch (size) {
	case 1:
	case 2:
		error = vm_get_register(vm, vcpuid, reg, &origval);
		if (error)
			return (error);
		val &= size2mask[size];
		val |= origval & ~size2mask[size];
		break;
	case 4:
		val &= 0xffffffffUL;
		break;
	case 8:
		break;
	default:
		return (EINVAL);
	}

	error = vm_set_register(vm, vcpuid, reg, val);
	return (error);
}

static int
vie_repeat(struct vie *vie)
{
	vie->status |= VIES_REPEAT;

	/*
	 * Clear out any cached operation values so the repeated instruction
	 * can begin without using that stale state. Other state, such as the
	 * decoding results, are kept around as it will not vary between
	 * iterations of a rep-prefixed instruction.
	 */
	if ((vie->status & VIES_MMIO) != 0) {
		vie->mmio_req_read.state = VR_NONE;
		vie->mmio_req_write.state = VR_NONE;
	} else if ((vie->status & VIES_INOUT) != 0) {
		vie->inout_req_state = VR_NONE;
	} else {
		panic("unexpected emulation state");
	}

	return (EAGAIN);
}

#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)

/*
 * Return the status flags that would result from doing (x - y).
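 *
 * The flags are computed by executing a host 'sub' instruction at the
 * same operand width and capturing %rflags via pushfq/popq, so the
 * arithmetic flag semantics match hardware exactly.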
 */
/* BEGIN CSTYLED */
#define	GETCC(sz)							\
static ulong_t								\
getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	ulong_t rflags;							\
									\
	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack
/* END CSTYLED */

GETCC(8);
GETCC(16);
GETCC(32);
GETCC(64);

static ulong_t
getcc(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getcc: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getcc8(x, y));
	else if (opsize == 2)
		return (getcc16(x, y));
	else if (opsize == 4)
		return (getcc32(x, y));
	else
		return (getcc64(x, y));
}

/*
 * Macro creation of functions getaddflags{8,16,32,64}
 */
/* BEGIN CSTYLED */
#define	GETADDFLAGS(sz)							\
static ulong_t								\
getaddflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	ulong_t rflags;							\
									\
	__asm __volatile("add %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack
/* END CSTYLED */

GETADDFLAGS(8);
GETADDFLAGS(16);
GETADDFLAGS(32);
GETADDFLAGS(64);

static ulong_t
getaddflags(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getaddflags: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getaddflags8(x, y));
	else if (opsize == 2)
		return (getaddflags16(x, y));
	else if (opsize == 4)
		return (getaddflags32(x, y));
	else
		return (getaddflags64(x, y));
}

/*
 * Macro creation of functions getimulflags{16,32,64}
 */
/* BEGIN CSTYLED */
#define	GETIMULFLAGS(sz)						\
static ulong_t								\
getimulflags##sz(uint##sz##_t x, uint##sz##_t y)			\
{									\
	ulong_t rflags;							\
									\
	__asm __volatile("imul %2,%1; pushfq; popq %0" :		\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack
/* END CSTYLED */

GETIMULFLAGS(16);
GETIMULFLAGS(32);
GETIMULFLAGS(64);

static ulong_t
getimulflags(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 2 || opsize == 4 || opsize == 8,
	    ("getimulflags: invalid operand size %d", opsize));

	if (opsize == 2)
		return (getimulflags16(x, y));
	else if (opsize == 4)
		return (getimulflags32(x, y));
	else
		return (getimulflags64(x, y));
}

/*
 * Return the status flags that would result from doing (x & y).
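 *
 * As with GETCC above, the host 'and' instruction is executed and
 * %rflags captured, rather than recomputing the flags by hand.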
 */
/* BEGIN CSTYLED */
#define	GETANDFLAGS(sz)							\
static ulong_t								\
getandflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	ulong_t rflags;							\
									\
	__asm __volatile("and %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack
/* END CSTYLED */

GETANDFLAGS(8);
GETANDFLAGS(16);
GETANDFLAGS(32);
GETANDFLAGS(64);

static ulong_t
getandflags(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getandflags: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getandflags8(x, y));
	else if (opsize == 2)
		return (getandflags16(x, y));
	else if (opsize == 4)
		return (getandflags32(x, y));
	else
		return (getandflags64(x, y));
}

static int
vie_emulate_mov_cr(struct vie *vie, struct vm *vm, int vcpuid)
{
	uint64_t val;
	int err;
	enum vm_reg_name gpr = gpr_map[vie->rm];
	enum vm_reg_name cr = cr_map[vie->reg];

	uint_t size = 4;
	if (vie->paging.cpu_mode == CPU_MODE_64BIT) {
		size = 8;
	}

	switch (vie->op.op_byte) {
	case 0x20:
		/*
		 * MOV control register (ModRM:reg) to reg (ModRM:r/m)
		 * 20/r:	mov r32, CR0-CR7
		 * 20/r:	mov r64, CR0-CR7
		 * REX.R + 20/0:	mov r64, CR8
		 */
		if (vie->paging.cpl != 0) {
			vm_inject_gp(vm, vcpuid);
			vie->num_processed = 0;
			return (0);
		}
		err = vm_get_register(vm, vcpuid, cr, &val);
		if (err != 0) {
			/* #UD for access to non-existent CRs */
			vm_inject_ud(vm, vcpuid);
			vie->num_processed = 0;
			return (0);
		}
		err = vie_update_register(vm, vcpuid, gpr, val, size);
		break;
	case 0x22: {
		/*
		 * MOV reg (ModRM:r/m) to control register (ModRM:reg)
		 * 22/r:	mov CR0-CR7, r32
		 * 22/r:	mov CR0-CR7, r64
		 * REX.R + 22/0:	mov CR8, r64
		 */
		uint64_t old, diff;

		if (vie->paging.cpl != 0) {
			vm_inject_gp(vm, vcpuid);
			vie->num_processed = 0;
			return (0);
		}
		err = vm_get_register(vm, vcpuid, cr, &old);
		if (err != 0) {
			/* #UD for access to non-existent CRs */
			vm_inject_ud(vm, vcpuid);
			vie->num_processed = 0;
			return (0);
		}
		err = vm_get_register(vm, vcpuid, gpr, &val);
		VERIFY0(err);
		val &= size2mask[size];
		diff = old ^ val;

		switch (cr) {
		case VM_REG_GUEST_CR0:
			if ((diff & CR0_PG) != 0) {
				uint64_t efer;

				err = vm_get_register(vm, vcpuid,
				    VM_REG_GUEST_EFER, &efer);
				VERIFY0(err);

				/* Keep the long-mode state in EFER in sync */
				if ((val & CR0_PG) != 0 &&
				    (efer & EFER_LME) != 0) {
					efer |= EFER_LMA;
				}
				if ((val & CR0_PG) == 0 &&
				    (efer & EFER_LME) != 0) {
					efer &= ~EFER_LMA;
				}

				err = vm_set_register(vm, vcpuid,
				    VM_REG_GUEST_EFER, efer);
				VERIFY0(err);
			}
			/* TODO: enforce more of the #GP checks */
			err = vm_set_register(vm, vcpuid, cr, val);
			VERIFY0(err);
			break;
		case VM_REG_GUEST_CR2:
		case VM_REG_GUEST_CR3:
		case VM_REG_GUEST_CR4:
			/* TODO: enforce more of the #GP checks */
			err = vm_set_register(vm, vcpuid, cr, val);
			break;
		default:
			/* The cr_map mapping should prevent this */
			panic("invalid cr %d", cr);
		}
		break;
	}
	default:
		return (EINVAL);
	}
	return (err);
}

static int
vie_emulate_mov(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	enum vm_reg_name reg;
	uint8_t byte;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x88:
		/*
		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 88/r:	mov r/m8, r8
		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
		 */
		size = 1;	/* override for byte operation */
		error = vie_read_bytereg(vie, vm, vcpuid, &byte);
		if (error == 0) {
			error = vie_mmio_write(vie, vm, vcpuid, gpa, byte,
			    size);
		}
		break;
	case 0x89:
		/*
		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 89/r:	mov r/m16, r16
		 * 89/r:	mov r/m32, r32
		 * REX.W + 89/r	mov r/m64, r64
		 */
		reg = gpr_map[vie->reg];
		error = vm_get_register(vm, vcpuid, reg, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
		}
		break;
	case 0x8A:
		/*
		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8A/r:	mov r8, r/m8
		 * REX + 8A/r:	mov r8, r/m8
		 */
		size = 1;	/* override for byte operation */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
		if (error == 0)
			error = vie_write_bytereg(vie, vm, vcpuid, val);
		break;
	case 0x8B:
		/*
		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8B/r:	mov r16, r/m16
		 * 8B/r:	mov r32, r/m32
		 * REX.W 8B/r:	mov r64, r/m64
		 */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
		if (error == 0) {
			reg = gpr_map[vie->reg];
			error = vie_update_register(vm, vcpuid, reg, val, size);
		}
		break;
	case 0xA1:
		/*
		 * MOV from seg:moffset to AX/EAX/RAX
		 * A1:		mov AX, moffs16
		 * A1:		mov EAX, moffs32
		 * REX.W + A1:	mov RAX, moffs64
		 */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
		if (error == 0) {
			reg = VM_REG_GUEST_RAX;
			error = vie_update_register(vm, vcpuid, reg, val, size);
		}
		break;
	case 0xA3:
		/*
		 * MOV from AX/EAX/RAX to seg:moffset
		 * A3:		mov moffs16, AX
		 * A3:		mov moffs32, EAX
		 * REX.W + A3:	mov moffs64, RAX
		 */
		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
		}
		break;
	case 0xC6:
		/*
		 * MOV from imm8 to mem (ModRM:r/m)
		 * C6/0		mov r/m8, imm8
		 * REX + C6/0	mov r/m8, imm8
		 */
		size = 1;	/* override for byte operation */
		val = vie->immediate;
		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
		break;
	case 0xC7:
		/*
		 * MOV from imm16/imm32 to mem (ModRM:r/m)
		 * C7/0		mov r/m16, imm16
		 * C7/0		mov r/m32, imm32
		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
		 */
		val = vie->immediate & size2mask[size];
		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
		break;
	default:
		break;
	}

	return (error);
}

static int
vie_emulate_movx(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xB6:
		/*
		 * MOV and zero extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B6/r		movzx r16, r/m8
		 * 0F B6/r		movzx r32, r/m8
		 * REX.W + 0F B6/r	movzx r64, r/m8
		 */

		/* get the first operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* zero-extend byte */
		val = (uint8_t)val;

		/* write the result */
		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	case 0xB7:
		/*
		 * MOV and zero extend word from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B7/r		movzx r32, r/m16
		 * REX.W + 0F B7/r	movzx r64, r/m16
		 */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 2);
		if (error)
			return (error);

		reg = gpr_map[vie->reg];

		/* zero-extend word */
		val = (uint16_t)val;

		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	case 0xBE:
		/*
		 * MOV and sign extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F BE/r		movsx r16, r/m8
		 * 0F BE/r		movsx r32, r/m8
		 * REX.W + 0F BE/r	movsx r64, r/m8
		 */

		/* get the first operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* sign extend byte */
		val = (int8_t)val;

		/* write the result */
		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	default:
		break;
	}
	return (error);
}

/*
 * Helper function to calculate and validate a linear address.
 */
static int
vie_get_gla(struct vie *vie, struct vm *vm, int vcpuid, int opsize,
    int addrsize, int prot, enum vm_reg_name seg, enum vm_reg_name gpr,
    uint64_t *gla)
{
	struct seg_desc desc;
	uint64_t cr0, val, rflags;
	int error;
	struct vm_guest_paging *paging;

	paging = &vie->paging;

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
	    __func__, error, seg));

	error = vm_get_register(vm, vcpuid, gpr, &val);
	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
	    error, gpr));

	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
	    addrsize, prot, gla)) {
		if (seg == VM_REG_GUEST_SS)
			vm_inject_ss(vm, vcpuid, 0);
		else
			vm_inject_gp(vm, vcpuid);
		return (-1);
	}

	if (vie_canonical_check(paging->cpu_mode, *gla)) {
		if (seg == VM_REG_GUEST_SS)
			vm_inject_ss(vm, vcpuid, 0);
		else
			vm_inject_gp(vm, vcpuid);
		return (-1);
	}

	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
		vm_inject_ac(vm, vcpuid, 0);
		return (-1);
	}

	return (0);
}

static int
vie_emulate_movs(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	struct vm_copyinfo copyinfo[2];
	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
	uint64_t rcx, rdi, rsi, rflags;
	int error, fault, opsize, seg, repeat;
	struct vm_guest_paging *paging;

	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
	val = 0;
	error = 0;
	paging = &vie->paging;

	/*
	 * XXX although the MOVS instruction is only supposed to be used with
	 * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
	 *
	 * Empirically the "repnz" prefix has identical behavior to "rep"
	 * and the zero flag does not make a difference.
	 */
	repeat = vie->repz_present | vie->repnz_present;

	if (repeat) {
		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));

		/*
		 * The count register is %rcx, %ecx or %cx depending on the
		 * address size of the instruction.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
			error = 0;
			goto done;
		}
	}

	/*
	 *	Source		Destination	Comments
	 *	--------------------------------------------
	 * (1)  memory		memory		n/a
	 * (2)  memory		mmio		emulated
	 * (3)  mmio		memory		emulated
	 * (4)  mmio		mmio		emulated
	 *
	 * At this point we don't have sufficient information to distinguish
	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
	 * out because it will succeed only when operating on regular memory.
	 *
	 * XXX the emulation doesn't properly handle the case where 'gpa'
	 * is straddling the boundary between the normal memory and MMIO.
	 */

	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
	if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize, PROT_READ, seg,
	    VM_REG_GUEST_RSI, &srcaddr) != 0) {
		goto done;
	}

	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
	    copyinfo, nitems(copyinfo), &fault);
	if (error == 0) {
		if (fault)
			goto done;	/* Resume guest to handle fault */

		/*
		 * case (2): read from system memory and write to mmio.
		 */
		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize);
		if (error)
			goto done;
	} else {
		/*
		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
		 * if 'srcaddr' is in the mmio space.
		 */

		if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize,
		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI,
		    &dstaddr) != 0) {
			goto done;
		}

		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
		if (error == 0) {
			if (fault)
				goto done;    /* Resume guest to handle fault */

			/*
			 * case (3): read from MMIO and write to system memory.
			 *
			 * A MMIO read can have side-effects so we
			 * commit to it only after vm_copy_setup() is
			 * successful. If a page-fault needs to be
			 * injected into the guest then it will happen
			 * before the MMIO read is attempted.
			 */
			error = vie_mmio_read(vie, vm, vcpuid, gpa, &val,
			    opsize);

			if (error == 0) {
				vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
			}
			/*
			 * Regardless of whether the MMIO read was successful
			 * or not, the copy resources must be cleaned up.
			 */
			vm_copy_teardown(vm, vcpuid, copyinfo,
			    nitems(copyinfo));
			if (error != 0) {
				goto done;
			}
		} else {
			/*
			 * Case (4): read from and write to mmio.
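			 * This can occur, for example, when a guest copies
			 * data between two emulated device regions.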
			 *
			 * Commit to the MMIO read/write (with potential
			 * side-effects) only after we are sure that the
			 * instruction is not going to be restarted due
			 * to address translation faults.
			 */
			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
			    PROT_READ, &srcgpa, &fault);
			if (error || fault)
				goto done;

			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
			    PROT_WRITE, &dstgpa, &fault);
			if (error || fault)
				goto done;

			error = vie_mmio_read(vie, vm, vcpuid, srcgpa, &val,
			    opsize);
			if (error)
				goto done;

			error = vie_mmio_write(vie, vm, vcpuid, dstgpa, val,
			    opsize);
			if (error)
				goto done;
		}
	}

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	if (rflags & PSL_D) {
		rsi -= opsize;
		rdi -= opsize;
	} else {
		rsi += opsize;
		rdi += opsize;
	}

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));

	if (repeat) {
		rcx = rcx - 1;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
		    rcx, vie->addrsize);
		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));

		/*
		 * Repeat the instruction if the count register is not zero.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
			return (vie_repeat(vie));
	}
done:
	return (error);
}

static int
vie_emulate_stos(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, opsize, repeat;
	uint64_t val;
	uint64_t rcx, rdi, rflags;

	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
	repeat = vie->repz_present | vie->repnz_present;

	if (repeat) {
		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));

		/*
		 * The count register is %rcx, %ecx or %cx depending on the
		 * address size of the instruction.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
			return (0);
	}

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
	KASSERT(!error, ("%s: error %d getting rax", __func__, error));

	error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize);
	if (error)
		return (error);

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	if (rflags & PSL_D)
		rdi -= opsize;
	else
		rdi += opsize;

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));

	if (repeat) {
		rcx = rcx - 1;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
		    rcx, vie->addrsize);
		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));

		/*
		 * Repeat the instruction if the count register is not zero.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
			return (vie_repeat(vie));
	}

	return (0);
}

static int
vie_emulate_and(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t result, rflags, rflags2, val1, val2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x23:
		/*
		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
		 * result in reg.
		 *
		 * 23/r		and r16, r/m16
		 * 23/r		and r32, r/m32
		 * REX.W + 23/r	and r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vm_get_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
		if (error)
			break;

		/* perform the operation and write the result */
		result = val1 & val2;
		error = vie_update_register(vm, vcpuid, reg, result, size);
		break;
	case 0x81:
	case 0x83:
		/*
		 * AND mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 81 /4		and r/m16, imm16
		 * 81 /4		and r/m32, imm32
		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
		 *
		 * 83 /4		and r/m16, imm8 sign-extended to 16
		 * 83 /4		and r/m32, imm8 sign-extended to 32
		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
		 */

		/* get the first operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		result = val1 & vie->immediate;
		error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from
	 * 'result'.
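	 * Only PF, ZF and SF are taken from that computation below, since
	 * the carry/overflow results of the subtraction do not reflect AND
	 * semantics.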
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
vie_emulate_or(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t result, rflags, rflags2, val1, val2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x0B:
		/*
		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
		 * result in reg.
		 *
		 * 0b/r		or r16, r/m16
		 * 0b/r		or r32, r/m32
		 * REX.W + 0b/r	or r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vm_get_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
		if (error)
			break;

		/* perform the operation and write the result */
		result = val1 | val2;
		error = vie_update_register(vm, vcpuid, reg, result, size);
		break;
	case 0x81:
	case 0x83:
		/*
		 * OR mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 81 /1		or r/m16, imm16
		 * 81 /1		or r/m32, imm32
		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
		 *
		 * 83 /1		or r/m16, imm8 sign-extended to 16
		 * 83 /1		or r/m32, imm8 sign-extended to 32
		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
		 */

		/* get the first operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		result = val1 | vie->immediate;
		error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from
	 * 'result'.
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
vie_emulate_cmp(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	uint64_t regop, memop, op1, op2, rflags, rflags2;
	enum vm_reg_name reg;

	size = vie->opsize;
	switch (vie->op.op_byte) {
	case 0x39:
	case 0x3B:
		/*
		 * 39/r		CMP r/m16, r16
		 * 39/r		CMP r/m32, r32
		 * REX.W 39/r	CMP r/m64, r64
		 *
		 * 3B/r		CMP r16, r/m16
		 * 3B/r		CMP r32, r/m32
		 * REX.W + 3B/r	CMP r64, r/m64
		 *
		 * Compare the first operand with the second operand and
		 * set status flags in EFLAGS register. The comparison is
		 * performed by subtracting the second operand from the first
		 * operand and then setting the status flags.
		 */

		/* Get the register operand */
		reg = gpr_map[vie->reg];
		error = vm_get_register(vm, vcpuid, reg, &regop);
		if (error)
			return (error);

		/* Get the memory operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &memop, size);
		if (error)
			return (error);

		if (vie->op.op_byte == 0x3B) {
			op1 = regop;
			op2 = memop;
		} else {
			op1 = memop;
			op2 = regop;
		}
		rflags2 = getcc(size, op1, op2);
		break;
	case 0x80:
	case 0x81:
	case 0x83:
		/*
		 * 80 /7		cmp r/m8, imm8
		 * REX + 80 /7		cmp r/m8, imm8
		 *
		 * 81 /7		cmp r/m16, imm16
		 * 81 /7		cmp r/m32, imm32
		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
		 *
		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
		 *
		 * Compare mem (ModRM:r/m) with immediate and set
		 * status flags according to the results. The
		 * comparison is performed by subtracting the
		 * immediate from the first operand and then setting
		 * the status flags.
		 *
		 */
		if (vie->op.op_byte == 0x80)
			size = 1;

		/* get the first operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
		if (error)
			return (error);

		rflags2 = getcc(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & RFLAGS_STATUS_BITS;

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
vie_emulate_test(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	uint64_t op1, rflags, rflags2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xF6:
		/*
		 * F6 /0		test r/m8, imm8
		 *
		 * Test mem (ModRM:r/m) with immediate and set status
		 * flags according to the results. The comparison is
		 * performed by ANDing the immediate with the first
		 * operand and then setting the status flags.
		 */
		if ((vie->reg & 7) != 0)
			return (EINVAL);

		size = 1;	/* override for byte operation */

		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
		if (error)
			return (error);

		rflags2 = getandflags(size, op1, vie->immediate);
		break;
	case 0xF7:
		/*
		 * F7 /0		test r/m16, imm16
		 * F7 /0		test r/m32, imm32
		 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
		 *
		 * Test mem (ModRM:r/m) with immediate and set status
		 * flags according to the results. The comparison is
		 * performed by ANDing the immediate with the first
		 * operand and then setting the status flags.
		 */
		if ((vie->reg & 7) != 0)
			return (EINVAL);

		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
		if (error)
			return (error);

		rflags2 = getandflags(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 */
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
vie_emulate_bextr(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	uint64_t src1, src2, dst, rflags;
	unsigned start, len, size;
	int error;
	struct vm_guest_paging *paging;

	size = vie->opsize;
	error = EINVAL;
	paging = &vie->paging;

	/*
	 * VEX.LZ.0F38.W0 F7 /r		BEXTR r32a, r/m32, r32b
	 * VEX.LZ.0F38.W1 F7 /r		BEXTR r64a, r/m64, r64b
	 *
	 * Destination operand is ModRM:reg. Source operands are ModRM:r/m and
	 * Vex.vvvv.
	 *
	 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored).
	 */
	if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT)
		size = 4;

	/*
	 * Extracts contiguous bits from the first /source/ operand (second
	 * operand) using an index and length specified in the second /source/
	 * operand (third operand).
	 */
	error = vie_mmio_read(vie, vm, vcpuid, gpa, &src1, size);
	if (error)
		return (error);
	error = vm_get_register(vm, vcpuid, gpr_map[vie->vex_reg], &src2);
	if (error)
		return (error);
	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	start = (src2 & 0xff);
	len = (src2 & 0xff00) >> 8;

	/* If no bits are extracted, the destination register is cleared. */
	dst = 0;

	/* If START exceeds the operand size, no bits are extracted. */
	if (start > size * 8)
		goto done;
	/* Length is bounded by both the destination size and start offset. */
	if (start + len > size * 8)
		len = (size * 8) - start;
	if (len == 0)
		goto done;

	if (start > 0)
		src1 = (src1 >> start);
	if (len < 64)
		src1 = src1 & ((1ull << len) - 1);
	dst = src1;

done:
	error = vie_update_register(vm, vcpuid, gpr_map[vie->reg], dst, size);
	if (error)
		return (error);

	/*
	 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result.
	 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared.
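	 *
	 * Only ZF is set here, since it is the one flag both vendors define
	 * the same way for BEXTR; the remaining status bits are simply
	 * cleared.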
	 */
	rflags &= ~RFLAGS_STATUS_BITS;
	if (dst == 0)
		rflags |= PSL_Z;
	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags,
	    8);
	return (error);
}

static int
vie_emulate_add(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	uint64_t nval, rflags, rflags2, val1, val2;
	enum vm_reg_name reg;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x03:
		/*
		 * ADD r/m to r and store the result in r
		 *
		 * 03/r			ADD r16, r/m16
		 * 03/r			ADD r32, r/m32
		 * REX.W + 03/r		ADD r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vm_get_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
		if (error)
			break;

		/* perform the operation and write the result */
		nval = val1 + val2;
		error = vie_update_register(vm, vcpuid, reg, nval, size);
		break;
	default:
		break;
	}

	if (!error) {
		rflags2 = getaddflags(size, val1, val2);
		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    &rflags);
		if (error)
			return (error);

		rflags &= ~RFLAGS_STATUS_BITS;
		rflags |= rflags2 & RFLAGS_STATUS_BITS;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    rflags, 8);
	}

	return (error);
}

static int
vie_emulate_sub(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	uint64_t nval, rflags, rflags2, val1, val2;
	enum vm_reg_name reg;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x2B:
		/*
		 * SUB r/m from r and store the result in r
		 *
		 * 2B/r			SUB r16, r/m16
		 * 2B/r			SUB r32, r/m32
		 * REX.W + 2B/r		SUB r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vm_get_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
		if (error)
			break;

		/* perform the operation and write the result */
		nval = val1 - val2;
		error = vie_update_register(vm, vcpuid, reg, nval, size);
		break;
	default:
		break;
	}

	if (!error) {
		rflags2 = getcc(size, val1, val2);
		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    &rflags);
		if (error)
			return (error);

		rflags &= ~RFLAGS_STATUS_BITS;
		rflags |= rflags2 & RFLAGS_STATUS_BITS;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    rflags, 8);
	}

	return (error);
}

static int
vie_emulate_mul(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	uint64_t rflags, rflags2, val1, val2;
	__int128_t nval;
	enum vm_reg_name reg;
	ulong_t (*getflags)(int, uint64_t, uint64_t) = NULL;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xAF:
		/*
		 * Multiply the contents of a destination register by
		 * the contents of a register or memory operand and
		 * put the signed result in the destination register.
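		 * Only the low 'size' bytes of the product are written back;
		 * the status flags are derived separately via getimulflags().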
		 *
		 * AF/r			IMUL r16, r/m16
		 * AF/r			IMUL r32, r/m32
		 * REX.W + AF/r		IMUL r64, r/m64
		 */

		getflags = getimulflags;

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vm_get_register(vm, vcpuid, reg, &val1);
		if (error != 0)
			break;

		/* get the second operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
		if (error != 0)
			break;

		/* perform the operation and write the result */
		nval = (int64_t)val1 * (int64_t)val2;

		error = vie_update_register(vm, vcpuid, reg, nval, size);

		DTRACE_PROBE4(vie__imul,
		    const char *, vie_regnum_name(vie->reg, size),
		    uint64_t, val1, uint64_t, val2, __uint128_t, nval);

		break;
	default:
		break;
	}

	if (error == 0) {
		rflags2 = getflags(size, val1, val2);
		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    &rflags);
		if (error)
			return (error);

		rflags &= ~RFLAGS_STATUS_BITS;
		rflags |= rflags2 & RFLAGS_STATUS_BITS;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    rflags, 8);

		DTRACE_PROBE2(vie__imul__rflags,
		    uint64_t, rflags, uint64_t, rflags2);
	}

	return (error);
}

static int
vie_emulate_stack_op(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	struct vm_copyinfo copyinfo[2];
	struct seg_desc ss_desc;
	uint64_t cr0, rflags, rsp, stack_gla, val;
	int error, fault, size, stackaddrsize, pushop;
	struct vm_guest_paging *paging;

	val = 0;
	size = vie->opsize;
	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
	paging = &vie->paging;

	/*
	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
	 */
	if (paging->cpu_mode == CPU_MODE_REAL) {
		stackaddrsize = 2;
	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
		/*
		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
		 * - Stack pointer size is always 64-bits.
		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
		 * - 16-bit PUSH/POP is supported by using the operand size
		 *   override prefix (66H).
		 */
		stackaddrsize = 8;
		size = vie->opsize_override ? 2 : 8;
	} else {
		/*
		 * In protected or compatibility mode the 'B' flag in the
		 * stack-segment descriptor determines the size of the
		 * stack pointer.
		 */
		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
		    __func__, error));
		if (SEG_DESC_DEF32(ss_desc.access))
			stackaddrsize = 4;
		else
			stackaddrsize = 2;
	}

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
	if (pushop) {
		rsp -= size;
	}

	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
	    &stack_gla)) {
		vm_inject_ss(vm, vcpuid, 0);
		return (0);
	}

	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
		vm_inject_ss(vm, vcpuid, 0);
		return (0);
	}

	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
		vm_inject_ac(vm, vcpuid, 0);
		return (0);
	}

	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
	    &fault);
	if (error || fault)
		return (error);

	if (pushop) {
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
		if (error == 0)
			vm_copyout(vm, vcpuid, &val, copyinfo, size);
	} else {
		vm_copyin(vm, vcpuid, copyinfo, &val, size);
		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
		rsp += size;
	}
	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));

	if (error == 0) {
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
		    stackaddrsize);
		KASSERT(error == 0, ("error %d updating rsp", error));
	}
	return (error);
}

static int
vie_emulate_push(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * PUSH is part of the group 5 extended opcodes and is identified
	 * by ModRM:reg = b110.
	 */
	if ((vie->reg & 7) != 6)
		return (EINVAL);

	error = vie_emulate_stack_op(vie, vm, vcpuid, gpa);
	return (error);
}

static int
vie_emulate_pop(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * POP is part of the group 1A extended opcodes and is identified
	 * by ModRM:reg = b000.
	 */
	if ((vie->reg & 7) != 0)
		return (EINVAL);

	error = vie_emulate_stack_op(vie, vm, vcpuid, gpa);
	return (error);
}

static int
vie_emulate_group1(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error;

	switch (vie->reg & 7) {
	case 0x1:	/* OR */
		error = vie_emulate_or(vie, vm, vcpuid, gpa);
		break;
	case 0x4:	/* AND */
		error = vie_emulate_and(vie, vm, vcpuid, gpa);
		break;
	case 0x7:	/* CMP */
		error = vie_emulate_cmp(vie, vm, vcpuid, gpa);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

static int
vie_emulate_bittest(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	uint64_t val, rflags;
	int error, bitmask, bitoff;

	/*
	 * 0F BA is a Group 8 extended opcode.
	 *
	 * Currently we only emulate the 'Bit Test' instruction which is
	 * identified by a ModR/M:reg encoding of 100b.
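	 * The destructive variants in this group (BTS, BTR, BTC) are not
	 * handled and will fail the check below.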
2149 */ 2150 if ((vie->reg & 7) != 4) 2151 return (EINVAL); 2152 2153 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 2154 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 2155 2156 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, vie->opsize); 2157 if (error) 2158 return (error); 2159 2160 /* 2161 * Intel SDM, Vol 2, Table 3-2: 2162 * "Range of Bit Positions Specified by Bit Offset Operands" 2163 */ 2164 bitmask = vie->opsize * 8 - 1; 2165 bitoff = vie->immediate & bitmask; 2166 2167 /* Copy the bit into the Carry flag in %rflags */ 2168 if (val & (1UL << bitoff)) 2169 rflags |= PSL_C; 2170 else 2171 rflags &= ~PSL_C; 2172 2173 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); 2174 KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error)); 2175 2176 return (0); 2177 } 2178 2179 static int 2180 vie_emulate_twob_group15(struct vie *vie, struct vm *vm, int vcpuid, 2181 uint64_t gpa) 2182 { 2183 int error; 2184 uint64_t buf; 2185 2186 switch (vie->reg & 7) { 2187 case 0x7: /* CLFLUSH, CLFLUSHOPT, and SFENCE */ 2188 if (vie->mod == 0x3) { 2189 /* 2190 * SFENCE. Ignore it, VM exit provides enough 2191 * barriers on its own. 2192 */ 2193 error = 0; 2194 } else { 2195 /* 2196 * CLFLUSH, CLFLUSHOPT. Only check for access 2197 * rights. 2198 */ 2199 error = vie_mmio_read(vie, vm, vcpuid, gpa, &buf, 1); 2200 } 2201 break; 2202 default: 2203 error = EINVAL; 2204 break; 2205 } 2206 2207 return (error); 2208 } 2209 2210 static int 2211 vie_emulate_clts(struct vie *vie, struct vm *vm, int vcpuid) 2212 { 2213 uint64_t val; 2214 int error __maybe_unused; 2215 2216 if (vie->paging.cpl != 0) { 2217 vm_inject_gp(vm, vcpuid); 2218 vie->num_processed = 0; 2219 return (0); 2220 } 2221 2222 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &val); 2223 ASSERT(error == 0); 2224 2225 /* Clear %cr0.TS */ 2226 val &= ~CR0_TS; 2227 2228 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, val); 2229 ASSERT(error == 0); 2230 2231 return (0); 2232 } 2233 2234 static int 2235 vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa, 2236 uint64_t *rval, int bytes) 2237 { 2238 int err; 2239 2240 if (vie->mmio_req_read.state == VR_DONE) { 2241 ASSERT(vie->mmio_req_read.bytes == bytes); 2242 ASSERT(vie->mmio_req_read.gpa == gpa); 2243 2244 *rval = vie->mmio_req_read.data; 2245 return (0); 2246 } 2247 2248 err = vm_service_mmio_read(vm, cpuid, gpa, rval, bytes); 2249 if (err == 0) { 2250 /* 2251 * A successful read from an in-kernel-emulated device may come 2252 * with side effects, so stash the result in case it's used for 2253 * an instruction which subsequently needs to issue an MMIO 2254 * write to userspace. 2255 */ 2256 ASSERT(vie->mmio_req_read.state == VR_NONE); 2257 2258 vie->mmio_req_read.bytes = bytes; 2259 vie->mmio_req_read.gpa = gpa; 2260 vie->mmio_req_read.data = *rval; 2261 vie->mmio_req_read.state = VR_DONE; 2262 2263 } else if (err == ESRCH) { 2264 /* Hope that userspace emulation can fulfill this read */ 2265 vie->mmio_req_read.bytes = bytes; 2266 vie->mmio_req_read.gpa = gpa; 2267 vie->mmio_req_read.state = VR_PENDING; 2268 vie->status |= VIES_PENDING_MMIO; 2269 } else if (err < 0) { 2270 /* 2271 * The MMIO read failed in such a way that fallback to handling 2272 * in userspace is required. 
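		 * (Contrast with the ESRCH case above, which parks the access
		 * as VR_PENDING and raises VIES_PENDING_MMIO: once userspace
		 * supplies the value through vie_fulfill_mmio(), the request
		 * becomes VR_DONE and the cached result at the top of this
		 * function is consumed when the instruction is emulated again.)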
2273 */ 2274 vie->status |= VIES_USER_FALLBACK; 2275 } 2276 return (err); 2277 } 2278 2279 static int 2280 vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa, 2281 uint64_t wval, int bytes) 2282 { 2283 int err; 2284 2285 if (vie->mmio_req_write.state == VR_DONE) { 2286 ASSERT(vie->mmio_req_write.bytes == bytes); 2287 ASSERT(vie->mmio_req_write.gpa == gpa); 2288 2289 return (0); 2290 } 2291 2292 err = vm_service_mmio_write(vm, cpuid, gpa, wval, bytes); 2293 if (err == 0) { 2294 /* 2295 * A successful write to an in-kernel-emulated device probably 2296 * results in side effects, so stash the fact that such a write 2297 * succeeded in case the operation requires other work. 2298 */ 2299 vie->mmio_req_write.bytes = bytes; 2300 vie->mmio_req_write.gpa = gpa; 2301 vie->mmio_req_write.data = wval; 2302 vie->mmio_req_write.state = VR_DONE; 2303 } else if (err == ESRCH) { 2304 /* Hope that userspace emulation can fulfill this write */ 2305 vie->mmio_req_write.bytes = bytes; 2306 vie->mmio_req_write.gpa = gpa; 2307 vie->mmio_req_write.data = wval; 2308 vie->mmio_req_write.state = VR_PENDING; 2309 vie->status |= VIES_PENDING_MMIO; 2310 } else if (err < 0) { 2311 /* 2312 * The MMIO write failed in such a way that fallback to handling 2313 * in userspace is required. 2314 */ 2315 vie->status |= VIES_USER_FALLBACK; 2316 } 2317 return (err); 2318 } 2319 2320 int 2321 vie_emulate_mmio(struct vie *vie, struct vm *vm, int vcpuid) 2322 { 2323 int error; 2324 uint64_t gpa; 2325 2326 if ((vie->status & (VIES_INST_DECODE | VIES_MMIO)) != 2327 (VIES_INST_DECODE | VIES_MMIO)) { 2328 return (EINVAL); 2329 } 2330 2331 gpa = vie->mmio_gpa; 2332 2333 switch (vie->op.op_type) { 2334 case VIE_OP_TYPE_GROUP1: 2335 error = vie_emulate_group1(vie, vm, vcpuid, gpa); 2336 break; 2337 case VIE_OP_TYPE_POP: 2338 error = vie_emulate_pop(vie, vm, vcpuid, gpa); 2339 break; 2340 case VIE_OP_TYPE_PUSH: 2341 error = vie_emulate_push(vie, vm, vcpuid, gpa); 2342 break; 2343 case VIE_OP_TYPE_CMP: 2344 error = vie_emulate_cmp(vie, vm, vcpuid, gpa); 2345 break; 2346 case VIE_OP_TYPE_MOV: 2347 error = vie_emulate_mov(vie, vm, vcpuid, gpa); 2348 break; 2349 case VIE_OP_TYPE_MOVSX: 2350 case VIE_OP_TYPE_MOVZX: 2351 error = vie_emulate_movx(vie, vm, vcpuid, gpa); 2352 break; 2353 case VIE_OP_TYPE_MOVS: 2354 error = vie_emulate_movs(vie, vm, vcpuid, gpa); 2355 break; 2356 case VIE_OP_TYPE_STOS: 2357 error = vie_emulate_stos(vie, vm, vcpuid, gpa); 2358 break; 2359 case VIE_OP_TYPE_AND: 2360 error = vie_emulate_and(vie, vm, vcpuid, gpa); 2361 break; 2362 case VIE_OP_TYPE_OR: 2363 error = vie_emulate_or(vie, vm, vcpuid, gpa); 2364 break; 2365 case VIE_OP_TYPE_SUB: 2366 error = vie_emulate_sub(vie, vm, vcpuid, gpa); 2367 break; 2368 case VIE_OP_TYPE_BITTEST: 2369 error = vie_emulate_bittest(vie, vm, vcpuid, gpa); 2370 break; 2371 case VIE_OP_TYPE_TWOB_GRP15: 2372 error = vie_emulate_twob_group15(vie, vm, vcpuid, gpa); 2373 break; 2374 case VIE_OP_TYPE_ADD: 2375 error = vie_emulate_add(vie, vm, vcpuid, gpa); 2376 break; 2377 case VIE_OP_TYPE_TEST: 2378 error = vie_emulate_test(vie, vm, vcpuid, gpa); 2379 break; 2380 case VIE_OP_TYPE_BEXTR: 2381 error = vie_emulate_bextr(vie, vm, vcpuid, gpa); 2382 break; 2383 case VIE_OP_TYPE_MUL: 2384 error = vie_emulate_mul(vie, vm, vcpuid, gpa); 2385 break; 2386 default: 2387 error = EINVAL; 2388 break; 2389 } 2390 2391 if (error == ESRCH) { 2392 /* Return to userspace with the mmio request */ 2393 return (-1); 2394 } 2395 2396 return (error); 2397 } 2398 2399 static int 2400 
vie_emulate_inout_port(struct vie *vie, struct vm *vm, int vcpuid,
    uint32_t *eax)
{
	uint32_t mask, val;
	bool in;
	int err;

	mask = vie_size2mask(vie->inout.bytes);
	in = (vie->inout.flags & INOUT_IN) != 0;

	if (!in) {
		val = *eax & mask;
	}

	if (vie->inout_req_state != VR_DONE) {
		err = vm_ioport_access(vm, vcpuid, in, vie->inout.port,
		    vie->inout.bytes, &val);
		val &= mask;
	} else {
		/*
		 * This port access was handled in userspace and the result
		 * was injected in to be handled now.
		 */
		val = vie->inout_req_val & mask;
		vie->inout_req_state = VR_NONE;
		err = 0;
	}

	if (err == ESRCH) {
		vie->status |= VIES_PENDING_INOUT;
		vie->inout_req_state = VR_PENDING;
		return (err);
	} else if (err != 0) {
		return (err);
	}

	if (in) {
		*eax = (*eax & ~mask) | val;
	}
	return (0);
}

static enum vm_reg_name
vie_inout_segname(const struct vie *vie)
{
	uint8_t segidx = vie->inout.segment;
	const enum vm_reg_name segmap[] = {
		VM_REG_GUEST_ES,
		VM_REG_GUEST_CS,
		VM_REG_GUEST_SS,
		VM_REG_GUEST_DS,
		VM_REG_GUEST_FS,
		VM_REG_GUEST_GS,
	};
	const uint8_t maxidx = (sizeof (segmap) / sizeof (segmap[0]));

	if (segidx >= maxidx) {
		panic("unexpected segment index %u", segidx);
	}
	return (segmap[segidx]);
}

static int
vie_emulate_inout_str(struct vie *vie, struct vm *vm, int vcpuid)
{
	uint8_t bytes, addrsize;
	uint64_t index, count = 0, gla, rflags;
	int prot, err, fault;
	bool in, repeat;
	enum vm_reg_name seg_reg, idx_reg;
	struct vm_copyinfo copyinfo[2];

	in = (vie->inout.flags & INOUT_IN) != 0;
	bytes = vie->inout.bytes;
	addrsize = vie->inout.addrsize;
	prot = in ? PROT_WRITE : PROT_READ;

	ASSERT(bytes == 1 || bytes == 2 || bytes == 4);
	ASSERT(addrsize == 2 || addrsize == 4 || addrsize == 8);

	idx_reg = (in) ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
	seg_reg = vie_inout_segname(vie);
	err = vm_get_register(vm, vcpuid, idx_reg, &index);
	ASSERT(err == 0);
	index = index & vie_size2mask(addrsize);

	repeat = (vie->inout.flags & INOUT_REP) != 0;

	/* Count register */
	if (repeat) {
		err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &count);
		count &= vie_size2mask(addrsize);

		if (count == 0) {
			/*
			 * If we were asked to emulate a REP INS/OUTS when the
			 * count register is zero, no further work is required.
			 */
			return (0);
		}
	} else {
		count = 1;
	}

	gla = 0;
	if (vie_get_gla(vie, vm, vcpuid, bytes, addrsize, prot, seg_reg,
	    idx_reg, &gla) != 0) {
		/* vie_get_gla() already injected the appropriate fault */
		return (0);
	}

	/*
	 * The INS/OUTS emulation currently assumes that the memory target
	 * resides within the guest system memory, rather than a device MMIO
	 * region.  If such a case becomes a necessity, that additional
	 * handling could be put in place.
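	 *
	 * (For each iteration below, the guest buffer at 'gla' is mapped via
	 * vm_copy_setup(); an OUTS copies guest memory into inout.eax before
	 * the port access, while an INS copies the port result back out to
	 * guest memory afterwards.  The index register is then advanced or
	 * retreated by 'bytes' according to PSL_D, and for REP forms %rcx is
	 * decremented, with vie_repeat() re-running the instruction while it
	 * remains non-zero.)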
2516 */ 2517 err = vm_copy_setup(vm, vcpuid, &vie->paging, gla, bytes, prot, 2518 copyinfo, nitems(copyinfo), &fault); 2519 2520 if (err) { 2521 /* Unrecoverable error */ 2522 return (err); 2523 } else if (fault) { 2524 /* Resume guest to handle fault */ 2525 return (0); 2526 } 2527 2528 if (!in) { 2529 vm_copyin(vm, vcpuid, copyinfo, &vie->inout.eax, bytes); 2530 } 2531 2532 err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax); 2533 2534 if (err == 0 && in) { 2535 vm_copyout(vm, vcpuid, &vie->inout.eax, copyinfo, bytes); 2536 } 2537 2538 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 2539 2540 if (err == 0) { 2541 err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2542 &rflags); 2543 ASSERT(err == 0); 2544 2545 /* Update index */ 2546 if (rflags & PSL_D) { 2547 index -= bytes; 2548 } else { 2549 index += bytes; 2550 } 2551 2552 /* Update index register */ 2553 err = vie_update_register(vm, vcpuid, idx_reg, index, addrsize); 2554 ASSERT(err == 0); 2555 2556 /* 2557 * Update count register only if the instruction had a repeat 2558 * prefix. 2559 */ 2560 if ((vie->inout.flags & INOUT_REP) != 0) { 2561 count--; 2562 err = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, 2563 count, addrsize); 2564 ASSERT(err == 0); 2565 2566 if (count != 0) { 2567 return (vie_repeat(vie)); 2568 } 2569 } 2570 } 2571 2572 return (err); 2573 } 2574 2575 int 2576 vie_emulate_inout(struct vie *vie, struct vm *vm, int vcpuid) 2577 { 2578 int err = 0; 2579 2580 if ((vie->status & VIES_INOUT) == 0) { 2581 return (EINVAL); 2582 } 2583 2584 if ((vie->inout.flags & INOUT_STR) == 0) { 2585 /* 2586 * For now, using the 'rep' prefixes with plain (non-string) 2587 * in/out is not supported. 2588 */ 2589 if ((vie->inout.flags & INOUT_REP) != 0) { 2590 return (EINVAL); 2591 } 2592 2593 err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax); 2594 if (err == 0 && (vie->inout.flags & INOUT_IN) != 0) { 2595 /* 2596 * With the inX access now a success, the result needs 2597 * to be stored in the guest %rax. 2598 */ 2599 err = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, 2600 vie->inout.eax); 2601 VERIFY0(err); 2602 } 2603 } else { 2604 vie->status &= ~VIES_REPEAT; 2605 err = vie_emulate_inout_str(vie, vm, vcpuid); 2606 2607 } 2608 if (err < 0) { 2609 /* 2610 * Access to an I/O port failed in such a way that fallback to 2611 * handling in userspace is required. 
2612 */ 2613 vie->status |= VIES_USER_FALLBACK; 2614 } else if (err == ESRCH) { 2615 ASSERT(vie->status & VIES_PENDING_INOUT); 2616 /* Return to userspace with the in/out request */ 2617 err = -1; 2618 } 2619 2620 return (err); 2621 } 2622 2623 int 2624 vie_emulate_other(struct vie *vie, struct vm *vm, int vcpuid) 2625 { 2626 int error; 2627 2628 if ((vie->status & (VIES_INST_DECODE | VIES_OTHER)) != 2629 (VIES_INST_DECODE | VIES_OTHER)) { 2630 return (EINVAL); 2631 } 2632 2633 switch (vie->op.op_type) { 2634 case VIE_OP_TYPE_CLTS: 2635 error = vie_emulate_clts(vie, vm, vcpuid); 2636 break; 2637 case VIE_OP_TYPE_MOV_CR: 2638 error = vie_emulate_mov_cr(vie, vm, vcpuid); 2639 break; 2640 default: 2641 error = EINVAL; 2642 break; 2643 } 2644 2645 return (error); 2646 } 2647 2648 void 2649 vie_reset(struct vie *vie) 2650 { 2651 vie->status = 0; 2652 vie->num_processed = vie->num_valid = 0; 2653 } 2654 2655 void 2656 vie_advance_pc(struct vie *vie, uint64_t *nextrip) 2657 { 2658 VERIFY((vie->status & VIES_REPEAT) == 0); 2659 2660 *nextrip += vie->num_processed; 2661 vie_reset(vie); 2662 } 2663 2664 void 2665 vie_exitinfo(const struct vie *vie, struct vm_exit *vme) 2666 { 2667 if (vie->status & VIES_USER_FALLBACK) { 2668 /* 2669 * Despite the fact that the instruction was successfully 2670 * decoded, some aspect of the emulation failed in such a way 2671 * that it is left up to userspace to complete the operation. 2672 */ 2673 vie_fallback_exitinfo(vie, vme); 2674 } else if (vie->status & VIES_MMIO) { 2675 vme->exitcode = VM_EXITCODE_MMIO; 2676 if (vie->mmio_req_read.state == VR_PENDING) { 2677 vme->u.mmio.gpa = vie->mmio_req_read.gpa; 2678 vme->u.mmio.data = 0; 2679 vme->u.mmio.bytes = vie->mmio_req_read.bytes; 2680 vme->u.mmio.read = 1; 2681 } else if (vie->mmio_req_write.state == VR_PENDING) { 2682 vme->u.mmio.gpa = vie->mmio_req_write.gpa; 2683 vme->u.mmio.data = vie->mmio_req_write.data & 2684 vie_size2mask(vie->mmio_req_write.bytes); 2685 vme->u.mmio.bytes = vie->mmio_req_write.bytes; 2686 vme->u.mmio.read = 0; 2687 } else { 2688 panic("bad pending MMIO state"); 2689 } 2690 } else if (vie->status & VIES_INOUT) { 2691 vme->exitcode = VM_EXITCODE_INOUT; 2692 vme->u.inout.port = vie->inout.port; 2693 vme->u.inout.bytes = vie->inout.bytes; 2694 if ((vie->inout.flags & INOUT_IN) != 0) { 2695 vme->u.inout.flags = INOUT_IN; 2696 vme->u.inout.eax = 0; 2697 } else { 2698 vme->u.inout.flags = 0; 2699 vme->u.inout.eax = vie->inout.eax & 2700 vie_size2mask(vie->inout.bytes); 2701 } 2702 } else { 2703 panic("no pending operation"); 2704 } 2705 } 2706 2707 /* 2708 * In the case of a decoding or verification failure, bailing out to userspace 2709 * to do the instruction emulation is our only option for now. 
2710 */ 2711 void 2712 vie_fallback_exitinfo(const struct vie *vie, struct vm_exit *vme) 2713 { 2714 if ((vie->status & VIES_INST_FETCH) == 0) { 2715 bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul)); 2716 } else { 2717 ASSERT(sizeof (vie->inst) == sizeof (vme->u.inst_emul.inst)); 2718 2719 bcopy(vie->inst, vme->u.inst_emul.inst, sizeof (vie->inst)); 2720 vme->u.inst_emul.num_valid = vie->num_valid; 2721 } 2722 vme->exitcode = VM_EXITCODE_INST_EMUL; 2723 } 2724 2725 void 2726 vie_cs_info(const struct vie *vie, struct vm *vm, int vcpuid, uint64_t *cs_base, 2727 int *cs_d) 2728 { 2729 struct seg_desc cs_desc; 2730 int error __maybe_unused; 2731 2732 error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &cs_desc); 2733 ASSERT(error == 0); 2734 2735 /* Initialization required for the paging info to be populated */ 2736 VERIFY(vie->status & VIES_INIT); 2737 switch (vie->paging.cpu_mode) { 2738 case CPU_MODE_REAL: 2739 *cs_base = cs_desc.base; 2740 *cs_d = 0; 2741 break; 2742 case CPU_MODE_PROTECTED: 2743 case CPU_MODE_COMPATIBILITY: 2744 *cs_base = cs_desc.base; 2745 *cs_d = SEG_DESC_DEF32(cs_desc.access) ? 1 : 0; 2746 break; 2747 default: 2748 *cs_base = 0; 2749 *cs_d = 0; 2750 break; 2751 } 2752 } 2753 2754 bool 2755 vie_pending(const struct vie *vie) 2756 { 2757 /* 2758 * These VIE status bits indicate conditions which must be addressed 2759 * through either device IO fulfillment (with corresponding 2760 * vie_fulfill_*()) or complete userspace emulation (followed by a 2761 * vie_reset()). 2762 */ 2763 const enum vie_status of_interest = 2764 VIES_PENDING_MMIO | VIES_PENDING_INOUT | VIES_USER_FALLBACK; 2765 2766 return ((vie->status & of_interest) != 0); 2767 } 2768 2769 bool 2770 vie_needs_fetch(const struct vie *vie) 2771 { 2772 if (vie->status & VIES_INST_FETCH) { 2773 ASSERT(vie->num_valid != 0); 2774 return (false); 2775 } 2776 return (true); 2777 } 2778 2779 static int 2780 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) 2781 { 2782 KASSERT(size == 1 || size == 2 || size == 4 || size == 8, 2783 ("%s: invalid size %d", __func__, size)); 2784 KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); 2785 2786 if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) 2787 return (0); 2788 2789 return ((gla & (size - 1)) ? 1 : 0); 2790 } 2791 2792 static int 2793 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) 2794 { 2795 uint64_t mask; 2796 2797 if (cpu_mode != CPU_MODE_64BIT) 2798 return (0); 2799 2800 /* 2801 * The value of the bit 47 in the 'gla' should be replicated in the 2802 * most significant 16 bits. 
2803 */ 2804 mask = ~((1UL << 48) - 1); 2805 if (gla & (1UL << 47)) 2806 return ((gla & mask) != mask); 2807 else 2808 return ((gla & mask) != 0); 2809 } 2810 2811 static uint64_t 2812 vie_size2mask(int size) 2813 { 2814 KASSERT(size == 1 || size == 2 || size == 4 || size == 8, 2815 ("vie_size2mask: invalid size %d", size)); 2816 return (size2mask[size]); 2817 } 2818 2819 static int 2820 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, 2821 struct seg_desc *desc, uint64_t offset, int length, int addrsize, 2822 int prot, uint64_t *gla) 2823 { 2824 uint64_t firstoff, low_limit, high_limit, segbase; 2825 int glasize, type; 2826 2827 KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, 2828 ("%s: invalid segment %d", __func__, seg)); 2829 KASSERT(length == 1 || length == 2 || length == 4 || length == 8, 2830 ("%s: invalid operand size %d", __func__, length)); 2831 KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, 2832 ("%s: invalid prot %x", __func__, prot)); 2833 2834 firstoff = offset; 2835 if (cpu_mode == CPU_MODE_64BIT) { 2836 KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " 2837 "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); 2838 glasize = 8; 2839 } else { 2840 KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " 2841 "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); 2842 glasize = 4; 2843 /* 2844 * If the segment selector is loaded with a NULL selector 2845 * then the descriptor is unusable and attempting to use 2846 * it results in a #GP(0). 2847 */ 2848 if (SEG_DESC_UNUSABLE(desc->access)) 2849 return (-1); 2850 2851 /* 2852 * The processor generates a #NP exception when a segment 2853 * register is loaded with a selector that points to a 2854 * descriptor that is not present. If this was the case then 2855 * it would have been checked before the VM-exit. 2856 */ 2857 KASSERT(SEG_DESC_PRESENT(desc->access), 2858 ("segment %d not present: %x", seg, desc->access)); 2859 2860 /* 2861 * The descriptor type must indicate a code/data segment. 2862 */ 2863 type = SEG_DESC_TYPE(desc->access); 2864 KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " 2865 "descriptor type %x", seg, type)); 2866 2867 if (prot & PROT_READ) { 2868 /* #GP on a read access to a exec-only code segment */ 2869 if ((type & 0xA) == 0x8) 2870 return (-1); 2871 } 2872 2873 if (prot & PROT_WRITE) { 2874 /* 2875 * #GP on a write access to a code segment or a 2876 * read-only data segment. 2877 */ 2878 if (type & 0x8) /* code segment */ 2879 return (-1); 2880 2881 if ((type & 0xA) == 0) /* read-only data seg */ 2882 return (-1); 2883 } 2884 2885 /* 2886 * 'desc->limit' is fully expanded taking granularity into 2887 * account. 2888 */ 2889 if ((type & 0xC) == 0x4) { 2890 /* expand-down data segment */ 2891 low_limit = desc->limit + 1; 2892 high_limit = SEG_DESC_DEF32(desc->access) ? 2893 0xffffffff : 0xffff; 2894 } else { 2895 /* code segment or expand-up data segment */ 2896 low_limit = 0; 2897 high_limit = desc->limit; 2898 } 2899 2900 while (length > 0) { 2901 offset &= vie_size2mask(addrsize); 2902 if (offset < low_limit || offset > high_limit) 2903 return (-1); 2904 offset++; 2905 length--; 2906 } 2907 } 2908 2909 /* 2910 * In 64-bit mode all segments except %fs and %gs have a segment 2911 * base address of 0. 
2912 */ 2913 if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && 2914 seg != VM_REG_GUEST_GS) { 2915 segbase = 0; 2916 } else { 2917 segbase = desc->base; 2918 } 2919 2920 /* 2921 * Truncate 'firstoff' to the effective address size before adding 2922 * it to the segment base. 2923 */ 2924 firstoff &= vie_size2mask(addrsize); 2925 *gla = (segbase + firstoff) & vie_size2mask(glasize); 2926 return (0); 2927 } 2928 2929 void 2930 vie_init_mmio(struct vie *vie, const char *inst_bytes, uint8_t inst_length, 2931 const struct vm_guest_paging *paging, uint64_t gpa) 2932 { 2933 KASSERT(inst_length <= VIE_INST_SIZE, 2934 ("%s: invalid instruction length (%d)", __func__, inst_length)); 2935 2936 bzero(vie, sizeof (struct vie)); 2937 2938 vie->base_register = VM_REG_LAST; 2939 vie->index_register = VM_REG_LAST; 2940 vie->segment_register = VM_REG_LAST; 2941 vie->status = VIES_INIT | VIES_MMIO; 2942 2943 if (inst_length != 0) { 2944 bcopy(inst_bytes, vie->inst, inst_length); 2945 vie->num_valid = inst_length; 2946 vie->status |= VIES_INST_FETCH; 2947 } 2948 2949 vie->paging = *paging; 2950 vie->mmio_gpa = gpa; 2951 } 2952 2953 void 2954 vie_init_inout(struct vie *vie, const struct vm_inout *inout, uint8_t inst_len, 2955 const struct vm_guest_paging *paging) 2956 { 2957 bzero(vie, sizeof (struct vie)); 2958 2959 vie->status = VIES_INIT | VIES_INOUT; 2960 2961 vie->inout = *inout; 2962 vie->paging = *paging; 2963 2964 /* 2965 * Since VMX/SVM assists already decoded the nature of the in/out 2966 * instruction, let the status reflect that. 2967 */ 2968 vie->status |= VIES_INST_FETCH | VIES_INST_DECODE; 2969 vie->num_processed = inst_len; 2970 } 2971 2972 void 2973 vie_init_other(struct vie *vie, const struct vm_guest_paging *paging) 2974 { 2975 bzero(vie, sizeof (struct vie)); 2976 2977 vie->base_register = VM_REG_LAST; 2978 vie->index_register = VM_REG_LAST; 2979 vie->segment_register = VM_REG_LAST; 2980 vie->status = VIES_INIT | VIES_OTHER; 2981 2982 vie->paging = *paging; 2983 } 2984 2985 int 2986 vie_fulfill_mmio(struct vie *vie, const struct vm_mmio *result) 2987 { 2988 struct vie_mmio *pending; 2989 2990 if ((vie->status & VIES_MMIO) == 0 || 2991 (vie->status & VIES_PENDING_MMIO) == 0) { 2992 return (EINVAL); 2993 } 2994 2995 if (result->read) { 2996 pending = &vie->mmio_req_read; 2997 } else { 2998 pending = &vie->mmio_req_write; 2999 } 3000 3001 if (pending->state != VR_PENDING || 3002 pending->bytes != result->bytes || pending->gpa != result->gpa) { 3003 return (EINVAL); 3004 } 3005 3006 if (result->read) { 3007 pending->data = result->data & vie_size2mask(pending->bytes); 3008 } 3009 pending->state = VR_DONE; 3010 vie->status &= ~VIES_PENDING_MMIO; 3011 3012 return (0); 3013 } 3014 3015 int 3016 vie_fulfill_inout(struct vie *vie, const struct vm_inout *result) 3017 { 3018 if ((vie->status & VIES_INOUT) == 0 || 3019 (vie->status & VIES_PENDING_INOUT) == 0) { 3020 return (EINVAL); 3021 } 3022 if ((vie->inout.flags & INOUT_IN) != (result->flags & INOUT_IN) || 3023 vie->inout.bytes != result->bytes || 3024 vie->inout.port != result->port) { 3025 return (EINVAL); 3026 } 3027 3028 if (result->flags & INOUT_IN) { 3029 vie->inout_req_val = result->eax & 3030 vie_size2mask(vie->inout.bytes); 3031 } 3032 vie->inout_req_state = VR_DONE; 3033 vie->status &= ~(VIES_PENDING_INOUT); 3034 3035 return (0); 3036 } 3037 3038 uint64_t 3039 vie_mmio_gpa(const struct vie *vie) 3040 { 3041 return (vie->mmio_gpa); 3042 } 3043 3044 static int 3045 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) 3046 
{ 3047 int error_code = 0; 3048 3049 if (pte & PG_V) 3050 error_code |= PGEX_P; 3051 if (prot & PROT_WRITE) 3052 error_code |= PGEX_W; 3053 if (usermode) 3054 error_code |= PGEX_U; 3055 if (rsvd) 3056 error_code |= PGEX_RSV; 3057 if (prot & PROT_EXEC) 3058 error_code |= PGEX_I; 3059 3060 return (error_code); 3061 } 3062 3063 static void 3064 ptp_release(vm_page_t **vmp) 3065 { 3066 if (*vmp != NULL) { 3067 (void) vmp_release(*vmp); 3068 *vmp = NULL; 3069 } 3070 } 3071 3072 static void * 3073 ptp_hold(struct vm *vm, int vcpu, uintptr_t gpa, size_t len, vm_page_t **vmp) 3074 { 3075 vm_client_t *vmc = vm_get_vmclient(vm, vcpu); 3076 const uintptr_t hold_gpa = gpa & PAGEMASK; 3077 3078 /* Hold must not cross a page boundary */ 3079 VERIFY3U(gpa + len, <=, hold_gpa + PAGESIZE); 3080 3081 if (*vmp != NULL) { 3082 (void) vmp_release(*vmp); 3083 } 3084 3085 *vmp = vmc_hold(vmc, hold_gpa, PROT_READ | PROT_WRITE); 3086 if (*vmp == NULL) { 3087 return (NULL); 3088 } 3089 3090 return ((caddr_t)vmp_get_writable(*vmp) + (gpa - hold_gpa)); 3091 } 3092 3093 static int 3094 _vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3095 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only) 3096 { 3097 int nlevels, pfcode; 3098 int ptpshift = 0, ptpindex = 0; 3099 uint64_t ptpphys; 3100 uint64_t *ptpbase = NULL, pte = 0, pgsize = 0; 3101 vm_page_t *cookie = NULL; 3102 const bool usermode = paging->cpl == 3; 3103 const bool writable = (prot & PROT_WRITE) != 0; 3104 3105 *guest_fault = 0; 3106 restart: 3107 ptpphys = paging->cr3; /* root of the page tables */ 3108 ptp_release(&cookie); 3109 3110 if (vie_canonical_check(paging->cpu_mode, gla)) { 3111 /* 3112 * XXX assuming a non-stack reference otherwise a stack fault 3113 * should be generated. 3114 */ 3115 if (!check_only) 3116 vm_inject_gp(vm, vcpuid); 3117 *guest_fault = 1; 3118 return (0); 3119 } 3120 3121 if (paging->paging_mode == PAGING_MODE_FLAT) { 3122 *gpa = gla; 3123 return (0); 3124 } 3125 3126 if (paging->paging_mode == PAGING_MODE_32) { 3127 uint32_t *ptpbase32, pte32; 3128 3129 nlevels = 2; 3130 while (--nlevels >= 0) { 3131 /* Zero out the lower 12 bits. */ 3132 ptpphys &= ~0xfff; 3133 3134 ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, 3135 &cookie); 3136 3137 if (ptpbase32 == NULL) { 3138 return (EFAULT); 3139 } 3140 3141 ptpshift = PAGE_SHIFT + nlevels * 10; 3142 ptpindex = (gla >> ptpshift) & 0x3FF; 3143 pgsize = 1UL << ptpshift; 3144 3145 pte32 = ptpbase32[ptpindex]; 3146 3147 if ((pte32 & PG_V) == 0 || 3148 (usermode && (pte32 & PG_U) == 0) || 3149 (writable && (pte32 & PG_RW) == 0)) { 3150 if (!check_only) { 3151 pfcode = pf_error_code(usermode, prot, 3152 0, pte32); 3153 vm_inject_pf(vm, vcpuid, pfcode, gla); 3154 } 3155 3156 ptp_release(&cookie); 3157 *guest_fault = 1; 3158 return (0); 3159 } 3160 3161 /* 3162 * Emulate the x86 MMU's management of the accessed 3163 * and dirty flags. While the accessed flag is set 3164 * at every level of the page table, the dirty flag 3165 * is only set at the last level providing the guest 3166 * physical address. 
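			 * (In other words, PG_A is set, via the cmpset below,
			 * on every entry that is walked, while PG_M is set
			 * only on the final entry and only when the access is
			 * a write, mirroring what a hardware page walk does.)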
3167 */ 3168 if (!check_only && (pte32 & PG_A) == 0) { 3169 if (atomic_cmpset_32(&ptpbase32[ptpindex], 3170 pte32, pte32 | PG_A) == 0) { 3171 goto restart; 3172 } 3173 } 3174 3175 /* XXX must be ignored if CR4.PSE=0 */ 3176 if (nlevels > 0 && (pte32 & PG_PS) != 0) 3177 break; 3178 3179 ptpphys = pte32; 3180 } 3181 3182 /* Set the dirty bit in the page table entry if necessary */ 3183 if (!check_only && writable && (pte32 & PG_M) == 0) { 3184 if (atomic_cmpset_32(&ptpbase32[ptpindex], 3185 pte32, pte32 | PG_M) == 0) { 3186 goto restart; 3187 } 3188 } 3189 3190 /* Zero out the lower 'ptpshift' bits */ 3191 pte32 >>= ptpshift; pte32 <<= ptpshift; 3192 *gpa = pte32 | (gla & (pgsize - 1)); 3193 ptp_release(&cookie); 3194 return (0); 3195 } 3196 3197 if (paging->paging_mode == PAGING_MODE_PAE) { 3198 /* Zero out the lower 5 bits and the upper 32 bits */ 3199 ptpphys &= 0xffffffe0UL; 3200 3201 ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof (*ptpbase) * 4, 3202 &cookie); 3203 if (ptpbase == NULL) { 3204 return (EFAULT); 3205 } 3206 3207 ptpindex = (gla >> 30) & 0x3; 3208 3209 pte = ptpbase[ptpindex]; 3210 3211 if ((pte & PG_V) == 0) { 3212 if (!check_only) { 3213 pfcode = pf_error_code(usermode, prot, 0, pte); 3214 vm_inject_pf(vm, vcpuid, pfcode, gla); 3215 } 3216 3217 ptp_release(&cookie); 3218 *guest_fault = 1; 3219 return (0); 3220 } 3221 3222 ptpphys = pte; 3223 3224 nlevels = 2; 3225 } else { 3226 nlevels = 4; 3227 } 3228 3229 while (--nlevels >= 0) { 3230 /* Zero out the lower 12 bits and the upper 12 bits */ 3231 ptpphys &= 0x000ffffffffff000UL; 3232 3233 ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie); 3234 if (ptpbase == NULL) { 3235 return (EFAULT); 3236 } 3237 3238 ptpshift = PAGE_SHIFT + nlevels * 9; 3239 ptpindex = (gla >> ptpshift) & 0x1FF; 3240 pgsize = 1UL << ptpshift; 3241 3242 pte = ptpbase[ptpindex]; 3243 3244 if ((pte & PG_V) == 0 || 3245 (usermode && (pte & PG_U) == 0) || 3246 (writable && (pte & PG_RW) == 0)) { 3247 if (!check_only) { 3248 pfcode = pf_error_code(usermode, prot, 0, pte); 3249 vm_inject_pf(vm, vcpuid, pfcode, gla); 3250 } 3251 3252 ptp_release(&cookie); 3253 *guest_fault = 1; 3254 return (0); 3255 } 3256 3257 /* Set the accessed bit in the page table entry */ 3258 if (!check_only && (pte & PG_A) == 0) { 3259 if (atomic_cmpset_64(&ptpbase[ptpindex], 3260 pte, pte | PG_A) == 0) { 3261 goto restart; 3262 } 3263 } 3264 3265 if (nlevels > 0 && (pte & PG_PS) != 0) { 3266 if (pgsize > 1 * GB) { 3267 if (!check_only) { 3268 pfcode = pf_error_code(usermode, prot, 3269 1, pte); 3270 vm_inject_pf(vm, vcpuid, pfcode, gla); 3271 } 3272 3273 ptp_release(&cookie); 3274 *guest_fault = 1; 3275 return (0); 3276 } 3277 break; 3278 } 3279 3280 ptpphys = pte; 3281 } 3282 3283 /* Set the dirty bit in the page table entry if necessary */ 3284 if (!check_only && writable && (pte & PG_M) == 0) { 3285 if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) 3286 goto restart; 3287 } 3288 ptp_release(&cookie); 3289 3290 /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ 3291 pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; 3292 *gpa = pte | (gla & (pgsize - 1)); 3293 return (0); 3294 } 3295 3296 int 3297 vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3298 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) 3299 { 3300 3301 return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, 3302 false)); 3303 } 3304 3305 int 3306 vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3307 uint64_t 
gla, int prot, uint64_t *gpa, int *guest_fault) 3308 { 3309 3310 return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, 3311 true)); 3312 } 3313 3314 int 3315 vie_fetch_instruction(struct vie *vie, struct vm *vm, int vcpuid, uint64_t rip, 3316 int *faultptr) 3317 { 3318 struct vm_copyinfo copyinfo[2]; 3319 int error, prot; 3320 3321 if ((vie->status & VIES_INIT) == 0) { 3322 return (EINVAL); 3323 } 3324 3325 prot = PROT_READ | PROT_EXEC; 3326 error = vm_copy_setup(vm, vcpuid, &vie->paging, rip, VIE_INST_SIZE, 3327 prot, copyinfo, nitems(copyinfo), faultptr); 3328 if (error || *faultptr) 3329 return (error); 3330 3331 vm_copyin(vm, vcpuid, copyinfo, vie->inst, VIE_INST_SIZE); 3332 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 3333 vie->num_valid = VIE_INST_SIZE; 3334 vie->status |= VIES_INST_FETCH; 3335 return (0); 3336 } 3337 3338 static int 3339 vie_peek(struct vie *vie, uint8_t *x) 3340 { 3341 3342 if (vie->num_processed < vie->num_valid) { 3343 *x = vie->inst[vie->num_processed]; 3344 return (0); 3345 } else 3346 return (-1); 3347 } 3348 3349 static void 3350 vie_advance(struct vie *vie) 3351 { 3352 3353 vie->num_processed++; 3354 } 3355 3356 static bool 3357 segment_override(uint8_t x, int *seg) 3358 { 3359 3360 switch (x) { 3361 case 0x2E: 3362 *seg = VM_REG_GUEST_CS; 3363 break; 3364 case 0x36: 3365 *seg = VM_REG_GUEST_SS; 3366 break; 3367 case 0x3E: 3368 *seg = VM_REG_GUEST_DS; 3369 break; 3370 case 0x26: 3371 *seg = VM_REG_GUEST_ES; 3372 break; 3373 case 0x64: 3374 *seg = VM_REG_GUEST_FS; 3375 break; 3376 case 0x65: 3377 *seg = VM_REG_GUEST_GS; 3378 break; 3379 default: 3380 return (false); 3381 } 3382 return (true); 3383 } 3384 3385 static int 3386 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) 3387 { 3388 uint8_t x; 3389 3390 while (1) { 3391 if (vie_peek(vie, &x)) 3392 return (-1); 3393 3394 if (x == 0x66) 3395 vie->opsize_override = 1; 3396 else if (x == 0x67) 3397 vie->addrsize_override = 1; 3398 else if (x == 0xF3) 3399 vie->repz_present = 1; 3400 else if (x == 0xF2) 3401 vie->repnz_present = 1; 3402 else if (segment_override(x, &vie->segment_register)) 3403 vie->segment_override = 1; 3404 else 3405 break; 3406 3407 vie_advance(vie); 3408 } 3409 3410 /* 3411 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: 3412 * - Only one REX prefix is allowed per instruction. 3413 * - The REX prefix must immediately precede the opcode byte or the 3414 * escape opcode byte. 3415 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) 3416 * the mandatory prefix must come before the REX prefix. 3417 */ 3418 if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { 3419 vie->rex_present = 1; 3420 vie->rex_w = x & 0x8 ? 1 : 0; 3421 vie->rex_r = x & 0x4 ? 1 : 0; 3422 vie->rex_x = x & 0x2 ? 1 : 0; 3423 vie->rex_b = x & 0x1 ? 1 : 0; 3424 vie_advance(vie); 3425 } 3426 3427 /* 3428 * § 2.3.5, "The VEX Prefix", SDM Vol 2. 3429 */ 3430 if ((cpu_mode == CPU_MODE_64BIT || 3431 cpu_mode == CPU_MODE_COMPATIBILITY) && x == 0xC4) { 3432 const struct vie_op *optab; 3433 3434 /* 3-byte VEX prefix. */ 3435 vie->vex_present = 1; 3436 3437 vie_advance(vie); 3438 if (vie_peek(vie, &x)) 3439 return (-1); 3440 3441 /* 3442 * 2nd byte: [R', X', B', mmmmm[4:0]]. Bits are inverted 3443 * relative to REX encoding. 3444 */ 3445 vie->rex_r = x & 0x80 ? 0 : 1; 3446 vie->rex_x = x & 0x40 ? 0 : 1; 3447 vie->rex_b = x & 0x20 ? 0 : 1; 3448 3449 switch (x & 0x1F) { 3450 case 0x2: 3451 /* 0F 38. 
*/ 3452 optab = three_byte_opcodes_0f38; 3453 break; 3454 case 0x1: 3455 /* 0F class - nothing handled here yet. */ 3456 /* FALLTHROUGH */ 3457 case 0x3: 3458 /* 0F 3A class - nothing handled here yet. */ 3459 /* FALLTHROUGH */ 3460 default: 3461 /* Reserved (#UD). */ 3462 return (-1); 3463 } 3464 3465 vie_advance(vie); 3466 if (vie_peek(vie, &x)) 3467 return (-1); 3468 3469 /* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */ 3470 vie->rex_w = x & 0x80 ? 1 : 0; 3471 3472 vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3); 3473 vie->vex_l = !!(x & 0x4); 3474 vie->vex_pp = (x & 0x3); 3475 3476 /* PP: 1=66 2=F3 3=F2 prefixes. */ 3477 switch (vie->vex_pp) { 3478 case 0x1: 3479 vie->opsize_override = 1; 3480 break; 3481 case 0x2: 3482 vie->repz_present = 1; 3483 break; 3484 case 0x3: 3485 vie->repnz_present = 1; 3486 break; 3487 } 3488 3489 vie_advance(vie); 3490 3491 /* Opcode, sans literal prefix prefix. */ 3492 if (vie_peek(vie, &x)) 3493 return (-1); 3494 3495 vie->op = optab[x]; 3496 if (vie->op.op_type == VIE_OP_TYPE_NONE) 3497 return (-1); 3498 3499 vie_advance(vie); 3500 } 3501 3502 /* 3503 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 3504 */ 3505 if (cpu_mode == CPU_MODE_64BIT) { 3506 /* 3507 * Default address size is 64-bits and default operand size 3508 * is 32-bits. 3509 */ 3510 vie->addrsize = vie->addrsize_override ? 4 : 8; 3511 if (vie->rex_w) 3512 vie->opsize = 8; 3513 else if (vie->opsize_override) 3514 vie->opsize = 2; 3515 else 3516 vie->opsize = 4; 3517 } else if (cs_d) { 3518 /* Default address and operand sizes are 32-bits */ 3519 vie->addrsize = vie->addrsize_override ? 2 : 4; 3520 vie->opsize = vie->opsize_override ? 2 : 4; 3521 } else { 3522 /* Default address and operand sizes are 16-bits */ 3523 vie->addrsize = vie->addrsize_override ? 4 : 2; 3524 vie->opsize = vie->opsize_override ? 4 : 2; 3525 } 3526 return (0); 3527 } 3528 3529 static int 3530 decode_two_byte_opcode(struct vie *vie) 3531 { 3532 uint8_t x; 3533 3534 if (vie_peek(vie, &x)) 3535 return (-1); 3536 3537 vie->op = two_byte_opcodes[x]; 3538 3539 if (vie->op.op_type == VIE_OP_TYPE_NONE) 3540 return (-1); 3541 3542 vie_advance(vie); 3543 return (0); 3544 } 3545 3546 static int 3547 decode_opcode(struct vie *vie) 3548 { 3549 uint8_t x; 3550 3551 if (vie_peek(vie, &x)) 3552 return (-1); 3553 3554 /* Already did this via VEX prefix. */ 3555 if (vie->op.op_type != VIE_OP_TYPE_NONE) 3556 return (0); 3557 3558 vie->op = one_byte_opcodes[x]; 3559 3560 if (vie->op.op_type == VIE_OP_TYPE_NONE) 3561 return (-1); 3562 3563 vie_advance(vie); 3564 3565 if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) 3566 return (decode_two_byte_opcode(vie)); 3567 3568 return (0); 3569 } 3570 3571 static int 3572 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) 3573 { 3574 uint8_t x; 3575 /* 3576 * Handling mov-to/from-cr is special since it is not issuing 3577 * mmio/pio requests and can be done in real mode. We must bypass some 3578 * of the other existing decoding restrictions for it. 3579 */ 3580 const bool is_movcr = ((vie->op.op_flags & VIE_OP_F_REG_REG) != 0); 3581 3582 if (vie->op.op_flags & VIE_OP_F_NO_MODRM) 3583 return (0); 3584 3585 if (cpu_mode == CPU_MODE_REAL && !is_movcr) 3586 return (-1); 3587 3588 if (vie_peek(vie, &x)) 3589 return (-1); 3590 3591 vie->mod = (x >> 6) & 0x3; 3592 vie->rm = (x >> 0) & 0x7; 3593 vie->reg = (x >> 3) & 0x7; 3594 3595 /* 3596 * A direct addressing mode makes no sense in the context of an EPT 3597 * fault. There has to be a memory access involved to cause the 3598 * EPT fault. 
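	 * (The register-only mov to/from %cr forms flagged with
	 * VIE_OP_F_REG_REG are the exception, which is why 'is_movcr' is
	 * checked alongside the mode test below.)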
3599 */ 3600 if (vie->mod == VIE_MOD_DIRECT && !is_movcr) 3601 return (-1); 3602 3603 if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || 3604 (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { 3605 /* 3606 * Table 2-5: Special Cases of REX Encodings 3607 * 3608 * mod=0, r/m=5 is used in the compatibility mode to 3609 * indicate a disp32 without a base register. 3610 * 3611 * mod!=3, r/m=4 is used in the compatibility mode to 3612 * indicate that the SIB byte is present. 3613 * 3614 * The 'b' bit in the REX prefix is don't care in 3615 * this case. 3616 */ 3617 } else { 3618 vie->rm |= (vie->rex_b << 3); 3619 } 3620 3621 vie->reg |= (vie->rex_r << 3); 3622 3623 /* SIB */ 3624 if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) 3625 goto done; 3626 3627 vie->base_register = gpr_map[vie->rm]; 3628 3629 switch (vie->mod) { 3630 case VIE_MOD_INDIRECT_DISP8: 3631 vie->disp_bytes = 1; 3632 break; 3633 case VIE_MOD_INDIRECT_DISP32: 3634 vie->disp_bytes = 4; 3635 break; 3636 case VIE_MOD_INDIRECT: 3637 if (vie->rm == VIE_RM_DISP32) { 3638 vie->disp_bytes = 4; 3639 /* 3640 * Table 2-7. RIP-Relative Addressing 3641 * 3642 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32 3643 * whereas in compatibility mode it just implies disp32. 3644 */ 3645 3646 if (cpu_mode == CPU_MODE_64BIT) 3647 vie->base_register = VM_REG_GUEST_RIP; 3648 else 3649 vie->base_register = VM_REG_LAST; 3650 } 3651 break; 3652 } 3653 3654 done: 3655 vie_advance(vie); 3656 3657 return (0); 3658 } 3659 3660 static int 3661 decode_sib(struct vie *vie) 3662 { 3663 uint8_t x; 3664 3665 /* Proceed only if SIB byte is present */ 3666 if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) 3667 return (0); 3668 3669 if (vie_peek(vie, &x)) 3670 return (-1); 3671 3672 /* De-construct the SIB byte */ 3673 vie->ss = (x >> 6) & 0x3; 3674 vie->index = (x >> 3) & 0x7; 3675 vie->base = (x >> 0) & 0x7; 3676 3677 /* Apply the REX prefix modifiers */ 3678 vie->index |= vie->rex_x << 3; 3679 vie->base |= vie->rex_b << 3; 3680 3681 switch (vie->mod) { 3682 case VIE_MOD_INDIRECT_DISP8: 3683 vie->disp_bytes = 1; 3684 break; 3685 case VIE_MOD_INDIRECT_DISP32: 3686 vie->disp_bytes = 4; 3687 break; 3688 } 3689 3690 if (vie->mod == VIE_MOD_INDIRECT && 3691 (vie->base == 5 || vie->base == 13)) { 3692 /* 3693 * Special case when base register is unused if mod = 0 3694 * and base = %rbp or %r13. 3695 * 3696 * Documented in: 3697 * Table 2-3: 32-bit Addressing Forms with the SIB Byte 3698 * Table 2-5: Special Cases of REX Encodings 3699 */ 3700 vie->disp_bytes = 4; 3701 } else { 3702 vie->base_register = gpr_map[vie->base]; 3703 } 3704 3705 /* 3706 * All encodings of 'index' are valid except for %rsp (4). 
3707 * 3708 * Documented in: 3709 * Table 2-3: 32-bit Addressing Forms with the SIB Byte 3710 * Table 2-5: Special Cases of REX Encodings 3711 */ 3712 if (vie->index != 4) 3713 vie->index_register = gpr_map[vie->index]; 3714 3715 /* 'scale' makes sense only in the context of an index register */ 3716 if (vie->index_register < VM_REG_LAST) 3717 vie->scale = 1 << vie->ss; 3718 3719 vie_advance(vie); 3720 3721 return (0); 3722 } 3723 3724 static int 3725 decode_displacement(struct vie *vie) 3726 { 3727 int n, i; 3728 uint8_t x; 3729 3730 union { 3731 char buf[4]; 3732 int8_t signed8; 3733 int32_t signed32; 3734 } u; 3735 3736 if ((n = vie->disp_bytes) == 0) 3737 return (0); 3738 3739 if (n != 1 && n != 4) 3740 panic("decode_displacement: invalid disp_bytes %d", n); 3741 3742 for (i = 0; i < n; i++) { 3743 if (vie_peek(vie, &x)) 3744 return (-1); 3745 3746 u.buf[i] = x; 3747 vie_advance(vie); 3748 } 3749 3750 if (n == 1) 3751 vie->displacement = u.signed8; /* sign-extended */ 3752 else 3753 vie->displacement = u.signed32; /* sign-extended */ 3754 3755 return (0); 3756 } 3757 3758 static int 3759 decode_immediate(struct vie *vie) 3760 { 3761 int i, n; 3762 uint8_t x; 3763 union { 3764 char buf[4]; 3765 int8_t signed8; 3766 int16_t signed16; 3767 int32_t signed32; 3768 } u; 3769 3770 /* Figure out immediate operand size (if any) */ 3771 if (vie->op.op_flags & VIE_OP_F_IMM) { 3772 /* 3773 * Section 2.2.1.5 "Immediates", Intel SDM: 3774 * In 64-bit mode the typical size of immediate operands 3775 * remains 32-bits. When the operand size if 64-bits, the 3776 * processor sign-extends all immediates to 64-bits prior 3777 * to their use. 3778 */ 3779 if (vie->opsize == 4 || vie->opsize == 8) 3780 vie->imm_bytes = 4; 3781 else 3782 vie->imm_bytes = 2; 3783 } else if (vie->op.op_flags & VIE_OP_F_IMM8) { 3784 vie->imm_bytes = 1; 3785 } 3786 3787 if ((n = vie->imm_bytes) == 0) 3788 return (0); 3789 3790 KASSERT(n == 1 || n == 2 || n == 4, 3791 ("%s: invalid number of immediate bytes: %d", __func__, n)); 3792 3793 for (i = 0; i < n; i++) { 3794 if (vie_peek(vie, &x)) 3795 return (-1); 3796 3797 u.buf[i] = x; 3798 vie_advance(vie); 3799 } 3800 3801 /* sign-extend the immediate value before use */ 3802 if (n == 1) 3803 vie->immediate = u.signed8; 3804 else if (n == 2) 3805 vie->immediate = u.signed16; 3806 else 3807 vie->immediate = u.signed32; 3808 3809 return (0); 3810 } 3811 3812 static int 3813 decode_moffset(struct vie *vie) 3814 { 3815 int i, n; 3816 uint8_t x; 3817 union { 3818 char buf[8]; 3819 uint64_t u64; 3820 } u; 3821 3822 if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) 3823 return (0); 3824 3825 /* 3826 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: 3827 * The memory offset size follows the address-size of the instruction. 3828 */ 3829 n = vie->addrsize; 3830 KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); 3831 3832 u.u64 = 0; 3833 for (i = 0; i < n; i++) { 3834 if (vie_peek(vie, &x)) 3835 return (-1); 3836 3837 u.buf[i] = x; 3838 vie_advance(vie); 3839 } 3840 vie->displacement = u.u64; 3841 return (0); 3842 } 3843 3844 /* 3845 * Verify that the 'guest linear address' provided as collateral of the nested 3846 * page table fault matches with our instruction decoding. 
3847 */ 3848 int 3849 vie_verify_gla(struct vie *vie, struct vm *vm, int cpuid, uint64_t gla) 3850 { 3851 int error; 3852 uint64_t base, segbase, idx, gla2; 3853 enum vm_reg_name seg; 3854 struct seg_desc desc; 3855 3856 ASSERT((vie->status & VIES_INST_DECODE) != 0); 3857 3858 /* 3859 * If there was no valid GLA context with the exit, or the decoded 3860 * instruction acts on more than one address, verification is done. 3861 */ 3862 if (gla == VIE_INVALID_GLA || 3863 (vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) != 0) { 3864 return (0); 3865 } 3866 3867 base = 0; 3868 if (vie->base_register != VM_REG_LAST) { 3869 error = vm_get_register(vm, cpuid, vie->base_register, &base); 3870 if (error) { 3871 printf("verify_gla: error %d getting base reg %d\n", 3872 error, vie->base_register); 3873 return (-1); 3874 } 3875 3876 /* 3877 * RIP-relative addressing starts from the following 3878 * instruction 3879 */ 3880 if (vie->base_register == VM_REG_GUEST_RIP) 3881 base += vie->num_processed; 3882 } 3883 3884 idx = 0; 3885 if (vie->index_register != VM_REG_LAST) { 3886 error = vm_get_register(vm, cpuid, vie->index_register, &idx); 3887 if (error) { 3888 printf("verify_gla: error %d getting index reg %d\n", 3889 error, vie->index_register); 3890 return (-1); 3891 } 3892 } 3893 3894 /* 3895 * From "Specifying a Segment Selector", Intel SDM, Vol 1 3896 * 3897 * In 64-bit mode, segmentation is generally (but not 3898 * completely) disabled. The exceptions are the FS and GS 3899 * segments. 3900 * 3901 * In legacy IA-32 mode, when the ESP or EBP register is used 3902 * as the base, the SS segment is the default segment. For 3903 * other data references, except when relative to stack or 3904 * string destination the DS segment is the default. These 3905 * can be overridden to allow other segments to be accessed. 
3906 */ 3907 if (vie->segment_override) { 3908 seg = vie->segment_register; 3909 } else if (vie->base_register == VM_REG_GUEST_RSP || 3910 vie->base_register == VM_REG_GUEST_RBP) { 3911 seg = VM_REG_GUEST_SS; 3912 } else { 3913 seg = VM_REG_GUEST_DS; 3914 } 3915 if (vie->paging.cpu_mode == CPU_MODE_64BIT && 3916 seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) { 3917 segbase = 0; 3918 } else { 3919 error = vm_get_seg_desc(vm, cpuid, seg, &desc); 3920 if (error) { 3921 printf("verify_gla: error %d getting segment" 3922 " descriptor %d", error, vie->segment_register); 3923 return (-1); 3924 } 3925 segbase = desc.base; 3926 } 3927 3928 gla2 = segbase + base + vie->scale * idx + vie->displacement; 3929 gla2 &= size2mask[vie->addrsize]; 3930 if (gla != gla2) { 3931 printf("verify_gla mismatch: segbase(0x%0lx)" 3932 "base(0x%0lx), scale(%d), index(0x%0lx), " 3933 "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", 3934 segbase, base, vie->scale, idx, vie->displacement, 3935 gla, gla2); 3936 return (-1); 3937 } 3938 3939 return (0); 3940 } 3941 3942 int 3943 vie_decode_instruction(struct vie *vie, struct vm *vm, int cpuid, int cs_d) 3944 { 3945 enum vm_cpu_mode cpu_mode; 3946 3947 if ((vie->status & VIES_INST_FETCH) == 0) { 3948 return (EINVAL); 3949 } 3950 3951 cpu_mode = vie->paging.cpu_mode; 3952 3953 if (decode_prefixes(vie, cpu_mode, cs_d)) 3954 return (-1); 3955 3956 if (decode_opcode(vie)) 3957 return (-1); 3958 3959 if (decode_modrm(vie, cpu_mode)) 3960 return (-1); 3961 3962 if (decode_sib(vie)) 3963 return (-1); 3964 3965 if (decode_displacement(vie)) 3966 return (-1); 3967 3968 if (decode_immediate(vie)) 3969 return (-1); 3970 3971 if (decode_moffset(vie)) 3972 return (-1); 3973 3974 vie->status |= VIES_INST_DECODE; 3975 3976 return (0); 3977 } 3978