1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2012 Sandvine, Inc. 5 * Copyright (c) 2012 NetApp, Inc. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 /* 30 * This file and its contents are supplied under the terms of the 31 * Common Development and Distribution License ("CDDL"), version 1.0. 32 * You may only use this file in accordance with the terms of version 33 * 1.0 of the CDDL. 34 * 35 * A full copy of the text of the CDDL should have accompanied this 36 * source. A copy of the CDDL is also available via the Internet at 37 * http://www.illumos.org/license/CDDL. 38 * 39 * Copyright 2015 Pluribus Networks Inc. 40 * Copyright 2018 Joyent, Inc. 41 * Copyright 2021 Oxide Computer Company 42 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association. 43 */ 44 45 #include <sys/cdefs.h> 46 47 #include <sys/param.h> 48 #include <sys/pcpu.h> 49 #include <sys/systm.h> 50 #include <sys/proc.h> 51 52 #include <machine/vmparam.h> 53 #include <machine/vmm.h> 54 #include <sys/vmm_kernel.h> 55 #include <sys/vmm_vm.h> 56 57 #include <sys/vmm_instruction_emul.h> 58 #include <x86/psl.h> 59 #include <x86/specialreg.h> 60 61 #include "vmm_ioport.h" 62 63 enum vie_status { 64 VIES_INIT = (1U << 0), 65 VIES_MMIO = (1U << 1), 66 VIES_INOUT = (1U << 2), 67 VIES_OTHER = (1U << 3), 68 VIES_INST_FETCH = (1U << 4), 69 VIES_INST_DECODE = (1U << 5), 70 VIES_PENDING_MMIO = (1U << 6), 71 VIES_PENDING_INOUT = (1U << 7), 72 VIES_REPEAT = (1U << 8), 73 VIES_USER_FALLBACK = (1U << 9), 74 VIES_COMPLETE = (1U << 10), 75 }; 76 77 /* State of request to perform emulated access (inout or MMIO) */ 78 enum vie_req { 79 VR_NONE, 80 VR_PENDING, 81 VR_DONE, 82 }; 83 84 struct vie_mmio { 85 uint64_t data; 86 uint64_t gpa; 87 uint8_t bytes; 88 enum vie_req state; 89 }; 90 91 struct vie_op { 92 uint8_t op_byte; /* actual opcode byte */ 93 uint8_t op_type; /* type of operation (e.g. 
MOV) */ 94 uint16_t op_flags; 95 }; 96 97 #define VIE_INST_SIZE 15 98 struct vie { 99 uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ 100 uint8_t num_valid; /* size of the instruction */ 101 uint8_t num_processed; 102 103 uint8_t addrsize:4, opsize:4; /* address and operand sizes */ 104 uint8_t rex_w:1, /* REX prefix */ 105 rex_r:1, 106 rex_x:1, 107 rex_b:1, 108 rex_present:1, 109 repz_present:1, /* REP/REPE/REPZ prefix */ 110 repnz_present:1, /* REPNE/REPNZ prefix */ 111 opsize_override:1, /* Operand size override */ 112 addrsize_override:1, /* Address size override */ 113 segment_override:1; /* Segment override */ 114 115 uint8_t mod:2, /* ModRM byte */ 116 reg:4, 117 rm:4; 118 119 uint8_t ss:2, /* SIB byte */ 120 vex_present:1, /* VEX prefixed */ 121 vex_l:1, /* L bit */ 122 index:4, /* SIB byte */ 123 base:4; /* SIB byte */ 124 125 uint8_t disp_bytes; 126 uint8_t imm_bytes; 127 128 uint8_t scale; 129 130 uint8_t vex_reg:4, /* vvvv: first source reg specifier */ 131 vex_pp:2, /* pp */ 132 _sparebits:2; 133 134 uint8_t _sparebytes[2]; 135 136 int base_register; /* VM_REG_GUEST_xyz */ 137 int index_register; /* VM_REG_GUEST_xyz */ 138 int segment_register; /* VM_REG_GUEST_xyz */ 139 140 int64_t displacement; /* optional addr displacement */ 141 int64_t immediate; /* optional immediate operand */ 142 143 struct vie_op op; /* opcode description */ 144 145 enum vie_status status; 146 147 struct vm_guest_paging paging; /* guest paging state */ 148 149 uint64_t mmio_gpa; /* faulting GPA */ 150 struct vie_mmio mmio_req_read; 151 struct vie_mmio mmio_req_write; 152 153 struct vm_inout inout; /* active in/out op */ 154 enum vie_req inout_req_state; 155 uint32_t inout_req_val; /* value from userspace */ 156 }; 157 158 159 /* struct vie_op.op_type */ 160 enum { 161 VIE_OP_TYPE_NONE = 0, 162 VIE_OP_TYPE_MOV, 163 VIE_OP_TYPE_MOVSX, 164 VIE_OP_TYPE_MOVZX, 165 VIE_OP_TYPE_MOV_CR, 166 VIE_OP_TYPE_AND, 167 VIE_OP_TYPE_OR, 168 VIE_OP_TYPE_SUB, 169 VIE_OP_TYPE_TWO_BYTE, 170 VIE_OP_TYPE_PUSH, 171 VIE_OP_TYPE_CMP, 172 VIE_OP_TYPE_POP, 173 VIE_OP_TYPE_MOVS, 174 VIE_OP_TYPE_GROUP1, 175 VIE_OP_TYPE_STOS, 176 VIE_OP_TYPE_BITTEST, 177 VIE_OP_TYPE_TWOB_GRP15, 178 VIE_OP_TYPE_ADD, 179 VIE_OP_TYPE_TEST, 180 VIE_OP_TYPE_BEXTR, 181 VIE_OP_TYPE_CLTS, 182 VIE_OP_TYPE_MUL, 183 VIE_OP_TYPE_LAST 184 }; 185 186 /* struct vie_op.op_flags */ 187 #define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */ 188 #define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ 189 #define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */ 190 #define VIE_OP_F_NO_MODRM (1 << 3) 191 #define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) 192 #define VIE_OP_F_REG_REG (1 << 5) /* special-case for mov-cr */ 193 194 static const struct vie_op three_byte_opcodes_0f38[256] = { 195 [0xF7] = { 196 .op_byte = 0xF7, 197 .op_type = VIE_OP_TYPE_BEXTR, 198 }, 199 }; 200 201 static const struct vie_op two_byte_opcodes[256] = { 202 [0x06] = { 203 .op_byte = 0x06, 204 .op_type = VIE_OP_TYPE_CLTS, 205 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 206 }, 207 [0x20] = { 208 .op_byte = 0x20, 209 .op_type = VIE_OP_TYPE_MOV_CR, 210 .op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION 211 }, 212 [0x22] = { 213 .op_byte = 0x22, 214 .op_type = VIE_OP_TYPE_MOV_CR, 215 .op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION 216 }, 217 [0xAE] = { 218 .op_byte = 0xAE, 219 .op_type = VIE_OP_TYPE_TWOB_GRP15, 220 }, 221 [0xAF] = { 222 .op_byte = 0xAF, 223 .op_type = VIE_OP_TYPE_MUL, 224 }, 225 [0xB6] = { 226 .op_byte = 0xB6, 227 
.op_type = VIE_OP_TYPE_MOVZX, 228 }, 229 [0xB7] = { 230 .op_byte = 0xB7, 231 .op_type = VIE_OP_TYPE_MOVZX, 232 }, 233 [0xBA] = { 234 .op_byte = 0xBA, 235 .op_type = VIE_OP_TYPE_BITTEST, 236 .op_flags = VIE_OP_F_IMM8, 237 }, 238 [0xBE] = { 239 .op_byte = 0xBE, 240 .op_type = VIE_OP_TYPE_MOVSX, 241 }, 242 }; 243 244 static const struct vie_op one_byte_opcodes[256] = { 245 [0x03] = { 246 .op_byte = 0x03, 247 .op_type = VIE_OP_TYPE_ADD, 248 }, 249 [0x0F] = { 250 .op_byte = 0x0F, 251 .op_type = VIE_OP_TYPE_TWO_BYTE 252 }, 253 [0x0B] = { 254 .op_byte = 0x0B, 255 .op_type = VIE_OP_TYPE_OR, 256 }, 257 [0x2B] = { 258 .op_byte = 0x2B, 259 .op_type = VIE_OP_TYPE_SUB, 260 }, 261 [0x39] = { 262 .op_byte = 0x39, 263 .op_type = VIE_OP_TYPE_CMP, 264 }, 265 [0x3B] = { 266 .op_byte = 0x3B, 267 .op_type = VIE_OP_TYPE_CMP, 268 }, 269 [0x88] = { 270 .op_byte = 0x88, 271 .op_type = VIE_OP_TYPE_MOV, 272 }, 273 [0x89] = { 274 .op_byte = 0x89, 275 .op_type = VIE_OP_TYPE_MOV, 276 }, 277 [0x8A] = { 278 .op_byte = 0x8A, 279 .op_type = VIE_OP_TYPE_MOV, 280 }, 281 [0x8B] = { 282 .op_byte = 0x8B, 283 .op_type = VIE_OP_TYPE_MOV, 284 }, 285 [0xA1] = { 286 .op_byte = 0xA1, 287 .op_type = VIE_OP_TYPE_MOV, 288 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, 289 }, 290 [0xA3] = { 291 .op_byte = 0xA3, 292 .op_type = VIE_OP_TYPE_MOV, 293 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, 294 }, 295 [0xA4] = { 296 .op_byte = 0xA4, 297 .op_type = VIE_OP_TYPE_MOVS, 298 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 299 }, 300 [0xA5] = { 301 .op_byte = 0xA5, 302 .op_type = VIE_OP_TYPE_MOVS, 303 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 304 }, 305 [0xAA] = { 306 .op_byte = 0xAA, 307 .op_type = VIE_OP_TYPE_STOS, 308 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 309 }, 310 [0xAB] = { 311 .op_byte = 0xAB, 312 .op_type = VIE_OP_TYPE_STOS, 313 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 314 }, 315 [0xC6] = { 316 /* XXX Group 11 extended opcode - not just MOV */ 317 .op_byte = 0xC6, 318 .op_type = VIE_OP_TYPE_MOV, 319 .op_flags = VIE_OP_F_IMM8, 320 }, 321 [0xC7] = { 322 .op_byte = 0xC7, 323 .op_type = VIE_OP_TYPE_MOV, 324 .op_flags = VIE_OP_F_IMM, 325 }, 326 [0x23] = { 327 .op_byte = 0x23, 328 .op_type = VIE_OP_TYPE_AND, 329 }, 330 [0x80] = { 331 /* Group 1 extended opcode */ 332 .op_byte = 0x80, 333 .op_type = VIE_OP_TYPE_GROUP1, 334 .op_flags = VIE_OP_F_IMM8, 335 }, 336 [0x81] = { 337 /* Group 1 extended opcode */ 338 .op_byte = 0x81, 339 .op_type = VIE_OP_TYPE_GROUP1, 340 .op_flags = VIE_OP_F_IMM, 341 }, 342 [0x83] = { 343 /* Group 1 extended opcode */ 344 .op_byte = 0x83, 345 .op_type = VIE_OP_TYPE_GROUP1, 346 .op_flags = VIE_OP_F_IMM8, 347 }, 348 [0x8F] = { 349 /* XXX Group 1A extended opcode - not just POP */ 350 .op_byte = 0x8F, 351 .op_type = VIE_OP_TYPE_POP, 352 }, 353 [0xF6] = { 354 /* XXX Group 3 extended opcode - not just TEST */ 355 .op_byte = 0xF6, 356 .op_type = VIE_OP_TYPE_TEST, 357 .op_flags = VIE_OP_F_IMM8, 358 }, 359 [0xF7] = { 360 /* XXX Group 3 extended opcode - not just TEST */ 361 .op_byte = 0xF7, 362 .op_type = VIE_OP_TYPE_TEST, 363 .op_flags = VIE_OP_F_IMM, 364 }, 365 [0xFF] = { 366 /* XXX Group 5 extended opcode - not just PUSH */ 367 .op_byte = 0xFF, 368 .op_type = VIE_OP_TYPE_PUSH, 369 } 370 }; 371 372 /* struct vie.mod */ 373 #define VIE_MOD_INDIRECT 0 374 #define VIE_MOD_INDIRECT_DISP8 1 375 #define VIE_MOD_INDIRECT_DISP32 2 376 #define VIE_MOD_DIRECT 3 377 378 /* struct vie.rm */ 379 #define VIE_RM_SIB 4 380 #define VIE_RM_DISP32 5 381 382 
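/*
 * A worked example of how these fields are consumed (illustrative only):
 * the instruction bytes 8B 51 08, fetched in 64-bit mode with no prefixes,
 * carry a ModRM byte of 0x51, i.e. mod = 1 (VIE_MOD_INDIRECT_DISP8),
 * reg = 2 (%edx via gpr_map below) and rm = 1 (%rcx as the base register),
 * followed by an 8-bit displacement of 0x08, i.e. "mov 0x8(%rcx),%edx".
 * An rm value of VIE_RM_SIB introduces a SIB byte, while VIE_RM_DISP32
 * with mod = 0 selects disp32/RIP-relative addressing instead of a base
 * register.
 */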
#define GB (1024 * 1024 * 1024) 383 384 385 /* 386 * Paging defines, previously pulled in from machine/pmap.h 387 */ 388 #define PG_V (1 << 0) /* Present */ 389 #define PG_RW (1 << 1) /* Read/Write */ 390 #define PG_U (1 << 2) /* User/Supervisor */ 391 #define PG_A (1 << 5) /* Accessed */ 392 #define PG_M (1 << 6) /* Dirty */ 393 #define PG_PS (1 << 7) /* Largepage */ 394 395 /* 396 * Paging except defines, previously pulled in from machine/pmap.h 397 */ 398 #define PGEX_P (1 << 0) /* Non-present/Protection */ 399 #define PGEX_W (1 << 1) /* Read/Write */ 400 #define PGEX_U (1 << 2) /* User/Supervisor */ 401 #define PGEX_RSV (1 << 3) /* (Non-)Reserved */ 402 #define PGEX_I (1 << 4) /* Instruction */ 403 404 405 static enum vm_reg_name gpr_map[16] = { 406 VM_REG_GUEST_RAX, 407 VM_REG_GUEST_RCX, 408 VM_REG_GUEST_RDX, 409 VM_REG_GUEST_RBX, 410 VM_REG_GUEST_RSP, 411 VM_REG_GUEST_RBP, 412 VM_REG_GUEST_RSI, 413 VM_REG_GUEST_RDI, 414 VM_REG_GUEST_R8, 415 VM_REG_GUEST_R9, 416 VM_REG_GUEST_R10, 417 VM_REG_GUEST_R11, 418 VM_REG_GUEST_R12, 419 VM_REG_GUEST_R13, 420 VM_REG_GUEST_R14, 421 VM_REG_GUEST_R15 422 }; 423 424 static const char *gpr_name_map[][16] = { 425 [1] = { 426 "a[hl]", "c[hl]", "d[hl]", "b[hl]", "spl", "bpl", "sil", "dil", 427 "r8b", "r9b", "r10b", "r11b", "r12b", "r13b", "r14b", "r15b", 428 }, 429 [2] = { 430 "ax", "cx", "dx", "bx", "sp", "bp", "si", "di", 431 "r8w", "r9w", "r10w", "r11w", "r12w", "r13w", "r14w", "r15w", 432 }, 433 [4] = { 434 "eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi", 435 "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d", 436 }, 437 [8] = { 438 "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", 439 "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", 440 }, 441 }; 442 443 static enum vm_reg_name cr_map[16] = { 444 VM_REG_GUEST_CR0, 445 VM_REG_LAST, 446 VM_REG_GUEST_CR2, 447 VM_REG_GUEST_CR3, 448 VM_REG_GUEST_CR4, 449 VM_REG_LAST, 450 VM_REG_LAST, 451 VM_REG_LAST, 452 VM_REG_LAST, 453 VM_REG_LAST, 454 VM_REG_LAST, 455 VM_REG_LAST, 456 VM_REG_LAST, 457 VM_REG_LAST, 458 VM_REG_LAST, 459 VM_REG_LAST 460 }; 461 462 static uint64_t size2mask[] = { 463 [1] = 0xff, 464 [2] = 0xffff, 465 [4] = 0xffffffff, 466 [8] = 0xffffffffffffffff, 467 }; 468 469 470 static int vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid, 471 uint64_t gpa, uint64_t *rval, int bytes); 472 static int vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid, 473 uint64_t gpa, uint64_t wval, int bytes); 474 static int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, 475 struct seg_desc *desc, uint64_t offset, int length, int addrsize, 476 int prot, uint64_t *gla); 477 static int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla); 478 static int vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, 479 uint64_t gla); 480 static uint64_t vie_size2mask(int size); 481 482 struct vie * 483 vie_alloc() 484 { 485 return (kmem_zalloc(sizeof (struct vie), KM_SLEEP)); 486 } 487 488 void 489 vie_free(struct vie *vie) 490 { 491 kmem_free(vie, sizeof (struct vie)); 492 } 493 494 enum vm_reg_name 495 vie_regnum_map(uint8_t regnum) 496 { 497 VERIFY3U(regnum, <, 16); 498 return (gpr_map[regnum]); 499 } 500 501 const char * 502 vie_regnum_name(uint8_t regnum, uint8_t size) 503 { 504 VERIFY3U(regnum, <, 16); 505 VERIFY(size == 1 || size == 2 || size == 4 || size == 8); 506 return (gpr_name_map[size][regnum]); 507 } 508 509 static void 510 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) 511 { 512 *lhbr = 0; 513 *reg = 
gpr_map[vie->reg];

	/*
	 * 64-bit mode imposes limitations on accessing legacy high byte
	 * registers (lhbr).
	 *
	 * The legacy high-byte registers cannot be addressed if the REX
	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
	 *
	 * If the REX prefix is not present then the values 4, 5, 6 and 7
	 * of the 'ModRM:reg' field address the legacy high-byte registers,
	 * %ah, %ch, %dh and %bh respectively.
	 */
	if (!vie->rex_present) {
		if (vie->reg & 0x4) {
			*lhbr = 1;
			*reg = gpr_map[vie->reg & 0x3];
		}
	}
}

static int
vie_read_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t *rval)
{
	uint64_t val;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &val);

	/*
	 * To obtain the value of a legacy high byte register shift the
	 * base register right by 8 bits (%ah = %rax >> 8).
	 */
	if (lhbr)
		*rval = val >> 8;
	else
		*rval = val;
	return (error);
}

static int
vie_write_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t byte)
{
	uint64_t origval, val, mask;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &origval);
	if (error == 0) {
		val = byte;
		mask = 0xff;
		if (lhbr) {
			/*
			 * Shift left by 8 to store 'byte' in a legacy high
			 * byte register.
			 */
			val <<= 8;
			mask <<= 8;
		}
		val |= origval & ~mask;
		error = vm_set_register(vm, vcpuid, reg, val);
	}
	return (error);
}

static int
vie_update_register(struct vm *vm, int vcpuid, enum vm_reg_name reg,
    uint64_t val, int size)
{
	int error;
	uint64_t origval;

	switch (size) {
	case 1:
	case 2:
		error = vm_get_register(vm, vcpuid, reg, &origval);
		if (error)
			return (error);
		val &= size2mask[size];
		val |= origval & ~size2mask[size];
		break;
	case 4:
		val &= 0xffffffffUL;
		break;
	case 8:
		break;
	default:
		return (EINVAL);
	}

	error = vm_set_register(vm, vcpuid, reg, val);
	return (error);
}

static int
vie_repeat(struct vie *vie)
{
	vie->status |= VIES_REPEAT;

	/*
	 * Clear out any cached operation values so the repeated instruction
	 * can begin without using that stale state. Other state, such as the
	 * decoding results, is kept around since it will not vary between
	 * iterations of a rep-prefixed instruction.
	 */
	if ((vie->status & VIES_MMIO) != 0) {
		vie->mmio_req_read.state = VR_NONE;
		vie->mmio_req_write.state = VR_NONE;
	} else if ((vie->status & VIES_INOUT) != 0) {
		vie->inout_req_state = VR_NONE;
	} else {
		panic("unexpected emulation state");
	}

	return (EAGAIN);
}

#define	RFLAGS_STATUS_BITS	(PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)

/*
 * Return the status flags that would result from doing (x - y).
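 *
 * For example, getcc8(0x10, 0x20) performs the 8-bit subtraction in the
 * inline assembly below and returns the resulting %rflags, in which PSL_C
 * (borrow) and PSL_N (sign) are set while PSL_Z is clear; getcc() simply
 * dispatches to the width-specific variant.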
638 */ 639 /* BEGIN CSTYLED */ 640 #define GETCC(sz) \ 641 static ulong_t \ 642 getcc##sz(uint##sz##_t x, uint##sz##_t y) \ 643 { \ 644 ulong_t rflags; \ 645 \ 646 __asm __volatile("sub %2,%1; pushfq; popq %0" : \ 647 "=r" (rflags), "+r" (x) : "m" (y)); \ 648 return (rflags); \ 649 } struct __hack 650 /* END CSTYLED */ 651 652 GETCC(8); 653 GETCC(16); 654 GETCC(32); 655 GETCC(64); 656 657 static ulong_t 658 getcc(int opsize, uint64_t x, uint64_t y) 659 { 660 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, 661 ("getcc: invalid operand size %d", opsize)); 662 663 if (opsize == 1) 664 return (getcc8(x, y)); 665 else if (opsize == 2) 666 return (getcc16(x, y)); 667 else if (opsize == 4) 668 return (getcc32(x, y)); 669 else 670 return (getcc64(x, y)); 671 } 672 673 /* 674 * Macro creation of functions getaddflags{8,16,32,64} 675 */ 676 /* BEGIN CSTYLED */ 677 #define GETADDFLAGS(sz) \ 678 static ulong_t \ 679 getaddflags##sz(uint##sz##_t x, uint##sz##_t y) \ 680 { \ 681 ulong_t rflags; \ 682 \ 683 __asm __volatile("add %2,%1; pushfq; popq %0" : \ 684 "=r" (rflags), "+r" (x) : "m" (y)); \ 685 return (rflags); \ 686 } struct __hack 687 /* END CSTYLED */ 688 689 GETADDFLAGS(8); 690 GETADDFLAGS(16); 691 GETADDFLAGS(32); 692 GETADDFLAGS(64); 693 694 static ulong_t 695 getaddflags(int opsize, uint64_t x, uint64_t y) 696 { 697 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, 698 ("getaddflags: invalid operand size %d", opsize)); 699 700 if (opsize == 1) 701 return (getaddflags8(x, y)); 702 else if (opsize == 2) 703 return (getaddflags16(x, y)); 704 else if (opsize == 4) 705 return (getaddflags32(x, y)); 706 else 707 return (getaddflags64(x, y)); 708 } 709 710 /* 711 * Macro creation of functions getimulflags{16,32,64} 712 */ 713 /* BEGIN CSTYLED */ 714 #define GETIMULFLAGS(sz) \ 715 static ulong_t \ 716 getimulflags##sz(uint##sz##_t x, uint##sz##_t y) \ 717 { \ 718 ulong_t rflags; \ 719 \ 720 __asm __volatile("imul %2,%1; pushfq; popq %0" : \ 721 "=r" (rflags), "+r" (x) : "m" (y)); \ 722 return (rflags); \ 723 } struct __hack 724 /* END CSTYLED */ 725 726 GETIMULFLAGS(16); 727 GETIMULFLAGS(32); 728 GETIMULFLAGS(64); 729 730 static ulong_t 731 getimulflags(int opsize, uint64_t x, uint64_t y) 732 { 733 KASSERT(opsize == 2 || opsize == 4 || opsize == 8, 734 ("getimulflags: invalid operand size %d", opsize)); 735 736 if (opsize == 2) 737 return (getimulflags16(x, y)); 738 else if (opsize == 4) 739 return (getimulflags32(x, y)); 740 else 741 return (getimulflags64(x, y)); 742 } 743 744 /* 745 * Return the status flags that would result from doing (x & y). 
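 *
 * For example, getandflags8(0xf0, 0x0f) performs the AND in the inline
 * assembly below and returns an %rflags value with PSL_Z set, since the
 * result is zero; AND always leaves PSL_C and PSL_V clear.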
746 */ 747 /* BEGIN CSTYLED */ 748 #define GETANDFLAGS(sz) \ 749 static ulong_t \ 750 getandflags##sz(uint##sz##_t x, uint##sz##_t y) \ 751 { \ 752 ulong_t rflags; \ 753 \ 754 __asm __volatile("and %2,%1; pushfq; popq %0" : \ 755 "=r" (rflags), "+r" (x) : "m" (y)); \ 756 return (rflags); \ 757 } struct __hack 758 /* END CSTYLED */ 759 760 GETANDFLAGS(8); 761 GETANDFLAGS(16); 762 GETANDFLAGS(32); 763 GETANDFLAGS(64); 764 765 static ulong_t 766 getandflags(int opsize, uint64_t x, uint64_t y) 767 { 768 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, 769 ("getandflags: invalid operand size %d", opsize)); 770 771 if (opsize == 1) 772 return (getandflags8(x, y)); 773 else if (opsize == 2) 774 return (getandflags16(x, y)); 775 else if (opsize == 4) 776 return (getandflags32(x, y)); 777 else 778 return (getandflags64(x, y)); 779 } 780 781 static int 782 vie_emulate_mov_cr(struct vie *vie, struct vm *vm, int vcpuid) 783 { 784 uint64_t val; 785 int err; 786 enum vm_reg_name gpr = gpr_map[vie->rm]; 787 enum vm_reg_name cr = cr_map[vie->reg]; 788 789 uint_t size = 4; 790 if (vie->paging.cpu_mode == CPU_MODE_64BIT) { 791 size = 8; 792 } 793 794 switch (vie->op.op_byte) { 795 case 0x20: 796 /* 797 * MOV control register (ModRM:reg) to reg (ModRM:r/m) 798 * 20/r: mov r32, CR0-CR7 799 * 20/r: mov r64, CR0-CR7 800 * REX.R + 20/0: mov r64, CR8 801 */ 802 if (vie->paging.cpl != 0) { 803 vm_inject_gp(vm, vcpuid); 804 vie->num_processed = 0; 805 return (0); 806 } 807 err = vm_get_register(vm, vcpuid, cr, &val); 808 if (err != 0) { 809 /* #UD for access to non-existent CRs */ 810 vm_inject_ud(vm, vcpuid); 811 vie->num_processed = 0; 812 return (0); 813 } 814 err = vie_update_register(vm, vcpuid, gpr, val, size); 815 break; 816 case 0x22: { 817 /* 818 * MOV reg (ModRM:r/m) to control register (ModRM:reg) 819 * 22/r: mov CR0-CR7, r32 820 * 22/r: mov CR0-CR7, r64 821 * REX.R + 22/0: mov CR8, r64 822 */ 823 uint64_t old, diff; 824 825 if (vie->paging.cpl != 0) { 826 vm_inject_gp(vm, vcpuid); 827 vie->num_processed = 0; 828 return (0); 829 } 830 err = vm_get_register(vm, vcpuid, cr, &old); 831 if (err != 0) { 832 /* #UD for access to non-existent CRs */ 833 vm_inject_ud(vm, vcpuid); 834 vie->num_processed = 0; 835 return (0); 836 } 837 err = vm_get_register(vm, vcpuid, gpr, &val); 838 VERIFY0(err); 839 val &= size2mask[size]; 840 diff = old ^ val; 841 842 switch (cr) { 843 case VM_REG_GUEST_CR0: 844 if ((diff & CR0_PG) != 0) { 845 uint64_t efer; 846 847 err = vm_get_register(vm, vcpuid, 848 VM_REG_GUEST_EFER, &efer); 849 VERIFY0(err); 850 851 /* Keep the long-mode state in EFER in sync */ 852 if ((val & CR0_PG) != 0 && 853 (efer & EFER_LME) != 0) { 854 efer |= EFER_LMA; 855 } 856 if ((val & CR0_PG) == 0 && 857 (efer & EFER_LME) != 0) { 858 efer &= ~EFER_LMA; 859 } 860 861 err = vm_set_register(vm, vcpuid, 862 VM_REG_GUEST_EFER, efer); 863 VERIFY0(err); 864 } 865 /* TODO: enforce more of the #GP checks */ 866 err = vm_set_register(vm, vcpuid, cr, val); 867 VERIFY0(err); 868 break; 869 case VM_REG_GUEST_CR2: 870 case VM_REG_GUEST_CR3: 871 case VM_REG_GUEST_CR4: 872 /* TODO: enforce more of the #GP checks */ 873 err = vm_set_register(vm, vcpuid, cr, val); 874 break; 875 default: 876 /* The cr_map mapping should prevent this */ 877 panic("invalid cr %d", cr); 878 } 879 break; 880 } 881 default: 882 return (EINVAL); 883 } 884 return (err); 885 } 886 887 static int 888 vie_emulate_mov(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 889 { 890 int error, size; 891 enum vm_reg_name reg; 892 
uint8_t byte; 893 uint64_t val; 894 895 size = vie->opsize; 896 error = EINVAL; 897 898 switch (vie->op.op_byte) { 899 case 0x88: 900 /* 901 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) 902 * 88/r: mov r/m8, r8 903 * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) 904 */ 905 size = 1; /* override for byte operation */ 906 error = vie_read_bytereg(vie, vm, vcpuid, &byte); 907 if (error == 0) { 908 error = vie_mmio_write(vie, vm, vcpuid, gpa, byte, 909 size); 910 } 911 break; 912 case 0x89: 913 /* 914 * MOV from reg (ModRM:reg) to mem (ModRM:r/m) 915 * 89/r: mov r/m16, r16 916 * 89/r: mov r/m32, r32 917 * REX.W + 89/r mov r/m64, r64 918 */ 919 reg = gpr_map[vie->reg]; 920 error = vm_get_register(vm, vcpuid, reg, &val); 921 if (error == 0) { 922 val &= size2mask[size]; 923 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); 924 } 925 break; 926 case 0x8A: 927 /* 928 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) 929 * 8A/r: mov r8, r/m8 930 * REX + 8A/r: mov r8, r/m8 931 */ 932 size = 1; /* override for byte operation */ 933 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size); 934 if (error == 0) 935 error = vie_write_bytereg(vie, vm, vcpuid, val); 936 break; 937 case 0x8B: 938 /* 939 * MOV from mem (ModRM:r/m) to reg (ModRM:reg) 940 * 8B/r: mov r16, r/m16 941 * 8B/r: mov r32, r/m32 942 * REX.W 8B/r: mov r64, r/m64 943 */ 944 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size); 945 if (error == 0) { 946 reg = gpr_map[vie->reg]; 947 error = vie_update_register(vm, vcpuid, reg, val, size); 948 } 949 break; 950 case 0xA1: 951 /* 952 * MOV from seg:moffset to AX/EAX/RAX 953 * A1: mov AX, moffs16 954 * A1: mov EAX, moffs32 955 * REX.W + A1: mov RAX, moffs64 956 */ 957 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size); 958 if (error == 0) { 959 reg = VM_REG_GUEST_RAX; 960 error = vie_update_register(vm, vcpuid, reg, val, size); 961 } 962 break; 963 case 0xA3: 964 /* 965 * MOV from AX/EAX/RAX to seg:moffset 966 * A3: mov moffs16, AX 967 * A3: mov moffs32, EAX 968 * REX.W + A3: mov moffs64, RAX 969 */ 970 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); 971 if (error == 0) { 972 val &= size2mask[size]; 973 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); 974 } 975 break; 976 case 0xC6: 977 /* 978 * MOV from imm8 to mem (ModRM:r/m) 979 * C6/0 mov r/m8, imm8 980 * REX + C6/0 mov r/m8, imm8 981 */ 982 size = 1; /* override for byte operation */ 983 val = vie->immediate; 984 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); 985 break; 986 case 0xC7: 987 /* 988 * MOV from imm16/imm32 to mem (ModRM:r/m) 989 * C7/0 mov r/m16, imm16 990 * C7/0 mov r/m32, imm32 991 * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) 992 */ 993 val = vie->immediate & size2mask[size]; 994 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); 995 break; 996 default: 997 break; 998 } 999 1000 return (error); 1001 } 1002 1003 static int 1004 vie_emulate_movx(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1005 { 1006 int error, size; 1007 enum vm_reg_name reg; 1008 uint64_t val; 1009 1010 size = vie->opsize; 1011 error = EINVAL; 1012 1013 switch (vie->op.op_byte) { 1014 case 0xB6: 1015 /* 1016 * MOV and zero extend byte from mem (ModRM:r/m) to 1017 * reg (ModRM:reg). 
1018 * 1019 * 0F B6/r movzx r16, r/m8 1020 * 0F B6/r movzx r32, r/m8 1021 * REX.W + 0F B6/r movzx r64, r/m8 1022 */ 1023 1024 /* get the first operand */ 1025 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1); 1026 if (error) 1027 break; 1028 1029 /* get the second operand */ 1030 reg = gpr_map[vie->reg]; 1031 1032 /* zero-extend byte */ 1033 val = (uint8_t)val; 1034 1035 /* write the result */ 1036 error = vie_update_register(vm, vcpuid, reg, val, size); 1037 break; 1038 case 0xB7: 1039 /* 1040 * MOV and zero extend word from mem (ModRM:r/m) to 1041 * reg (ModRM:reg). 1042 * 1043 * 0F B7/r movzx r32, r/m16 1044 * REX.W + 0F B7/r movzx r64, r/m16 1045 */ 1046 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 2); 1047 if (error) 1048 return (error); 1049 1050 reg = gpr_map[vie->reg]; 1051 1052 /* zero-extend word */ 1053 val = (uint16_t)val; 1054 1055 error = vie_update_register(vm, vcpuid, reg, val, size); 1056 break; 1057 case 0xBE: 1058 /* 1059 * MOV and sign extend byte from mem (ModRM:r/m) to 1060 * reg (ModRM:reg). 1061 * 1062 * 0F BE/r movsx r16, r/m8 1063 * 0F BE/r movsx r32, r/m8 1064 * REX.W + 0F BE/r movsx r64, r/m8 1065 */ 1066 1067 /* get the first operand */ 1068 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1); 1069 if (error) 1070 break; 1071 1072 /* get the second operand */ 1073 reg = gpr_map[vie->reg]; 1074 1075 /* sign extend byte */ 1076 val = (int8_t)val; 1077 1078 /* write the result */ 1079 error = vie_update_register(vm, vcpuid, reg, val, size); 1080 break; 1081 default: 1082 break; 1083 } 1084 return (error); 1085 } 1086 1087 /* 1088 * Helper function to calculate and validate a linear address. 1089 */ 1090 static int 1091 vie_get_gla(struct vie *vie, struct vm *vm, int vcpuid, int opsize, 1092 int addrsize, int prot, enum vm_reg_name seg, enum vm_reg_name gpr, 1093 uint64_t *gla) 1094 { 1095 struct seg_desc desc; 1096 uint64_t cr0, val, rflags; 1097 int error; 1098 struct vm_guest_paging *paging; 1099 1100 paging = &vie->paging; 1101 1102 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); 1103 KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); 1104 1105 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1106 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 1107 1108 error = vm_get_seg_desc(vm, vcpuid, seg, &desc); 1109 KASSERT(error == 0, ("%s: error %d getting segment descriptor %d", 1110 __func__, error, seg)); 1111 1112 error = vm_get_register(vm, vcpuid, gpr, &val); 1113 KASSERT(error == 0, ("%s: error %d getting register %d", __func__, 1114 error, gpr)); 1115 1116 if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize, 1117 addrsize, prot, gla)) { 1118 if (seg == VM_REG_GUEST_SS) 1119 vm_inject_ss(vm, vcpuid, 0); 1120 else 1121 vm_inject_gp(vm, vcpuid); 1122 return (-1); 1123 } 1124 1125 if (vie_canonical_check(paging->cpu_mode, *gla)) { 1126 if (seg == VM_REG_GUEST_SS) 1127 vm_inject_ss(vm, vcpuid, 0); 1128 else 1129 vm_inject_gp(vm, vcpuid); 1130 return (-1); 1131 } 1132 1133 if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { 1134 vm_inject_ac(vm, vcpuid, 0); 1135 return (-1); 1136 } 1137 1138 return (0); 1139 } 1140 1141 static int 1142 vie_emulate_movs(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1143 { 1144 struct vm_copyinfo copyinfo[2]; 1145 uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; 1146 uint64_t rcx, rdi, rsi, rflags; 1147 int error, fault, opsize, seg, repeat; 1148 struct vm_guest_paging *paging; 1149 1150 opsize = (vie->op.op_byte == 
0xA4) ? 1 : vie->opsize; 1151 val = 0; 1152 error = 0; 1153 paging = &vie->paging; 1154 1155 /* 1156 * XXX although the MOVS instruction is only supposed to be used with 1157 * the "rep" prefix some guests like FreeBSD will use "repnz" instead. 1158 * 1159 * Empirically the "repnz" prefix has identical behavior to "rep" 1160 * and the zero flag does not make a difference. 1161 */ 1162 repeat = vie->repz_present | vie->repnz_present; 1163 1164 if (repeat) { 1165 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); 1166 KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); 1167 1168 /* 1169 * The count register is %rcx, %ecx or %cx depending on the 1170 * address size of the instruction. 1171 */ 1172 if ((rcx & vie_size2mask(vie->addrsize)) == 0) { 1173 error = 0; 1174 goto done; 1175 } 1176 } 1177 1178 /* 1179 * Source Destination Comments 1180 * -------------------------------------------- 1181 * (1) memory memory n/a 1182 * (2) memory mmio emulated 1183 * (3) mmio memory emulated 1184 * (4) mmio mmio emulated 1185 * 1186 * At this point we don't have sufficient information to distinguish 1187 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this 1188 * out because it will succeed only when operating on regular memory. 1189 * 1190 * XXX the emulation doesn't properly handle the case where 'gpa' 1191 * is straddling the boundary between the normal memory and MMIO. 1192 */ 1193 1194 seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS; 1195 if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize, PROT_READ, seg, 1196 VM_REG_GUEST_RSI, &srcaddr) != 0) { 1197 goto done; 1198 } 1199 1200 error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ, 1201 copyinfo, nitems(copyinfo), &fault); 1202 if (error == 0) { 1203 if (fault) 1204 goto done; /* Resume guest to handle fault */ 1205 1206 /* 1207 * case (2): read from system memory and write to mmio. 1208 */ 1209 vm_copyin(vm, vcpuid, copyinfo, &val, opsize); 1210 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 1211 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize); 1212 if (error) 1213 goto done; 1214 } else { 1215 /* 1216 * 'vm_copy_setup()' is expected to fail for cases (3) and (4) 1217 * if 'srcaddr' is in the mmio space. 1218 */ 1219 1220 if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize, 1221 PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, 1222 &dstaddr) != 0) { 1223 goto done; 1224 } 1225 1226 error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, 1227 PROT_WRITE, copyinfo, nitems(copyinfo), &fault); 1228 if (error == 0) { 1229 if (fault) 1230 goto done; /* Resume guest to handle fault */ 1231 1232 /* 1233 * case (3): read from MMIO and write to system memory. 1234 * 1235 * A MMIO read can have side-effects so we 1236 * commit to it only after vm_copy_setup() is 1237 * successful. If a page-fault needs to be 1238 * injected into the guest then it will happen 1239 * before the MMIO read is attempted. 1240 */ 1241 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1242 opsize); 1243 1244 if (error == 0) { 1245 vm_copyout(vm, vcpuid, &val, copyinfo, opsize); 1246 } 1247 /* 1248 * Regardless of whether the MMIO read was successful or 1249 * not, the copy resources must be cleaned up. 1250 */ 1251 vm_copy_teardown(vm, vcpuid, copyinfo, 1252 nitems(copyinfo)); 1253 if (error != 0) { 1254 goto done; 1255 } 1256 } else { 1257 /* 1258 * Case (4): read from and write to mmio. 
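			 * (For instance, this is the path taken when a guest
			 * copies data between two emulated device MMIO
			 * regions.)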
1259 * 1260 * Commit to the MMIO read/write (with potential 1261 * side-effects) only after we are sure that the 1262 * instruction is not going to be restarted due 1263 * to address translation faults. 1264 */ 1265 error = vm_gla2gpa(vm, vcpuid, paging, srcaddr, 1266 PROT_READ, &srcgpa, &fault); 1267 if (error || fault) 1268 goto done; 1269 1270 error = vm_gla2gpa(vm, vcpuid, paging, dstaddr, 1271 PROT_WRITE, &dstgpa, &fault); 1272 if (error || fault) 1273 goto done; 1274 1275 error = vie_mmio_read(vie, vm, vcpuid, srcgpa, &val, 1276 opsize); 1277 if (error) 1278 goto done; 1279 1280 error = vie_mmio_write(vie, vm, vcpuid, dstgpa, val, 1281 opsize); 1282 if (error) 1283 goto done; 1284 } 1285 } 1286 1287 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi); 1288 KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); 1289 1290 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); 1291 KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); 1292 1293 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1294 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 1295 1296 if (rflags & PSL_D) { 1297 rsi -= opsize; 1298 rdi -= opsize; 1299 } else { 1300 rsi += opsize; 1301 rdi += opsize; 1302 } 1303 1304 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi, 1305 vie->addrsize); 1306 KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error)); 1307 1308 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, 1309 vie->addrsize); 1310 KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); 1311 1312 if (repeat) { 1313 rcx = rcx - 1; 1314 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, 1315 rcx, vie->addrsize); 1316 KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); 1317 1318 /* 1319 * Repeat the instruction if the count register is not zero. 1320 */ 1321 if ((rcx & vie_size2mask(vie->addrsize)) != 0) 1322 return (vie_repeat(vie)); 1323 } 1324 done: 1325 return (error); 1326 } 1327 1328 static int 1329 vie_emulate_stos(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1330 { 1331 int error, opsize, repeat; 1332 uint64_t val; 1333 uint64_t rcx, rdi, rflags; 1334 1335 opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize; 1336 repeat = vie->repz_present | vie->repnz_present; 1337 1338 if (repeat) { 1339 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); 1340 KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); 1341 1342 /* 1343 * The count register is %rcx, %ecx or %cx depending on the 1344 * address size of the instruction. 
1345 */ 1346 if ((rcx & vie_size2mask(vie->addrsize)) == 0) 1347 return (0); 1348 } 1349 1350 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); 1351 KASSERT(!error, ("%s: error %d getting rax", __func__, error)); 1352 1353 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize); 1354 if (error) 1355 return (error); 1356 1357 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); 1358 KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); 1359 1360 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1361 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 1362 1363 if (rflags & PSL_D) 1364 rdi -= opsize; 1365 else 1366 rdi += opsize; 1367 1368 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, 1369 vie->addrsize); 1370 KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); 1371 1372 if (repeat) { 1373 rcx = rcx - 1; 1374 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, 1375 rcx, vie->addrsize); 1376 KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); 1377 1378 /* 1379 * Repeat the instruction if the count register is not zero. 1380 */ 1381 if ((rcx & vie_size2mask(vie->addrsize)) != 0) 1382 return (vie_repeat(vie)); 1383 } 1384 1385 return (0); 1386 } 1387 1388 static int 1389 vie_emulate_and(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1390 { 1391 int error, size; 1392 enum vm_reg_name reg; 1393 uint64_t result, rflags, rflags2, val1, val2; 1394 1395 size = vie->opsize; 1396 error = EINVAL; 1397 1398 switch (vie->op.op_byte) { 1399 case 0x23: 1400 /* 1401 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the 1402 * result in reg. 1403 * 1404 * 23/r and r16, r/m16 1405 * 23/r and r32, r/m32 1406 * REX.W + 23/r and r64, r/m64 1407 */ 1408 1409 /* get the first operand */ 1410 reg = gpr_map[vie->reg]; 1411 error = vm_get_register(vm, vcpuid, reg, &val1); 1412 if (error) 1413 break; 1414 1415 /* get the second operand */ 1416 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); 1417 if (error) 1418 break; 1419 1420 /* perform the operation and write the result */ 1421 result = val1 & val2; 1422 error = vie_update_register(vm, vcpuid, reg, result, size); 1423 break; 1424 case 0x81: 1425 case 0x83: 1426 /* 1427 * AND mem (ModRM:r/m) with immediate and store the 1428 * result in mem. 1429 * 1430 * 81 /4 and r/m16, imm16 1431 * 81 /4 and r/m32, imm32 1432 * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 1433 * 1434 * 83 /4 and r/m16, imm8 sign-extended to 16 1435 * 83 /4 and r/m32, imm8 sign-extended to 32 1436 * REX.W + 83/4 and r/m64, imm8 sign-extended to 64 1437 */ 1438 1439 /* get the first operand */ 1440 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size); 1441 if (error) 1442 break; 1443 1444 /* 1445 * perform the operation with the pre-fetched immediate 1446 * operand and write the result 1447 */ 1448 result = val1 & vie->immediate; 1449 error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size); 1450 break; 1451 default: 1452 break; 1453 } 1454 if (error) 1455 return (error); 1456 1457 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1458 if (error) 1459 return (error); 1460 1461 /* 1462 * OF and CF are cleared; the SF, ZF and PF flags are set according 1463 * to the result; AF is undefined. 1464 * 1465 * The updated status flags are obtained by subtracting 0 from 'result'. 
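	 * Since 'result' - 0 == 'result', getcc() yields the SF, ZF and PF
	 * of the value itself; the masking below then keeps only PSL_PF,
	 * PSL_Z and PSL_N from that computation, leaving OF and CF cleared
	 * as required.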
1466 */ 1467 rflags2 = getcc(size, result, 0); 1468 rflags &= ~RFLAGS_STATUS_BITS; 1469 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); 1470 1471 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); 1472 return (error); 1473 } 1474 1475 static int 1476 vie_emulate_or(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1477 { 1478 int error, size; 1479 enum vm_reg_name reg; 1480 uint64_t result, rflags, rflags2, val1, val2; 1481 1482 size = vie->opsize; 1483 error = EINVAL; 1484 1485 switch (vie->op.op_byte) { 1486 case 0x0B: 1487 /* 1488 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the 1489 * result in reg. 1490 * 1491 * 0b/r or r16, r/m16 1492 * 0b/r or r32, r/m32 1493 * REX.W + 0b/r or r64, r/m64 1494 */ 1495 1496 /* get the first operand */ 1497 reg = gpr_map[vie->reg]; 1498 error = vm_get_register(vm, vcpuid, reg, &val1); 1499 if (error) 1500 break; 1501 1502 /* get the second operand */ 1503 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); 1504 if (error) 1505 break; 1506 1507 /* perform the operation and write the result */ 1508 result = val1 | val2; 1509 error = vie_update_register(vm, vcpuid, reg, result, size); 1510 break; 1511 case 0x81: 1512 case 0x83: 1513 /* 1514 * OR mem (ModRM:r/m) with immediate and store the 1515 * result in mem. 1516 * 1517 * 81 /1 or r/m16, imm16 1518 * 81 /1 or r/m32, imm32 1519 * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64 1520 * 1521 * 83 /1 or r/m16, imm8 sign-extended to 16 1522 * 83 /1 or r/m32, imm8 sign-extended to 32 1523 * REX.W + 83/1 or r/m64, imm8 sign-extended to 64 1524 */ 1525 1526 /* get the first operand */ 1527 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size); 1528 if (error) 1529 break; 1530 1531 /* 1532 * perform the operation with the pre-fetched immediate 1533 * operand and write the result 1534 */ 1535 result = val1 | vie->immediate; 1536 error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size); 1537 break; 1538 default: 1539 break; 1540 } 1541 if (error) 1542 return (error); 1543 1544 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1545 if (error) 1546 return (error); 1547 1548 /* 1549 * OF and CF are cleared; the SF, ZF and PF flags are set according 1550 * to the result; AF is undefined. 1551 * 1552 * The updated status flags are obtained by subtracting 0 from 'result'. 1553 */ 1554 rflags2 = getcc(size, result, 0); 1555 rflags &= ~RFLAGS_STATUS_BITS; 1556 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); 1557 1558 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); 1559 return (error); 1560 } 1561 1562 static int 1563 vie_emulate_cmp(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1564 { 1565 int error, size; 1566 uint64_t regop, memop, op1, op2, rflags, rflags2; 1567 enum vm_reg_name reg; 1568 1569 size = vie->opsize; 1570 switch (vie->op.op_byte) { 1571 case 0x39: 1572 case 0x3B: 1573 /* 1574 * 39/r CMP r/m16, r16 1575 * 39/r CMP r/m32, r32 1576 * REX.W 39/r CMP r/m64, r64 1577 * 1578 * 3B/r CMP r16, r/m16 1579 * 3B/r CMP r32, r/m32 1580 * REX.W + 3B/r CMP r64, r/m64 1581 * 1582 * Compare the first operand with the second operand and 1583 * set status flags in EFLAGS register. The comparison is 1584 * performed by subtracting the second operand from the first 1585 * operand and then setting the status flags. 
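		 * For instance, comparing equal operands sets PSL_Z and
		 * clears PSL_C, which is what a subsequent JE/JZ in the
		 * guest will test.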
		 */

		/* Get the register operand */
		reg = gpr_map[vie->reg];
		error = vm_get_register(vm, vcpuid, reg, &regop);
		if (error)
			return (error);

		/* Get the memory operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &memop, size);
		if (error)
			return (error);

		if (vie->op.op_byte == 0x3B) {
			op1 = regop;
			op2 = memop;
		} else {
			op1 = memop;
			op2 = regop;
		}
		rflags2 = getcc(size, op1, op2);
		break;
	case 0x80:
	case 0x81:
	case 0x83:
		/*
		 * 80 /7		cmp r/m8, imm8
		 * REX + 80 /7		cmp r/m8, imm8
		 *
		 * 81 /7		cmp r/m16, imm16
		 * 81 /7		cmp r/m32, imm32
		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
		 *
		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
		 *
		 * Compare mem (ModRM:r/m) with immediate and set
		 * status flags according to the results. The
		 * comparison is performed by subtracting the
		 * immediate from the first operand and then setting
		 * the status flags.
		 */
		if (vie->op.op_byte == 0x80)
			size = 1;

		/* get the first operand */
		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
		if (error)
			return (error);

		rflags2 = getcc(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & RFLAGS_STATUS_BITS;

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
vie_emulate_test(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
{
	int error, size;
	uint64_t op1, rflags, rflags2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xF6:
		/*
		 * F6 /0		test r/m8, imm8
		 *
		 * Test mem (ModRM:r/m) with immediate and set status
		 * flags according to the results. The comparison is
		 * performed by ANDing the immediate with the first
		 * operand and then setting the status flags.
		 */
		if ((vie->reg & 7) != 0)
			return (EINVAL);

		size = 1;	/* override for byte operation */

		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
		if (error)
			return (error);

		rflags2 = getandflags(size, op1, vie->immediate);
		break;
	case 0xF7:
		/*
		 * F7 /0		test r/m16, imm16
		 * F7 /0		test r/m32, imm32
		 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
		 *
		 * Test mem (ModRM:r/m) with immediate and set status
		 * flags according to the results. The comparison is
		 * performed by ANDing the immediate with the first
		 * operand and then setting the status flags.
		 */
		if ((vie->reg & 7) != 0)
			return (EINVAL);

		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
		if (error)
			return (error);

		rflags2 = getandflags(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
1713 */ 1714 rflags &= ~RFLAGS_STATUS_BITS; 1715 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); 1716 1717 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); 1718 return (error); 1719 } 1720 1721 static int 1722 vie_emulate_bextr(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1723 { 1724 uint64_t src1, src2, dst, rflags; 1725 unsigned start, len, size; 1726 int error; 1727 struct vm_guest_paging *paging; 1728 1729 size = vie->opsize; 1730 error = EINVAL; 1731 paging = &vie->paging; 1732 1733 /* 1734 * VEX.LZ.0F38.W0 F7 /r BEXTR r32a, r/m32, r32b 1735 * VEX.LZ.0F38.W1 F7 /r BEXTR r64a, r/m64, r64b 1736 * 1737 * Destination operand is ModRM:reg. Source operands are ModRM:r/m and 1738 * Vex.vvvv. 1739 * 1740 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored). 1741 */ 1742 if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT) 1743 size = 4; 1744 1745 /* 1746 * Extracts contiguous bits from the first /source/ operand (second 1747 * operand) using an index and length specified in the second /source/ 1748 * operand (third operand). 1749 */ 1750 error = vie_mmio_read(vie, vm, vcpuid, gpa, &src1, size); 1751 if (error) 1752 return (error); 1753 error = vm_get_register(vm, vcpuid, gpr_map[vie->vex_reg], &src2); 1754 if (error) 1755 return (error); 1756 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1757 if (error) 1758 return (error); 1759 1760 start = (src2 & 0xff); 1761 len = (src2 & 0xff00) >> 8; 1762 1763 /* If no bits are extracted, the destination register is cleared. */ 1764 dst = 0; 1765 1766 /* If START exceeds the operand size, no bits are extracted. */ 1767 if (start > size * 8) 1768 goto done; 1769 /* Length is bounded by both the destination size and start offset. */ 1770 if (start + len > size * 8) 1771 len = (size * 8) - start; 1772 if (len == 0) 1773 goto done; 1774 1775 if (start > 0) 1776 src1 = (src1 >> start); 1777 if (len < 64) 1778 src1 = src1 & ((1ull << len) - 1); 1779 dst = src1; 1780 1781 done: 1782 error = vie_update_register(vm, vcpuid, gpr_map[vie->reg], dst, size); 1783 if (error) 1784 return (error); 1785 1786 /* 1787 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result. 1788 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared. 
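	 *
	 * As a worked example of the extraction above: with src1 0x12345678
	 * and src2 0x0804 (start = 4, len = 8), bits 4 through 11 are
	 * extracted and dst becomes 0x67, so PSL_Z remains clear here.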
1789 */ 1790 rflags &= ~RFLAGS_STATUS_BITS; 1791 if (dst == 0) 1792 rflags |= PSL_Z; 1793 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 1794 8); 1795 return (error); 1796 } 1797 1798 static int 1799 vie_emulate_add(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1800 { 1801 int error, size; 1802 uint64_t nval, rflags, rflags2, val1, val2; 1803 enum vm_reg_name reg; 1804 1805 size = vie->opsize; 1806 error = EINVAL; 1807 1808 switch (vie->op.op_byte) { 1809 case 0x03: 1810 /* 1811 * ADD r/m to r and store the result in r 1812 * 1813 * 03/r ADD r16, r/m16 1814 * 03/r ADD r32, r/m32 1815 * REX.W + 03/r ADD r64, r/m64 1816 */ 1817 1818 /* get the first operand */ 1819 reg = gpr_map[vie->reg]; 1820 error = vm_get_register(vm, vcpuid, reg, &val1); 1821 if (error) 1822 break; 1823 1824 /* get the second operand */ 1825 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); 1826 if (error) 1827 break; 1828 1829 /* perform the operation and write the result */ 1830 nval = val1 + val2; 1831 error = vie_update_register(vm, vcpuid, reg, nval, size); 1832 break; 1833 default: 1834 break; 1835 } 1836 1837 if (!error) { 1838 rflags2 = getaddflags(size, val1, val2); 1839 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 1840 &rflags); 1841 if (error) 1842 return (error); 1843 1844 rflags &= ~RFLAGS_STATUS_BITS; 1845 rflags |= rflags2 & RFLAGS_STATUS_BITS; 1846 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 1847 rflags, 8); 1848 } 1849 1850 return (error); 1851 } 1852 1853 static int 1854 vie_emulate_sub(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1855 { 1856 int error, size; 1857 uint64_t nval, rflags, rflags2, val1, val2; 1858 enum vm_reg_name reg; 1859 1860 size = vie->opsize; 1861 error = EINVAL; 1862 1863 switch (vie->op.op_byte) { 1864 case 0x2B: 1865 /* 1866 * SUB r/m from r and store the result in r 1867 * 1868 * 2B/r SUB r16, r/m16 1869 * 2B/r SUB r32, r/m32 1870 * REX.W + 2B/r SUB r64, r/m64 1871 */ 1872 1873 /* get the first operand */ 1874 reg = gpr_map[vie->reg]; 1875 error = vm_get_register(vm, vcpuid, reg, &val1); 1876 if (error) 1877 break; 1878 1879 /* get the second operand */ 1880 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); 1881 if (error) 1882 break; 1883 1884 /* perform the operation and write the result */ 1885 nval = val1 - val2; 1886 error = vie_update_register(vm, vcpuid, reg, nval, size); 1887 break; 1888 default: 1889 break; 1890 } 1891 1892 if (!error) { 1893 rflags2 = getcc(size, val1, val2); 1894 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 1895 &rflags); 1896 if (error) 1897 return (error); 1898 1899 rflags &= ~RFLAGS_STATUS_BITS; 1900 rflags |= rflags2 & RFLAGS_STATUS_BITS; 1901 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 1902 rflags, 8); 1903 } 1904 1905 return (error); 1906 } 1907 1908 static int 1909 vie_emulate_mul(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1910 { 1911 int error, size; 1912 uint64_t rflags, rflags2, val1, val2; 1913 __int128_t nval; 1914 enum vm_reg_name reg; 1915 ulong_t (*getflags)(int, uint64_t, uint64_t) = NULL; 1916 1917 size = vie->opsize; 1918 error = EINVAL; 1919 1920 switch (vie->op.op_byte) { 1921 case 0xAF: 1922 /* 1923 * Multiply the contents of a destination register by 1924 * the contents of a register or memory operand and 1925 * put the signed result in the destination register. 
1926 * 1927 * AF/r IMUL r16, r/m16 1928 * AF/r IMUL r32, r/m32 1929 * REX.W + AF/r IMUL r64, r/m64 1930 */ 1931 1932 getflags = getimulflags; 1933 1934 /* get the first operand */ 1935 reg = gpr_map[vie->reg]; 1936 error = vm_get_register(vm, vcpuid, reg, &val1); 1937 if (error != 0) 1938 break; 1939 1940 /* get the second operand */ 1941 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); 1942 if (error != 0) 1943 break; 1944 1945 /* perform the operation and write the result */ 1946 nval = (int64_t)val1 * (int64_t)val2; 1947 1948 error = vie_update_register(vm, vcpuid, reg, nval, size); 1949 1950 DTRACE_PROBE4(vie__imul, 1951 const char *, vie_regnum_name(vie->reg, size), 1952 uint64_t, val1, uint64_t, val2, __uint128_t, nval); 1953 1954 break; 1955 default: 1956 break; 1957 } 1958 1959 if (error == 0) { 1960 rflags2 = getflags(size, val1, val2); 1961 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 1962 &rflags); 1963 if (error) 1964 return (error); 1965 1966 rflags &= ~RFLAGS_STATUS_BITS; 1967 rflags |= rflags2 & RFLAGS_STATUS_BITS; 1968 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 1969 rflags, 8); 1970 1971 DTRACE_PROBE2(vie__imul__rflags, 1972 uint64_t, rflags, uint64_t, rflags2); 1973 } 1974 1975 return (error); 1976 } 1977 1978 static int 1979 vie_emulate_stack_op(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 1980 { 1981 struct vm_copyinfo copyinfo[2]; 1982 struct seg_desc ss_desc; 1983 uint64_t cr0, rflags, rsp, stack_gla, val; 1984 int error, fault, size, stackaddrsize, pushop; 1985 struct vm_guest_paging *paging; 1986 1987 val = 0; 1988 size = vie->opsize; 1989 pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0; 1990 paging = &vie->paging; 1991 1992 /* 1993 * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 1994 */ 1995 if (paging->cpu_mode == CPU_MODE_REAL) { 1996 stackaddrsize = 2; 1997 } else if (paging->cpu_mode == CPU_MODE_64BIT) { 1998 /* 1999 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 2000 * - Stack pointer size is always 64-bits. 2001 * - PUSH/POP of 32-bit values is not possible in 64-bit mode. 2002 * - 16-bit PUSH/POP is supported by using the operand size 2003 * override prefix (66H). 2004 */ 2005 stackaddrsize = 8; 2006 size = vie->opsize_override ? 2 : 8; 2007 } else { 2008 /* 2009 * In protected or compatibility mode the 'B' flag in the 2010 * stack-segment descriptor determines the size of the 2011 * stack pointer. 2012 */ 2013 error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc); 2014 KASSERT(error == 0, ("%s: error %d getting SS descriptor", 2015 __func__, error)); 2016 if (SEG_DESC_DEF32(ss_desc.access)) 2017 stackaddrsize = 4; 2018 else 2019 stackaddrsize = 2; 2020 } 2021 2022 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); 2023 KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); 2024 2025 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 2026 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 2027 2028 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); 2029 KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); 2030 if (pushop) { 2031 rsp -= size; 2032 } 2033 2034 if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, 2035 rsp, size, stackaddrsize, pushop ? 
PROT_WRITE : PROT_READ, 2036 &stack_gla)) { 2037 vm_inject_ss(vm, vcpuid, 0); 2038 return (0); 2039 } 2040 2041 if (vie_canonical_check(paging->cpu_mode, stack_gla)) { 2042 vm_inject_ss(vm, vcpuid, 0); 2043 return (0); 2044 } 2045 2046 if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { 2047 vm_inject_ac(vm, vcpuid, 0); 2048 return (0); 2049 } 2050 2051 error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, 2052 pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo), 2053 &fault); 2054 if (error || fault) 2055 return (error); 2056 2057 if (pushop) { 2058 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size); 2059 if (error == 0) 2060 vm_copyout(vm, vcpuid, &val, copyinfo, size); 2061 } else { 2062 vm_copyin(vm, vcpuid, copyinfo, &val, size); 2063 error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); 2064 rsp += size; 2065 } 2066 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 2067 2068 if (error == 0) { 2069 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp, 2070 stackaddrsize); 2071 KASSERT(error == 0, ("error %d updating rsp", error)); 2072 } 2073 return (error); 2074 } 2075 2076 static int 2077 vie_emulate_push(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 2078 { 2079 int error; 2080 2081 /* 2082 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. 2083 * 2084 * PUSH is part of the group 5 extended opcodes and is identified 2085 * by ModRM:reg = b110. 2086 */ 2087 if ((vie->reg & 7) != 6) 2088 return (EINVAL); 2089 2090 error = vie_emulate_stack_op(vie, vm, vcpuid, gpa); 2091 return (error); 2092 } 2093 2094 static int 2095 vie_emulate_pop(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 2096 { 2097 int error; 2098 2099 /* 2100 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. 2101 * 2102 * POP is part of the group 1A extended opcodes and is identified 2103 * by ModRM:reg = b000. 2104 */ 2105 if ((vie->reg & 7) != 0) 2106 return (EINVAL); 2107 2108 error = vie_emulate_stack_op(vie, vm, vcpuid, gpa); 2109 return (error); 2110 } 2111 2112 static int 2113 vie_emulate_group1(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 2114 { 2115 int error; 2116 2117 switch (vie->reg & 7) { 2118 case 0x1: /* OR */ 2119 error = vie_emulate_or(vie, vm, vcpuid, gpa); 2120 break; 2121 case 0x4: /* AND */ 2122 error = vie_emulate_and(vie, vm, vcpuid, gpa); 2123 break; 2124 case 0x7: /* CMP */ 2125 error = vie_emulate_cmp(vie, vm, vcpuid, gpa); 2126 break; 2127 default: 2128 error = EINVAL; 2129 break; 2130 } 2131 2132 return (error); 2133 } 2134 2135 static int 2136 vie_emulate_bittest(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa) 2137 { 2138 uint64_t val, rflags; 2139 int error, bitmask, bitoff; 2140 2141 /* 2142 * 0F BA is a Group 8 extended opcode. 2143 * 2144 * Currently we only emulate the 'Bit Test' instruction which is 2145 * identified by a ModR/M:reg encoding of 100b. 
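	 *
	 * The bit offset is taken modulo the operand width below; e.g. with
	 * a 4-byte operand, bitmask is 31 and an immediate of 35 tests bit 3.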
2146 */ 2147 if ((vie->reg & 7) != 4) 2148 return (EINVAL); 2149 2150 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 2151 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 2152 2153 error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, vie->opsize); 2154 if (error) 2155 return (error); 2156 2157 /* 2158 * Intel SDM, Vol 2, Table 3-2: 2159 * "Range of Bit Positions Specified by Bit Offset Operands" 2160 */ 2161 bitmask = vie->opsize * 8 - 1; 2162 bitoff = vie->immediate & bitmask; 2163 2164 /* Copy the bit into the Carry flag in %rflags */ 2165 if (val & (1UL << bitoff)) 2166 rflags |= PSL_C; 2167 else 2168 rflags &= ~PSL_C; 2169 2170 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); 2171 KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error)); 2172 2173 return (0); 2174 } 2175 2176 static int 2177 vie_emulate_twob_group15(struct vie *vie, struct vm *vm, int vcpuid, 2178 uint64_t gpa) 2179 { 2180 int error; 2181 uint64_t buf; 2182 2183 switch (vie->reg & 7) { 2184 case 0x7: /* CLFLUSH, CLFLUSHOPT, and SFENCE */ 2185 if (vie->mod == 0x3) { 2186 /* 2187 * SFENCE. Ignore it, VM exit provides enough 2188 * barriers on its own. 2189 */ 2190 error = 0; 2191 } else { 2192 /* 2193 * CLFLUSH, CLFLUSHOPT. Only check for access 2194 * rights. 2195 */ 2196 error = vie_mmio_read(vie, vm, vcpuid, gpa, &buf, 1); 2197 } 2198 break; 2199 default: 2200 error = EINVAL; 2201 break; 2202 } 2203 2204 return (error); 2205 } 2206 2207 static int 2208 vie_emulate_clts(struct vie *vie, struct vm *vm, int vcpuid) 2209 { 2210 uint64_t val; 2211 int error __maybe_unused; 2212 2213 if (vie->paging.cpl != 0) { 2214 vm_inject_gp(vm, vcpuid); 2215 vie->num_processed = 0; 2216 return (0); 2217 } 2218 2219 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &val); 2220 ASSERT(error == 0); 2221 2222 /* Clear %cr0.TS */ 2223 val &= ~CR0_TS; 2224 2225 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, val); 2226 ASSERT(error == 0); 2227 2228 return (0); 2229 } 2230 2231 static int 2232 vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa, 2233 uint64_t *rval, int bytes) 2234 { 2235 int err; 2236 2237 if (vie->mmio_req_read.state == VR_DONE) { 2238 ASSERT(vie->mmio_req_read.bytes == bytes); 2239 ASSERT(vie->mmio_req_read.gpa == gpa); 2240 2241 *rval = vie->mmio_req_read.data; 2242 return (0); 2243 } 2244 2245 err = vm_service_mmio_read(vm, cpuid, gpa, rval, bytes); 2246 if (err == 0) { 2247 /* 2248 * A successful read from an in-kernel-emulated device may come 2249 * with side effects, so stash the result in case it's used for 2250 * an instruction which subsequently needs to issue an MMIO 2251 * write to userspace. 2252 */ 2253 ASSERT(vie->mmio_req_read.state == VR_NONE); 2254 2255 vie->mmio_req_read.bytes = bytes; 2256 vie->mmio_req_read.gpa = gpa; 2257 vie->mmio_req_read.data = *rval; 2258 vie->mmio_req_read.state = VR_DONE; 2259 2260 } else if (err == ESRCH) { 2261 /* Hope that userspace emulation can fulfill this read */ 2262 vie->mmio_req_read.bytes = bytes; 2263 vie->mmio_req_read.gpa = gpa; 2264 vie->mmio_req_read.state = VR_PENDING; 2265 vie->status |= VIES_PENDING_MMIO; 2266 } else if (err < 0) { 2267 /* 2268 * The MMIO read failed in such a way that fallback to handling 2269 * in userspace is required. 
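 * An ESRCH return, by contrast, only marks the read as pending; it is
 * later satisfied through vie_fulfill_mmio().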
2270 */ 2271 vie->status |= VIES_USER_FALLBACK; 2272 } 2273 return (err); 2274 } 2275 2276 static int 2277 vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa, 2278 uint64_t wval, int bytes) 2279 { 2280 int err; 2281 2282 if (vie->mmio_req_write.state == VR_DONE) { 2283 ASSERT(vie->mmio_req_write.bytes == bytes); 2284 ASSERT(vie->mmio_req_write.gpa == gpa); 2285 2286 return (0); 2287 } 2288 2289 err = vm_service_mmio_write(vm, cpuid, gpa, wval, bytes); 2290 if (err == 0) { 2291 /* 2292 * A successful write to an in-kernel-emulated device probably 2293 * results in side effects, so stash the fact that such a write 2294 * succeeded in case the operation requires other work. 2295 */ 2296 vie->mmio_req_write.bytes = bytes; 2297 vie->mmio_req_write.gpa = gpa; 2298 vie->mmio_req_write.data = wval; 2299 vie->mmio_req_write.state = VR_DONE; 2300 } else if (err == ESRCH) { 2301 /* Hope that userspace emulation can fulfill this write */ 2302 vie->mmio_req_write.bytes = bytes; 2303 vie->mmio_req_write.gpa = gpa; 2304 vie->mmio_req_write.data = wval; 2305 vie->mmio_req_write.state = VR_PENDING; 2306 vie->status |= VIES_PENDING_MMIO; 2307 } else if (err < 0) { 2308 /* 2309 * The MMIO write failed in such a way that fallback to handling 2310 * in userspace is required. 2311 */ 2312 vie->status |= VIES_USER_FALLBACK; 2313 } 2314 return (err); 2315 } 2316 2317 int 2318 vie_emulate_mmio(struct vie *vie, struct vm *vm, int vcpuid) 2319 { 2320 int error; 2321 uint64_t gpa; 2322 2323 if ((vie->status & (VIES_INST_DECODE | VIES_MMIO)) != 2324 (VIES_INST_DECODE | VIES_MMIO)) { 2325 return (EINVAL); 2326 } 2327 2328 gpa = vie->mmio_gpa; 2329 2330 switch (vie->op.op_type) { 2331 case VIE_OP_TYPE_GROUP1: 2332 error = vie_emulate_group1(vie, vm, vcpuid, gpa); 2333 break; 2334 case VIE_OP_TYPE_POP: 2335 error = vie_emulate_pop(vie, vm, vcpuid, gpa); 2336 break; 2337 case VIE_OP_TYPE_PUSH: 2338 error = vie_emulate_push(vie, vm, vcpuid, gpa); 2339 break; 2340 case VIE_OP_TYPE_CMP: 2341 error = vie_emulate_cmp(vie, vm, vcpuid, gpa); 2342 break; 2343 case VIE_OP_TYPE_MOV: 2344 error = vie_emulate_mov(vie, vm, vcpuid, gpa); 2345 break; 2346 case VIE_OP_TYPE_MOVSX: 2347 case VIE_OP_TYPE_MOVZX: 2348 error = vie_emulate_movx(vie, vm, vcpuid, gpa); 2349 break; 2350 case VIE_OP_TYPE_MOVS: 2351 error = vie_emulate_movs(vie, vm, vcpuid, gpa); 2352 break; 2353 case VIE_OP_TYPE_STOS: 2354 error = vie_emulate_stos(vie, vm, vcpuid, gpa); 2355 break; 2356 case VIE_OP_TYPE_AND: 2357 error = vie_emulate_and(vie, vm, vcpuid, gpa); 2358 break; 2359 case VIE_OP_TYPE_OR: 2360 error = vie_emulate_or(vie, vm, vcpuid, gpa); 2361 break; 2362 case VIE_OP_TYPE_SUB: 2363 error = vie_emulate_sub(vie, vm, vcpuid, gpa); 2364 break; 2365 case VIE_OP_TYPE_BITTEST: 2366 error = vie_emulate_bittest(vie, vm, vcpuid, gpa); 2367 break; 2368 case VIE_OP_TYPE_TWOB_GRP15: 2369 error = vie_emulate_twob_group15(vie, vm, vcpuid, gpa); 2370 break; 2371 case VIE_OP_TYPE_ADD: 2372 error = vie_emulate_add(vie, vm, vcpuid, gpa); 2373 break; 2374 case VIE_OP_TYPE_TEST: 2375 error = vie_emulate_test(vie, vm, vcpuid, gpa); 2376 break; 2377 case VIE_OP_TYPE_BEXTR: 2378 error = vie_emulate_bextr(vie, vm, vcpuid, gpa); 2379 break; 2380 case VIE_OP_TYPE_MUL: 2381 error = vie_emulate_mul(vie, vm, vcpuid, gpa); 2382 break; 2383 default: 2384 error = EINVAL; 2385 break; 2386 } 2387 2388 if (error == ESRCH) { 2389 /* Return to userspace with the mmio request */ 2390 return (-1); 2391 } 2392 2393 return (error); 2394 } 2395 2396 static int 2397 
vie_emulate_inout_port(struct vie *vie, struct vm *vm, int vcpuid, 2398 uint32_t *eax) 2399 { 2400 uint32_t mask, val; 2401 bool in; 2402 int err; 2403 2404 mask = vie_size2mask(vie->inout.bytes); 2405 in = (vie->inout.flags & INOUT_IN) != 0; 2406 2407 if (!in) { 2408 val = *eax & mask; 2409 } 2410 2411 if (vie->inout_req_state != VR_DONE) { 2412 err = vm_ioport_access(vm, vcpuid, in, vie->inout.port, 2413 vie->inout.bytes, &val); 2414 val &= mask; 2415 } else { 2416 /* 2417 * This port access was handled in userspace and the result was 2418 * injected in to be handled now. 2419 */ 2420 val = vie->inout_req_val & mask; 2421 vie->inout_req_state = VR_NONE; 2422 err = 0; 2423 } 2424 2425 if (err == ESRCH) { 2426 vie->status |= VIES_PENDING_INOUT; 2427 vie->inout_req_state = VR_PENDING; 2428 return (err); 2429 } else if (err != 0) { 2430 return (err); 2431 } 2432 2433 if (in) { 2434 *eax = (*eax & ~mask) | val; 2435 } 2436 return (0); 2437 } 2438 2439 static enum vm_reg_name 2440 vie_inout_segname(const struct vie *vie) 2441 { 2442 uint8_t segidx = vie->inout.segment; 2443 const enum vm_reg_name segmap[] = { 2444 VM_REG_GUEST_ES, 2445 VM_REG_GUEST_CS, 2446 VM_REG_GUEST_SS, 2447 VM_REG_GUEST_DS, 2448 VM_REG_GUEST_FS, 2449 VM_REG_GUEST_GS, 2450 }; 2451 const uint8_t maxidx = (sizeof (segmap) / sizeof (segmap[0])); 2452 2453 if (segidx >= maxidx) { 2454 panic("unexpected segment index %u", segidx); 2455 } 2456 return (segmap[segidx]); 2457 } 2458 2459 static int 2460 vie_emulate_inout_str(struct vie *vie, struct vm *vm, int vcpuid) 2461 { 2462 uint8_t bytes, addrsize; 2463 uint64_t index, count = 0, gla, rflags; 2464 int prot, err, fault; 2465 bool in, repeat; 2466 enum vm_reg_name seg_reg, idx_reg; 2467 struct vm_copyinfo copyinfo[2]; 2468 2469 in = (vie->inout.flags & INOUT_IN) != 0; 2470 bytes = vie->inout.bytes; 2471 addrsize = vie->inout.addrsize; 2472 prot = in ? PROT_WRITE : PROT_READ; 2473 2474 ASSERT(bytes == 1 || bytes == 2 || bytes == 4); 2475 ASSERT(addrsize == 2 || addrsize == 4 || addrsize == 8); 2476 2477 idx_reg = (in) ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; 2478 seg_reg = vie_inout_segname(vie); 2479 err = vm_get_register(vm, vcpuid, idx_reg, &index); 2480 ASSERT(err == 0); 2481 index = index & vie_size2mask(addrsize); 2482 2483 repeat = (vie->inout.flags & INOUT_REP) != 0; 2484 2485 /* Count register */ 2486 if (repeat) { 2487 err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &count); 2488 count &= vie_size2mask(addrsize); 2489 2490 if (count == 0) { 2491 /* 2492 * If we were asked to emulate a REP INS/OUTS when the 2493 * count register is zero, no further work is required. 2494 */ 2495 return (0); 2496 } 2497 } else { 2498 count = 1; 2499 } 2500 2501 gla = 0; 2502 if (vie_get_gla(vie, vm, vcpuid, bytes, addrsize, prot, seg_reg, 2503 idx_reg, &gla) != 0) { 2504 /* vie_get_gla() already injected the appropriate fault */ 2505 return (0); 2506 } 2507 2508 /* 2509 * The INS/OUTS emulate currently assumes that the memory target resides 2510 * within the guest system memory, rather than a device MMIO region. If 2511 * such a case becomes a necessity, that additional handling could be 2512 * put in place. 
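 *
 * Note that a repeated INS/OUTS is emulated one element per pass: the
 * index and count registers are updated below and vie_repeat() re-arms
 * the emulation until the count reaches zero.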
2513 */ 2514 err = vm_copy_setup(vm, vcpuid, &vie->paging, gla, bytes, prot, 2515 copyinfo, nitems(copyinfo), &fault); 2516 2517 if (err) { 2518 /* Unrecoverable error */ 2519 return (err); 2520 } else if (fault) { 2521 /* Resume guest to handle fault */ 2522 return (0); 2523 } 2524 2525 if (!in) { 2526 vm_copyin(vm, vcpuid, copyinfo, &vie->inout.eax, bytes); 2527 } 2528 2529 err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax); 2530 2531 if (err == 0 && in) { 2532 vm_copyout(vm, vcpuid, &vie->inout.eax, copyinfo, bytes); 2533 } 2534 2535 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 2536 2537 if (err == 0) { 2538 err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2539 &rflags); 2540 ASSERT(err == 0); 2541 2542 /* Update index */ 2543 if (rflags & PSL_D) { 2544 index -= bytes; 2545 } else { 2546 index += bytes; 2547 } 2548 2549 /* Update index register */ 2550 err = vie_update_register(vm, vcpuid, idx_reg, index, addrsize); 2551 ASSERT(err == 0); 2552 2553 /* 2554 * Update count register only if the instruction had a repeat 2555 * prefix. 2556 */ 2557 if ((vie->inout.flags & INOUT_REP) != 0) { 2558 count--; 2559 err = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, 2560 count, addrsize); 2561 ASSERT(err == 0); 2562 2563 if (count != 0) { 2564 return (vie_repeat(vie)); 2565 } 2566 } 2567 } 2568 2569 return (err); 2570 } 2571 2572 int 2573 vie_emulate_inout(struct vie *vie, struct vm *vm, int vcpuid) 2574 { 2575 int err = 0; 2576 2577 if ((vie->status & VIES_INOUT) == 0) { 2578 return (EINVAL); 2579 } 2580 2581 if ((vie->inout.flags & INOUT_STR) == 0) { 2582 /* 2583 * For now, using the 'rep' prefixes with plain (non-string) 2584 * in/out is not supported. 2585 */ 2586 if ((vie->inout.flags & INOUT_REP) != 0) { 2587 return (EINVAL); 2588 } 2589 2590 err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax); 2591 if (err == 0 && (vie->inout.flags & INOUT_IN) != 0) { 2592 /* 2593 * With the inX access now a success, the result needs 2594 * to be stored in the guest %rax. 2595 */ 2596 err = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, 2597 vie->inout.eax); 2598 VERIFY0(err); 2599 } 2600 } else { 2601 vie->status &= ~VIES_REPEAT; 2602 err = vie_emulate_inout_str(vie, vm, vcpuid); 2603 2604 } 2605 if (err < 0) { 2606 /* 2607 * Access to an I/O port failed in such a way that fallback to 2608 * handling in userspace is required. 
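 * An ESRCH return, by contrast, merely leaves the access pending for
 * fulfillment via vie_fulfill_inout().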
2609 */ 2610 vie->status |= VIES_USER_FALLBACK; 2611 } else if (err == ESRCH) { 2612 ASSERT(vie->status & VIES_PENDING_INOUT); 2613 /* Return to userspace with the in/out request */ 2614 err = -1; 2615 } 2616 2617 return (err); 2618 } 2619 2620 int 2621 vie_emulate_other(struct vie *vie, struct vm *vm, int vcpuid) 2622 { 2623 int error; 2624 2625 if ((vie->status & (VIES_INST_DECODE | VIES_OTHER)) != 2626 (VIES_INST_DECODE | VIES_OTHER)) { 2627 return (EINVAL); 2628 } 2629 2630 switch (vie->op.op_type) { 2631 case VIE_OP_TYPE_CLTS: 2632 error = vie_emulate_clts(vie, vm, vcpuid); 2633 break; 2634 case VIE_OP_TYPE_MOV_CR: 2635 error = vie_emulate_mov_cr(vie, vm, vcpuid); 2636 break; 2637 default: 2638 error = EINVAL; 2639 break; 2640 } 2641 2642 return (error); 2643 } 2644 2645 void 2646 vie_reset(struct vie *vie) 2647 { 2648 vie->status = 0; 2649 vie->num_processed = vie->num_valid = 0; 2650 } 2651 2652 void 2653 vie_advance_pc(struct vie *vie, uint64_t *nextrip) 2654 { 2655 VERIFY((vie->status & VIES_REPEAT) == 0); 2656 2657 *nextrip += vie->num_processed; 2658 vie_reset(vie); 2659 } 2660 2661 void 2662 vie_exitinfo(const struct vie *vie, struct vm_exit *vme) 2663 { 2664 if (vie->status & VIES_USER_FALLBACK) { 2665 /* 2666 * Despite the fact that the instruction was successfully 2667 * decoded, some aspect of the emulation failed in such a way 2668 * that it is left up to userspace to complete the operation. 2669 */ 2670 vie_fallback_exitinfo(vie, vme); 2671 } else if (vie->status & VIES_MMIO) { 2672 vme->exitcode = VM_EXITCODE_MMIO; 2673 if (vie->mmio_req_read.state == VR_PENDING) { 2674 vme->u.mmio.gpa = vie->mmio_req_read.gpa; 2675 vme->u.mmio.data = 0; 2676 vme->u.mmio.bytes = vie->mmio_req_read.bytes; 2677 vme->u.mmio.read = 1; 2678 } else if (vie->mmio_req_write.state == VR_PENDING) { 2679 vme->u.mmio.gpa = vie->mmio_req_write.gpa; 2680 vme->u.mmio.data = vie->mmio_req_write.data & 2681 vie_size2mask(vie->mmio_req_write.bytes); 2682 vme->u.mmio.bytes = vie->mmio_req_write.bytes; 2683 vme->u.mmio.read = 0; 2684 } else { 2685 panic("bad pending MMIO state"); 2686 } 2687 } else if (vie->status & VIES_INOUT) { 2688 vme->exitcode = VM_EXITCODE_INOUT; 2689 vme->u.inout.port = vie->inout.port; 2690 vme->u.inout.bytes = vie->inout.bytes; 2691 if ((vie->inout.flags & INOUT_IN) != 0) { 2692 vme->u.inout.flags = INOUT_IN; 2693 vme->u.inout.eax = 0; 2694 } else { 2695 vme->u.inout.flags = 0; 2696 vme->u.inout.eax = vie->inout.eax & 2697 vie_size2mask(vie->inout.bytes); 2698 } 2699 } else { 2700 panic("no pending operation"); 2701 } 2702 } 2703 2704 /* 2705 * In the case of a decoding or verification failure, bailing out to userspace 2706 * to do the instruction emulation is our only option for now. 
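 * The exit carries the already-fetched instruction bytes (and their count)
 * when available, so userspace can perform the decode itself.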
2707 */ 2708 void 2709 vie_fallback_exitinfo(const struct vie *vie, struct vm_exit *vme) 2710 { 2711 if ((vie->status & VIES_INST_FETCH) == 0) { 2712 bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul)); 2713 } else { 2714 ASSERT(sizeof (vie->inst) == sizeof (vme->u.inst_emul.inst)); 2715 2716 bcopy(vie->inst, vme->u.inst_emul.inst, sizeof (vie->inst)); 2717 vme->u.inst_emul.num_valid = vie->num_valid; 2718 } 2719 vme->exitcode = VM_EXITCODE_INST_EMUL; 2720 } 2721 2722 void 2723 vie_cs_info(const struct vie *vie, struct vm *vm, int vcpuid, uint64_t *cs_base, 2724 int *cs_d) 2725 { 2726 struct seg_desc cs_desc; 2727 int error __maybe_unused; 2728 2729 error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &cs_desc); 2730 ASSERT(error == 0); 2731 2732 /* Initialization required for the paging info to be populated */ 2733 VERIFY(vie->status & VIES_INIT); 2734 switch (vie->paging.cpu_mode) { 2735 case CPU_MODE_REAL: 2736 *cs_base = cs_desc.base; 2737 *cs_d = 0; 2738 break; 2739 case CPU_MODE_PROTECTED: 2740 case CPU_MODE_COMPATIBILITY: 2741 *cs_base = cs_desc.base; 2742 *cs_d = SEG_DESC_DEF32(cs_desc.access) ? 1 : 0; 2743 break; 2744 default: 2745 *cs_base = 0; 2746 *cs_d = 0; 2747 break; 2748 } 2749 } 2750 2751 bool 2752 vie_pending(const struct vie *vie) 2753 { 2754 /* 2755 * These VIE status bits indicate conditions which must be addressed 2756 * through either device IO fulfillment (with corresponding 2757 * vie_fulfill_*()) or complete userspace emulation (followed by a 2758 * vie_reset()). 2759 */ 2760 const enum vie_status of_interest = 2761 VIES_PENDING_MMIO | VIES_PENDING_INOUT | VIES_USER_FALLBACK; 2762 2763 return ((vie->status & of_interest) != 0); 2764 } 2765 2766 bool 2767 vie_needs_fetch(const struct vie *vie) 2768 { 2769 if (vie->status & VIES_INST_FETCH) { 2770 ASSERT(vie->num_valid != 0); 2771 return (false); 2772 } 2773 return (true); 2774 } 2775 2776 static int 2777 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) 2778 { 2779 KASSERT(size == 1 || size == 2 || size == 4 || size == 8, 2780 ("%s: invalid size %d", __func__, size)); 2781 KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); 2782 2783 if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) 2784 return (0); 2785 2786 return ((gla & (size - 1)) ? 1 : 0); 2787 } 2788 2789 static int 2790 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) 2791 { 2792 uint64_t mask; 2793 2794 if (cpu_mode != CPU_MODE_64BIT) 2795 return (0); 2796 2797 /* 2798 * The value of the bit 47 in the 'gla' should be replicated in the 2799 * most significant 16 bits. 
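 * For example, 0x00007fffffffffff and 0xffff800000000000 are canonical,
 * while 0x0000800000000000 is not (bit 47 is set but bits 48-63 are clear).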
2800 */ 2801 mask = ~((1UL << 48) - 1); 2802 if (gla & (1UL << 47)) 2803 return ((gla & mask) != mask); 2804 else 2805 return ((gla & mask) != 0); 2806 } 2807 2808 static uint64_t 2809 vie_size2mask(int size) 2810 { 2811 KASSERT(size == 1 || size == 2 || size == 4 || size == 8, 2812 ("vie_size2mask: invalid size %d", size)); 2813 return (size2mask[size]); 2814 } 2815 2816 static int 2817 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, 2818 struct seg_desc *desc, uint64_t offset, int length, int addrsize, 2819 int prot, uint64_t *gla) 2820 { 2821 uint64_t firstoff, low_limit, high_limit, segbase; 2822 int glasize, type; 2823 2824 KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, 2825 ("%s: invalid segment %d", __func__, seg)); 2826 KASSERT(length == 1 || length == 2 || length == 4 || length == 8, 2827 ("%s: invalid operand size %d", __func__, length)); 2828 KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, 2829 ("%s: invalid prot %x", __func__, prot)); 2830 2831 firstoff = offset; 2832 if (cpu_mode == CPU_MODE_64BIT) { 2833 KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " 2834 "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); 2835 glasize = 8; 2836 } else { 2837 KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " 2838 "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); 2839 glasize = 4; 2840 /* 2841 * If the segment selector is loaded with a NULL selector 2842 * then the descriptor is unusable and attempting to use 2843 * it results in a #GP(0). 2844 */ 2845 if (SEG_DESC_UNUSABLE(desc->access)) 2846 return (-1); 2847 2848 /* 2849 * The processor generates a #NP exception when a segment 2850 * register is loaded with a selector that points to a 2851 * descriptor that is not present. If this was the case then 2852 * it would have been checked before the VM-exit. 2853 */ 2854 KASSERT(SEG_DESC_PRESENT(desc->access), 2855 ("segment %d not present: %x", seg, desc->access)); 2856 2857 /* 2858 * The descriptor type must indicate a code/data segment. 2859 */ 2860 type = SEG_DESC_TYPE(desc->access); 2861 KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " 2862 "descriptor type %x", seg, type)); 2863 2864 if (prot & PROT_READ) { 2865 /* #GP on a read access to a exec-only code segment */ 2866 if ((type & 0xA) == 0x8) 2867 return (-1); 2868 } 2869 2870 if (prot & PROT_WRITE) { 2871 /* 2872 * #GP on a write access to a code segment or a 2873 * read-only data segment. 2874 */ 2875 if (type & 0x8) /* code segment */ 2876 return (-1); 2877 2878 if ((type & 0xA) == 0) /* read-only data seg */ 2879 return (-1); 2880 } 2881 2882 /* 2883 * 'desc->limit' is fully expanded taking granularity into 2884 * account. 2885 */ 2886 if ((type & 0xC) == 0x4) { 2887 /* expand-down data segment */ 2888 low_limit = desc->limit + 1; 2889 high_limit = SEG_DESC_DEF32(desc->access) ? 2890 0xffffffff : 0xffff; 2891 } else { 2892 /* code segment or expand-up data segment */ 2893 low_limit = 0; 2894 high_limit = desc->limit; 2895 } 2896 2897 while (length > 0) { 2898 offset &= vie_size2mask(addrsize); 2899 if (offset < low_limit || offset > high_limit) 2900 return (-1); 2901 offset++; 2902 length--; 2903 } 2904 } 2905 2906 /* 2907 * In 64-bit mode all segments except %fs and %gs have a segment 2908 * base address of 0. 
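 * For example (illustrative values): a %gs-relative access with a %gs base
 * of 0xffff800000001000 and an effective offset of 0x30 yields a GLA of
 * 0xffff800000001030, while the same offset through %ds yields just 0x30.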
2909 */ 2910 if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && 2911 seg != VM_REG_GUEST_GS) { 2912 segbase = 0; 2913 } else { 2914 segbase = desc->base; 2915 } 2916 2917 /* 2918 * Truncate 'firstoff' to the effective address size before adding 2919 * it to the segment base. 2920 */ 2921 firstoff &= vie_size2mask(addrsize); 2922 *gla = (segbase + firstoff) & vie_size2mask(glasize); 2923 return (0); 2924 } 2925 2926 void 2927 vie_init_mmio(struct vie *vie, const char *inst_bytes, uint8_t inst_length, 2928 const struct vm_guest_paging *paging, uint64_t gpa) 2929 { 2930 KASSERT(inst_length <= VIE_INST_SIZE, 2931 ("%s: invalid instruction length (%d)", __func__, inst_length)); 2932 2933 bzero(vie, sizeof (struct vie)); 2934 2935 vie->base_register = VM_REG_LAST; 2936 vie->index_register = VM_REG_LAST; 2937 vie->segment_register = VM_REG_LAST; 2938 vie->status = VIES_INIT | VIES_MMIO; 2939 2940 if (inst_length != 0) { 2941 bcopy(inst_bytes, vie->inst, inst_length); 2942 vie->num_valid = inst_length; 2943 vie->status |= VIES_INST_FETCH; 2944 } 2945 2946 vie->paging = *paging; 2947 vie->mmio_gpa = gpa; 2948 } 2949 2950 void 2951 vie_init_inout(struct vie *vie, const struct vm_inout *inout, uint8_t inst_len, 2952 const struct vm_guest_paging *paging) 2953 { 2954 bzero(vie, sizeof (struct vie)); 2955 2956 vie->status = VIES_INIT | VIES_INOUT; 2957 2958 vie->inout = *inout; 2959 vie->paging = *paging; 2960 2961 /* 2962 * Since VMX/SVM assists already decoded the nature of the in/out 2963 * instruction, let the status reflect that. 2964 */ 2965 vie->status |= VIES_INST_FETCH | VIES_INST_DECODE; 2966 vie->num_processed = inst_len; 2967 } 2968 2969 void 2970 vie_init_other(struct vie *vie, const struct vm_guest_paging *paging) 2971 { 2972 bzero(vie, sizeof (struct vie)); 2973 2974 vie->base_register = VM_REG_LAST; 2975 vie->index_register = VM_REG_LAST; 2976 vie->segment_register = VM_REG_LAST; 2977 vie->status = VIES_INIT | VIES_OTHER; 2978 2979 vie->paging = *paging; 2980 } 2981 2982 int 2983 vie_fulfill_mmio(struct vie *vie, const struct vm_mmio *result) 2984 { 2985 struct vie_mmio *pending; 2986 2987 if ((vie->status & VIES_MMIO) == 0 || 2988 (vie->status & VIES_PENDING_MMIO) == 0) { 2989 return (EINVAL); 2990 } 2991 2992 if (result->read) { 2993 pending = &vie->mmio_req_read; 2994 } else { 2995 pending = &vie->mmio_req_write; 2996 } 2997 2998 if (pending->state != VR_PENDING || 2999 pending->bytes != result->bytes || pending->gpa != result->gpa) { 3000 return (EINVAL); 3001 } 3002 3003 if (result->read) { 3004 pending->data = result->data & vie_size2mask(pending->bytes); 3005 } 3006 pending->state = VR_DONE; 3007 vie->status &= ~VIES_PENDING_MMIO; 3008 3009 return (0); 3010 } 3011 3012 int 3013 vie_fulfill_inout(struct vie *vie, const struct vm_inout *result) 3014 { 3015 if ((vie->status & VIES_INOUT) == 0 || 3016 (vie->status & VIES_PENDING_INOUT) == 0) { 3017 return (EINVAL); 3018 } 3019 if ((vie->inout.flags & INOUT_IN) != (result->flags & INOUT_IN) || 3020 vie->inout.bytes != result->bytes || 3021 vie->inout.port != result->port) { 3022 return (EINVAL); 3023 } 3024 3025 if (result->flags & INOUT_IN) { 3026 vie->inout_req_val = result->eax & 3027 vie_size2mask(vie->inout.bytes); 3028 } 3029 vie->inout_req_state = VR_DONE; 3030 vie->status &= ~(VIES_PENDING_INOUT); 3031 3032 return (0); 3033 } 3034 3035 uint64_t 3036 vie_mmio_gpa(const struct vie *vie) 3037 { 3038 return (vie->mmio_gpa); 3039 } 3040 3041 static int 3042 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) 3043 
{ 3044 int error_code = 0; 3045 3046 if (pte & PG_V) 3047 error_code |= PGEX_P; 3048 if (prot & PROT_WRITE) 3049 error_code |= PGEX_W; 3050 if (usermode) 3051 error_code |= PGEX_U; 3052 if (rsvd) 3053 error_code |= PGEX_RSV; 3054 if (prot & PROT_EXEC) 3055 error_code |= PGEX_I; 3056 3057 return (error_code); 3058 } 3059 3060 static void 3061 ptp_release(vm_page_t **vmp) 3062 { 3063 if (*vmp != NULL) { 3064 (void) vmp_release(*vmp); 3065 *vmp = NULL; 3066 } 3067 } 3068 3069 static void * 3070 ptp_hold(struct vm *vm, int vcpu, uintptr_t gpa, size_t len, vm_page_t **vmp) 3071 { 3072 vm_client_t *vmc = vm_get_vmclient(vm, vcpu); 3073 const uintptr_t hold_gpa = gpa & PAGEMASK; 3074 3075 /* Hold must not cross a page boundary */ 3076 VERIFY3U(gpa + len, <=, hold_gpa + PAGESIZE); 3077 3078 if (*vmp != NULL) { 3079 (void) vmp_release(*vmp); 3080 } 3081 3082 *vmp = vmc_hold(vmc, hold_gpa, PROT_READ | PROT_WRITE); 3083 if (*vmp == NULL) { 3084 return (NULL); 3085 } 3086 3087 return ((caddr_t)vmp_get_writable(*vmp) + (gpa - hold_gpa)); 3088 } 3089 3090 static int 3091 _vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3092 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only) 3093 { 3094 int nlevels, pfcode; 3095 int ptpshift = 0, ptpindex = 0; 3096 uint64_t ptpphys; 3097 uint64_t *ptpbase = NULL, pte = 0, pgsize = 0; 3098 vm_page_t *cookie = NULL; 3099 const bool usermode = paging->cpl == 3; 3100 const bool writable = (prot & PROT_WRITE) != 0; 3101 3102 *guest_fault = 0; 3103 restart: 3104 ptpphys = paging->cr3; /* root of the page tables */ 3105 ptp_release(&cookie); 3106 3107 if (vie_canonical_check(paging->cpu_mode, gla)) { 3108 /* 3109 * XXX assuming a non-stack reference otherwise a stack fault 3110 * should be generated. 3111 */ 3112 if (!check_only) 3113 vm_inject_gp(vm, vcpuid); 3114 *guest_fault = 1; 3115 return (0); 3116 } 3117 3118 if (paging->paging_mode == PAGING_MODE_FLAT) { 3119 *gpa = gla; 3120 return (0); 3121 } 3122 3123 if (paging->paging_mode == PAGING_MODE_32) { 3124 uint32_t *ptpbase32, pte32; 3125 3126 nlevels = 2; 3127 while (--nlevels >= 0) { 3128 /* Zero out the lower 12 bits. */ 3129 ptpphys &= ~0xfff; 3130 3131 ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, 3132 &cookie); 3133 3134 if (ptpbase32 == NULL) { 3135 return (EFAULT); 3136 } 3137 3138 ptpshift = PAGE_SHIFT + nlevels * 10; 3139 ptpindex = (gla >> ptpshift) & 0x3FF; 3140 pgsize = 1UL << ptpshift; 3141 3142 pte32 = ptpbase32[ptpindex]; 3143 3144 if ((pte32 & PG_V) == 0 || 3145 (usermode && (pte32 & PG_U) == 0) || 3146 (writable && (pte32 & PG_RW) == 0)) { 3147 if (!check_only) { 3148 pfcode = pf_error_code(usermode, prot, 3149 0, pte32); 3150 vm_inject_pf(vm, vcpuid, pfcode, gla); 3151 } 3152 3153 ptp_release(&cookie); 3154 *guest_fault = 1; 3155 return (0); 3156 } 3157 3158 /* 3159 * Emulate the x86 MMU's management of the accessed 3160 * and dirty flags. While the accessed flag is set 3161 * at every level of the page table, the dirty flag 3162 * is only set at the last level providing the guest 3163 * physical address. 
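 * The updates are performed with compare-and-swap, so a
 * racing change to the entry simply restarts the walk.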
3164 */ 3165 if (!check_only && (pte32 & PG_A) == 0) { 3166 if (atomic_cmpset_32(&ptpbase32[ptpindex], 3167 pte32, pte32 | PG_A) == 0) { 3168 goto restart; 3169 } 3170 } 3171 3172 /* XXX must be ignored if CR4.PSE=0 */ 3173 if (nlevels > 0 && (pte32 & PG_PS) != 0) 3174 break; 3175 3176 ptpphys = pte32; 3177 } 3178 3179 /* Set the dirty bit in the page table entry if necessary */ 3180 if (!check_only && writable && (pte32 & PG_M) == 0) { 3181 if (atomic_cmpset_32(&ptpbase32[ptpindex], 3182 pte32, pte32 | PG_M) == 0) { 3183 goto restart; 3184 } 3185 } 3186 3187 /* Zero out the lower 'ptpshift' bits */ 3188 pte32 >>= ptpshift; pte32 <<= ptpshift; 3189 *gpa = pte32 | (gla & (pgsize - 1)); 3190 ptp_release(&cookie); 3191 return (0); 3192 } 3193 3194 if (paging->paging_mode == PAGING_MODE_PAE) { 3195 /* Zero out the lower 5 bits and the upper 32 bits */ 3196 ptpphys &= 0xffffffe0UL; 3197 3198 ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof (*ptpbase) * 4, 3199 &cookie); 3200 if (ptpbase == NULL) { 3201 return (EFAULT); 3202 } 3203 3204 ptpindex = (gla >> 30) & 0x3; 3205 3206 pte = ptpbase[ptpindex]; 3207 3208 if ((pte & PG_V) == 0) { 3209 if (!check_only) { 3210 pfcode = pf_error_code(usermode, prot, 0, pte); 3211 vm_inject_pf(vm, vcpuid, pfcode, gla); 3212 } 3213 3214 ptp_release(&cookie); 3215 *guest_fault = 1; 3216 return (0); 3217 } 3218 3219 ptpphys = pte; 3220 3221 nlevels = 2; 3222 } else { 3223 nlevels = 4; 3224 } 3225 3226 while (--nlevels >= 0) { 3227 /* Zero out the lower 12 bits and the upper 12 bits */ 3228 ptpphys &= 0x000ffffffffff000UL; 3229 3230 ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie); 3231 if (ptpbase == NULL) { 3232 return (EFAULT); 3233 } 3234 3235 ptpshift = PAGE_SHIFT + nlevels * 9; 3236 ptpindex = (gla >> ptpshift) & 0x1FF; 3237 pgsize = 1UL << ptpshift; 3238 3239 pte = ptpbase[ptpindex]; 3240 3241 if ((pte & PG_V) == 0 || 3242 (usermode && (pte & PG_U) == 0) || 3243 (writable && (pte & PG_RW) == 0)) { 3244 if (!check_only) { 3245 pfcode = pf_error_code(usermode, prot, 0, pte); 3246 vm_inject_pf(vm, vcpuid, pfcode, gla); 3247 } 3248 3249 ptp_release(&cookie); 3250 *guest_fault = 1; 3251 return (0); 3252 } 3253 3254 /* Set the accessed bit in the page table entry */ 3255 if (!check_only && (pte & PG_A) == 0) { 3256 if (atomic_cmpset_64(&ptpbase[ptpindex], 3257 pte, pte | PG_A) == 0) { 3258 goto restart; 3259 } 3260 } 3261 3262 if (nlevels > 0 && (pte & PG_PS) != 0) { 3263 if (pgsize > 1 * GB) { 3264 if (!check_only) { 3265 pfcode = pf_error_code(usermode, prot, 3266 1, pte); 3267 vm_inject_pf(vm, vcpuid, pfcode, gla); 3268 } 3269 3270 ptp_release(&cookie); 3271 *guest_fault = 1; 3272 return (0); 3273 } 3274 break; 3275 } 3276 3277 ptpphys = pte; 3278 } 3279 3280 /* Set the dirty bit in the page table entry if necessary */ 3281 if (!check_only && writable && (pte & PG_M) == 0) { 3282 if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) 3283 goto restart; 3284 } 3285 ptp_release(&cookie); 3286 3287 /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ 3288 pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; 3289 *gpa = pte | (gla & (pgsize - 1)); 3290 return (0); 3291 } 3292 3293 int 3294 vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3295 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) 3296 { 3297 3298 return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, 3299 false)); 3300 } 3301 3302 int 3303 vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3304 uint64_t 
gla, int prot, uint64_t *gpa, int *guest_fault) 3305 { 3306 3307 return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, 3308 true)); 3309 } 3310 3311 int 3312 vie_fetch_instruction(struct vie *vie, struct vm *vm, int vcpuid, uint64_t rip, 3313 int *faultptr) 3314 { 3315 struct vm_copyinfo copyinfo[2]; 3316 int error, prot; 3317 3318 if ((vie->status & VIES_INIT) == 0) { 3319 return (EINVAL); 3320 } 3321 3322 prot = PROT_READ | PROT_EXEC; 3323 error = vm_copy_setup(vm, vcpuid, &vie->paging, rip, VIE_INST_SIZE, 3324 prot, copyinfo, nitems(copyinfo), faultptr); 3325 if (error || *faultptr) 3326 return (error); 3327 3328 vm_copyin(vm, vcpuid, copyinfo, vie->inst, VIE_INST_SIZE); 3329 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 3330 vie->num_valid = VIE_INST_SIZE; 3331 vie->status |= VIES_INST_FETCH; 3332 return (0); 3333 } 3334 3335 static int 3336 vie_peek(struct vie *vie, uint8_t *x) 3337 { 3338 3339 if (vie->num_processed < vie->num_valid) { 3340 *x = vie->inst[vie->num_processed]; 3341 return (0); 3342 } else 3343 return (-1); 3344 } 3345 3346 static void 3347 vie_advance(struct vie *vie) 3348 { 3349 3350 vie->num_processed++; 3351 } 3352 3353 static bool 3354 segment_override(uint8_t x, int *seg) 3355 { 3356 3357 switch (x) { 3358 case 0x2E: 3359 *seg = VM_REG_GUEST_CS; 3360 break; 3361 case 0x36: 3362 *seg = VM_REG_GUEST_SS; 3363 break; 3364 case 0x3E: 3365 *seg = VM_REG_GUEST_DS; 3366 break; 3367 case 0x26: 3368 *seg = VM_REG_GUEST_ES; 3369 break; 3370 case 0x64: 3371 *seg = VM_REG_GUEST_FS; 3372 break; 3373 case 0x65: 3374 *seg = VM_REG_GUEST_GS; 3375 break; 3376 default: 3377 return (false); 3378 } 3379 return (true); 3380 } 3381 3382 static int 3383 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) 3384 { 3385 uint8_t x; 3386 3387 while (1) { 3388 if (vie_peek(vie, &x)) 3389 return (-1); 3390 3391 if (x == 0x66) 3392 vie->opsize_override = 1; 3393 else if (x == 0x67) 3394 vie->addrsize_override = 1; 3395 else if (x == 0xF3) 3396 vie->repz_present = 1; 3397 else if (x == 0xF2) 3398 vie->repnz_present = 1; 3399 else if (segment_override(x, &vie->segment_register)) 3400 vie->segment_override = 1; 3401 else 3402 break; 3403 3404 vie_advance(vie); 3405 } 3406 3407 /* 3408 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: 3409 * - Only one REX prefix is allowed per instruction. 3410 * - The REX prefix must immediately precede the opcode byte or the 3411 * escape opcode byte. 3412 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) 3413 * the mandatory prefix must come before the REX prefix. 3414 */ 3415 if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { 3416 vie->rex_present = 1; 3417 vie->rex_w = x & 0x8 ? 1 : 0; 3418 vie->rex_r = x & 0x4 ? 1 : 0; 3419 vie->rex_x = x & 0x2 ? 1 : 0; 3420 vie->rex_b = x & 0x1 ? 1 : 0; 3421 vie_advance(vie); 3422 } 3423 3424 /* 3425 * § 2.3.5, "The VEX Prefix", SDM Vol 2. 3426 */ 3427 if ((cpu_mode == CPU_MODE_64BIT || 3428 cpu_mode == CPU_MODE_COMPATIBILITY) && x == 0xC4) { 3429 const struct vie_op *optab; 3430 3431 /* 3-byte VEX prefix. */ 3432 vie->vex_present = 1; 3433 3434 vie_advance(vie); 3435 if (vie_peek(vie, &x)) 3436 return (-1); 3437 3438 /* 3439 * 2nd byte: [R', X', B', mmmmm[4:0]]. Bits are inverted 3440 * relative to REX encoding. 3441 */ 3442 vie->rex_r = x & 0x80 ? 0 : 1; 3443 vie->rex_x = x & 0x40 ? 0 : 1; 3444 vie->rex_b = x & 0x20 ? 0 : 1; 3445 3446 switch (x & 0x1F) { 3447 case 0x2: 3448 /* 0F 38. 
*/ 3449 optab = three_byte_opcodes_0f38; 3450 break; 3451 case 0x1: 3452 /* 0F class - nothing handled here yet. */ 3453 /* FALLTHROUGH */ 3454 case 0x3: 3455 /* 0F 3A class - nothing handled here yet. */ 3456 /* FALLTHROUGH */ 3457 default: 3458 /* Reserved (#UD). */ 3459 return (-1); 3460 } 3461 3462 vie_advance(vie); 3463 if (vie_peek(vie, &x)) 3464 return (-1); 3465 3466 /* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */ 3467 vie->rex_w = x & 0x80 ? 1 : 0; 3468 3469 vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3); 3470 vie->vex_l = !!(x & 0x4); 3471 vie->vex_pp = (x & 0x3); 3472 3473 /* PP: 1=66 2=F3 3=F2 prefixes. */ 3474 switch (vie->vex_pp) { 3475 case 0x1: 3476 vie->opsize_override = 1; 3477 break; 3478 case 0x2: 3479 vie->repz_present = 1; 3480 break; 3481 case 0x3: 3482 vie->repnz_present = 1; 3483 break; 3484 } 3485 3486 vie_advance(vie); 3487 3488 /* Opcode, sans literal prefix prefix. */ 3489 if (vie_peek(vie, &x)) 3490 return (-1); 3491 3492 vie->op = optab[x]; 3493 if (vie->op.op_type == VIE_OP_TYPE_NONE) 3494 return (-1); 3495 3496 vie_advance(vie); 3497 } 3498 3499 /* 3500 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 3501 */ 3502 if (cpu_mode == CPU_MODE_64BIT) { 3503 /* 3504 * Default address size is 64-bits and default operand size 3505 * is 32-bits. 3506 */ 3507 vie->addrsize = vie->addrsize_override ? 4 : 8; 3508 if (vie->rex_w) 3509 vie->opsize = 8; 3510 else if (vie->opsize_override) 3511 vie->opsize = 2; 3512 else 3513 vie->opsize = 4; 3514 } else if (cs_d) { 3515 /* Default address and operand sizes are 32-bits */ 3516 vie->addrsize = vie->addrsize_override ? 2 : 4; 3517 vie->opsize = vie->opsize_override ? 2 : 4; 3518 } else { 3519 /* Default address and operand sizes are 16-bits */ 3520 vie->addrsize = vie->addrsize_override ? 4 : 2; 3521 vie->opsize = vie->opsize_override ? 4 : 2; 3522 } 3523 return (0); 3524 } 3525 3526 static int 3527 decode_two_byte_opcode(struct vie *vie) 3528 { 3529 uint8_t x; 3530 3531 if (vie_peek(vie, &x)) 3532 return (-1); 3533 3534 vie->op = two_byte_opcodes[x]; 3535 3536 if (vie->op.op_type == VIE_OP_TYPE_NONE) 3537 return (-1); 3538 3539 vie_advance(vie); 3540 return (0); 3541 } 3542 3543 static int 3544 decode_opcode(struct vie *vie) 3545 { 3546 uint8_t x; 3547 3548 if (vie_peek(vie, &x)) 3549 return (-1); 3550 3551 /* Already did this via VEX prefix. */ 3552 if (vie->op.op_type != VIE_OP_TYPE_NONE) 3553 return (0); 3554 3555 vie->op = one_byte_opcodes[x]; 3556 3557 if (vie->op.op_type == VIE_OP_TYPE_NONE) 3558 return (-1); 3559 3560 vie_advance(vie); 3561 3562 if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) 3563 return (decode_two_byte_opcode(vie)); 3564 3565 return (0); 3566 } 3567 3568 static int 3569 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) 3570 { 3571 uint8_t x; 3572 /* 3573 * Handling mov-to/from-cr is special since it is not issuing 3574 * mmio/pio requests and can be done in real mode. We must bypass some 3575 * of the other existing decoding restrictions for it. 3576 */ 3577 const bool is_movcr = ((vie->op.op_flags & VIE_OP_F_REG_REG) != 0); 3578 3579 if (vie->op.op_flags & VIE_OP_F_NO_MODRM) 3580 return (0); 3581 3582 if (cpu_mode == CPU_MODE_REAL && !is_movcr) 3583 return (-1); 3584 3585 if (vie_peek(vie, &x)) 3586 return (-1); 3587 3588 vie->mod = (x >> 6) & 0x3; 3589 vie->rm = (x >> 0) & 0x7; 3590 vie->reg = (x >> 3) & 0x7; 3591 3592 /* 3593 * A direct addressing mode makes no sense in the context of an EPT 3594 * fault. There has to be a memory access involved to cause the 3595 * EPT fault. 
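 * For example, mod=3 (VIE_MOD_DIRECT) encodes a register-to-register form
 * with no memory operand, so it is rejected below unless the instruction
 * is a mov to/from a control register.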
3596 */ 3597 if (vie->mod == VIE_MOD_DIRECT && !is_movcr) 3598 return (-1); 3599 3600 if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || 3601 (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { 3602 /* 3603 * Table 2-5: Special Cases of REX Encodings 3604 * 3605 * mod=0, r/m=5 is used in the compatibility mode to 3606 * indicate a disp32 without a base register. 3607 * 3608 * mod!=3, r/m=4 is used in the compatibility mode to 3609 * indicate that the SIB byte is present. 3610 * 3611 * The 'b' bit in the REX prefix is don't care in 3612 * this case. 3613 */ 3614 } else { 3615 vie->rm |= (vie->rex_b << 3); 3616 } 3617 3618 vie->reg |= (vie->rex_r << 3); 3619 3620 /* SIB */ 3621 if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) 3622 goto done; 3623 3624 vie->base_register = gpr_map[vie->rm]; 3625 3626 switch (vie->mod) { 3627 case VIE_MOD_INDIRECT_DISP8: 3628 vie->disp_bytes = 1; 3629 break; 3630 case VIE_MOD_INDIRECT_DISP32: 3631 vie->disp_bytes = 4; 3632 break; 3633 case VIE_MOD_INDIRECT: 3634 if (vie->rm == VIE_RM_DISP32) { 3635 vie->disp_bytes = 4; 3636 /* 3637 * Table 2-7. RIP-Relative Addressing 3638 * 3639 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32 3640 * whereas in compatibility mode it just implies disp32. 3641 */ 3642 3643 if (cpu_mode == CPU_MODE_64BIT) 3644 vie->base_register = VM_REG_GUEST_RIP; 3645 else 3646 vie->base_register = VM_REG_LAST; 3647 } 3648 break; 3649 } 3650 3651 done: 3652 vie_advance(vie); 3653 3654 return (0); 3655 } 3656 3657 static int 3658 decode_sib(struct vie *vie) 3659 { 3660 uint8_t x; 3661 3662 /* Proceed only if SIB byte is present */ 3663 if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) 3664 return (0); 3665 3666 if (vie_peek(vie, &x)) 3667 return (-1); 3668 3669 /* De-construct the SIB byte */ 3670 vie->ss = (x >> 6) & 0x3; 3671 vie->index = (x >> 3) & 0x7; 3672 vie->base = (x >> 0) & 0x7; 3673 3674 /* Apply the REX prefix modifiers */ 3675 vie->index |= vie->rex_x << 3; 3676 vie->base |= vie->rex_b << 3; 3677 3678 switch (vie->mod) { 3679 case VIE_MOD_INDIRECT_DISP8: 3680 vie->disp_bytes = 1; 3681 break; 3682 case VIE_MOD_INDIRECT_DISP32: 3683 vie->disp_bytes = 4; 3684 break; 3685 } 3686 3687 if (vie->mod == VIE_MOD_INDIRECT && 3688 (vie->base == 5 || vie->base == 13)) { 3689 /* 3690 * Special case when base register is unused if mod = 0 3691 * and base = %rbp or %r13. 3692 * 3693 * Documented in: 3694 * Table 2-3: 32-bit Addressing Forms with the SIB Byte 3695 * Table 2-5: Special Cases of REX Encodings 3696 */ 3697 vie->disp_bytes = 4; 3698 } else { 3699 vie->base_register = gpr_map[vie->base]; 3700 } 3701 3702 /* 3703 * All encodings of 'index' are valid except for %rsp (4). 
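 * (An index encoding of 4 with REX.X clear means 'no index register';
 * %r12 remains usable as an index via REX.X = 1.)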
3704 * 3705 * Documented in: 3706 * Table 2-3: 32-bit Addressing Forms with the SIB Byte 3707 * Table 2-5: Special Cases of REX Encodings 3708 */ 3709 if (vie->index != 4) 3710 vie->index_register = gpr_map[vie->index]; 3711 3712 /* 'scale' makes sense only in the context of an index register */ 3713 if (vie->index_register < VM_REG_LAST) 3714 vie->scale = 1 << vie->ss; 3715 3716 vie_advance(vie); 3717 3718 return (0); 3719 } 3720 3721 static int 3722 decode_displacement(struct vie *vie) 3723 { 3724 int n, i; 3725 uint8_t x; 3726 3727 union { 3728 char buf[4]; 3729 int8_t signed8; 3730 int32_t signed32; 3731 } u; 3732 3733 if ((n = vie->disp_bytes) == 0) 3734 return (0); 3735 3736 if (n != 1 && n != 4) 3737 panic("decode_displacement: invalid disp_bytes %d", n); 3738 3739 for (i = 0; i < n; i++) { 3740 if (vie_peek(vie, &x)) 3741 return (-1); 3742 3743 u.buf[i] = x; 3744 vie_advance(vie); 3745 } 3746 3747 if (n == 1) 3748 vie->displacement = u.signed8; /* sign-extended */ 3749 else 3750 vie->displacement = u.signed32; /* sign-extended */ 3751 3752 return (0); 3753 } 3754 3755 static int 3756 decode_immediate(struct vie *vie) 3757 { 3758 int i, n; 3759 uint8_t x; 3760 union { 3761 char buf[4]; 3762 int8_t signed8; 3763 int16_t signed16; 3764 int32_t signed32; 3765 } u; 3766 3767 /* Figure out immediate operand size (if any) */ 3768 if (vie->op.op_flags & VIE_OP_F_IMM) { 3769 /* 3770 * Section 2.2.1.5 "Immediates", Intel SDM: 3771 * In 64-bit mode the typical size of immediate operands 3772 * remains 32-bits. When the operand size if 64-bits, the 3773 * processor sign-extends all immediates to 64-bits prior 3774 * to their use. 3775 */ 3776 if (vie->opsize == 4 || vie->opsize == 8) 3777 vie->imm_bytes = 4; 3778 else 3779 vie->imm_bytes = 2; 3780 } else if (vie->op.op_flags & VIE_OP_F_IMM8) { 3781 vie->imm_bytes = 1; 3782 } 3783 3784 if ((n = vie->imm_bytes) == 0) 3785 return (0); 3786 3787 KASSERT(n == 1 || n == 2 || n == 4, 3788 ("%s: invalid number of immediate bytes: %d", __func__, n)); 3789 3790 for (i = 0; i < n; i++) { 3791 if (vie_peek(vie, &x)) 3792 return (-1); 3793 3794 u.buf[i] = x; 3795 vie_advance(vie); 3796 } 3797 3798 /* sign-extend the immediate value before use */ 3799 if (n == 1) 3800 vie->immediate = u.signed8; 3801 else if (n == 2) 3802 vie->immediate = u.signed16; 3803 else 3804 vie->immediate = u.signed32; 3805 3806 return (0); 3807 } 3808 3809 static int 3810 decode_moffset(struct vie *vie) 3811 { 3812 int i, n; 3813 uint8_t x; 3814 union { 3815 char buf[8]; 3816 uint64_t u64; 3817 } u; 3818 3819 if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) 3820 return (0); 3821 3822 /* 3823 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: 3824 * The memory offset size follows the address-size of the instruction. 3825 */ 3826 n = vie->addrsize; 3827 KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); 3828 3829 u.u64 = 0; 3830 for (i = 0; i < n; i++) { 3831 if (vie_peek(vie, &x)) 3832 return (-1); 3833 3834 u.buf[i] = x; 3835 vie_advance(vie); 3836 } 3837 vie->displacement = u.u64; 3838 return (0); 3839 } 3840 3841 /* 3842 * Verify that the 'guest linear address' provided as collateral of the nested 3843 * page table fault matches with our instruction decoding. 
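 * Concretely, the check recomputes segbase + base + scale * index +
 * displacement, truncates the sum to the effective address size, and
 * compares it with the GLA reported by hardware.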
3844 */ 3845 int 3846 vie_verify_gla(struct vie *vie, struct vm *vm, int cpuid, uint64_t gla) 3847 { 3848 int error; 3849 uint64_t base, segbase, idx, gla2; 3850 enum vm_reg_name seg; 3851 struct seg_desc desc; 3852 3853 ASSERT((vie->status & VIES_INST_DECODE) != 0); 3854 3855 /* 3856 * If there was no valid GLA context with the exit, or the decoded 3857 * instruction acts on more than one address, verification is done. 3858 */ 3859 if (gla == VIE_INVALID_GLA || 3860 (vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) != 0) { 3861 return (0); 3862 } 3863 3864 base = 0; 3865 if (vie->base_register != VM_REG_LAST) { 3866 error = vm_get_register(vm, cpuid, vie->base_register, &base); 3867 if (error) { 3868 printf("verify_gla: error %d getting base reg %d\n", 3869 error, vie->base_register); 3870 return (-1); 3871 } 3872 3873 /* 3874 * RIP-relative addressing starts from the following 3875 * instruction 3876 */ 3877 if (vie->base_register == VM_REG_GUEST_RIP) 3878 base += vie->num_processed; 3879 } 3880 3881 idx = 0; 3882 if (vie->index_register != VM_REG_LAST) { 3883 error = vm_get_register(vm, cpuid, vie->index_register, &idx); 3884 if (error) { 3885 printf("verify_gla: error %d getting index reg %d\n", 3886 error, vie->index_register); 3887 return (-1); 3888 } 3889 } 3890 3891 /* 3892 * From "Specifying a Segment Selector", Intel SDM, Vol 1 3893 * 3894 * In 64-bit mode, segmentation is generally (but not 3895 * completely) disabled. The exceptions are the FS and GS 3896 * segments. 3897 * 3898 * In legacy IA-32 mode, when the ESP or EBP register is used 3899 * as the base, the SS segment is the default segment. For 3900 * other data references, except when relative to stack or 3901 * string destination the DS segment is the default. These 3902 * can be overridden to allow other segments to be accessed. 
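 *
 * For example, an access based on %ebp or %esp is checked against %ss
 * below, while one based on %ebx defaults to %ds unless an override
 * prefix (e.g. 0x26 for %es) selected another segment.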
3903 */ 3904 if (vie->segment_override) { 3905 seg = vie->segment_register; 3906 } else if (vie->base_register == VM_REG_GUEST_RSP || 3907 vie->base_register == VM_REG_GUEST_RBP) { 3908 seg = VM_REG_GUEST_SS; 3909 } else { 3910 seg = VM_REG_GUEST_DS; 3911 } 3912 if (vie->paging.cpu_mode == CPU_MODE_64BIT && 3913 seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) { 3914 segbase = 0; 3915 } else { 3916 error = vm_get_seg_desc(vm, cpuid, seg, &desc); 3917 if (error) { 3918 printf("verify_gla: error %d getting segment" 3919 " descriptor %d", error, vie->segment_register); 3920 return (-1); 3921 } 3922 segbase = desc.base; 3923 } 3924 3925 gla2 = segbase + base + vie->scale * idx + vie->displacement; 3926 gla2 &= size2mask[vie->addrsize]; 3927 if (gla != gla2) { 3928 printf("verify_gla mismatch: segbase(0x%0lx)" 3929 "base(0x%0lx), scale(%d), index(0x%0lx), " 3930 "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", 3931 segbase, base, vie->scale, idx, vie->displacement, 3932 gla, gla2); 3933 return (-1); 3934 } 3935 3936 return (0); 3937 } 3938 3939 int 3940 vie_decode_instruction(struct vie *vie, struct vm *vm, int cpuid, int cs_d) 3941 { 3942 enum vm_cpu_mode cpu_mode; 3943 3944 if ((vie->status & VIES_INST_FETCH) == 0) { 3945 return (EINVAL); 3946 } 3947 3948 cpu_mode = vie->paging.cpu_mode; 3949 3950 if (decode_prefixes(vie, cpu_mode, cs_d)) 3951 return (-1); 3952 3953 if (decode_opcode(vie)) 3954 return (-1); 3955 3956 if (decode_modrm(vie, cpu_mode)) 3957 return (-1); 3958 3959 if (decode_sib(vie)) 3960 return (-1); 3961 3962 if (decode_displacement(vie)) 3963 return (-1); 3964 3965 if (decode_immediate(vie)) 3966 return (-1); 3967 3968 if (decode_moffset(vie)) 3969 return (-1); 3970 3971 vie->status |= VIES_INST_DECODE; 3972 3973 return (0); 3974 } 3975
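/*
 * Illustrative example of the decode flow above (byte values are for
 * demonstration only): in a 64-bit guest, the sequence 48 89 08, i.e.
 * 'mov %rcx, (%rax)', is consumed as a REX.W prefix (rex_w = 1, giving
 * opsize = 8 and addrsize = 8), the one-byte MOV opcode 0x89, and a
 * ModRM byte of 0x08 (mod = 0, reg = 1 -> %rcx, rm = 0 -> base %rax),
 * with no SIB byte, displacement, or immediate.
 */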