/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2012 Sandvine, Inc.
 * Copyright (c) 2012 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL
#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#else	/* !_KERNEL */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/_iovec.h>

#include <machine/vmm.h>

#include <assert.h>
#include <vmmapi.h>
#define	KASSERT(exp,msg)	assert((exp))
#endif	/* _KERNEL */

#include <machine/vmm_instruction_emul.h>
#include <x86/psl.h>
#include <x86/specialreg.h>

/* struct vie_op.op_type */
enum {
	VIE_OP_TYPE_NONE = 0,
	VIE_OP_TYPE_MOV,
	VIE_OP_TYPE_MOVSX,
	VIE_OP_TYPE_MOVZX,
	VIE_OP_TYPE_AND,
	VIE_OP_TYPE_OR,
	VIE_OP_TYPE_SUB,
	VIE_OP_TYPE_TWO_BYTE,
	VIE_OP_TYPE_PUSH,
	VIE_OP_TYPE_CMP,
	VIE_OP_TYPE_POP,
	VIE_OP_TYPE_MOVS,
	VIE_OP_TYPE_GROUP1,
	VIE_OP_TYPE_STOS,
	VIE_OP_TYPE_BITTEST,
	VIE_OP_TYPE_TWOB_GRP15,
	VIE_OP_TYPE_LAST
};

/* struct vie_op.op_flags */
#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
#define	VIE_OP_F_NO_MODRM	(1 << 3)
#define	VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)

static const struct vie_op two_byte_opcodes[256] = {
	[0xAE] = {
		.op_byte = 0xAE,
		.op_type = VIE_OP_TYPE_TWOB_GRP15,
	},
	[0xB6] = {
		.op_byte = 0xB6,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xB7] = {
		.op_byte = 0xB7,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xBA] = {
		.op_byte = 0xBA,
		.op_type = VIE_OP_TYPE_BITTEST,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xBE] = {
		.op_byte = 0xBE,
		.op_type = VIE_OP_TYPE_MOVSX,
	},
};

static const struct vie_op one_byte_opcodes[256] = {
	[0x0F] = {
		.op_byte = 0x0F,
		.op_type = VIE_OP_TYPE_TWO_BYTE
	},
	[0x0B] = {
		.op_byte = 0x0B,
		.op_type = VIE_OP_TYPE_OR,
	},
	[0x2B] = {
		.op_byte = 0x2B,
		.op_type = VIE_OP_TYPE_SUB,
	},
	[0x39] = {
		.op_byte = 0x39,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x3B] = {
		.op_byte = 0x3B,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x88] = {
		.op_byte = 0x88,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x89] = {
		.op_byte = 0x89,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8A] = {
		.op_byte = 0x8A,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8B] = {
		.op_byte = 0x8B,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0xA1] = {
		.op_byte = 0xA1,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA3] = {
		.op_byte = 0xA3,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA4] = {
		.op_byte = 0xA4,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xA5] = {
		.op_byte = 0xA5,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAA] = {
		.op_byte = 0xAA,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAB] = {
		.op_byte = 0xAB,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xC6] = {
		/* XXX Group 11 extended opcode - not just MOV */
		.op_byte = 0xC6,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xC7] = {
		.op_byte = 0xC7,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x23] = {
		.op_byte = 0x23,
		.op_type = VIE_OP_TYPE_AND,
	},
	[0x80] = {
		/* Group 1 extended opcode */
		.op_byte = 0x80,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x81] = {
		/* Group 1 extended opcode */
		.op_byte = 0x81,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x83] = {
		/* Group 1 extended opcode */
		.op_byte = 0x83,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x8F] = {
		/* XXX Group 1A extended opcode - not just POP */
		.op_byte = 0x8F,
		.op_type = VIE_OP_TYPE_POP,
	},
	[0xFF] = {
		/* XXX Group 5 extended opcode - not just PUSH */
		.op_byte = 0xFF,
		.op_type = VIE_OP_TYPE_PUSH,
	}
};

/* struct vie.mod */
#define	VIE_MOD_INDIRECT	0
#define	VIE_MOD_INDIRECT_DISP8	1
#define	VIE_MOD_INDIRECT_DISP32	2
#define	VIE_MOD_DIRECT		3

/* struct vie.rm */
#define	VIE_RM_SIB		4
#define	VIE_RM_DISP32		5

#define	GB			(1024 * 1024 * 1024)

static enum vm_reg_name gpr_map[16] = {
	VM_REG_GUEST_RAX,
	VM_REG_GUEST_RCX,
	VM_REG_GUEST_RDX,
	VM_REG_GUEST_RBX,
	VM_REG_GUEST_RSP,
	VM_REG_GUEST_RBP,
	VM_REG_GUEST_RSI,
	VM_REG_GUEST_RDI,
	VM_REG_GUEST_R8,
	VM_REG_GUEST_R9,
	VM_REG_GUEST_R10,
	VM_REG_GUEST_R11,
	VM_REG_GUEST_R12,
	VM_REG_GUEST_R13,
	VM_REG_GUEST_R14,
	VM_REG_GUEST_R15
};

static uint64_t size2mask[] = {
	[1] = 0xff,
	[2] = 0xffff,
	[4] = 0xffffffff,
	[8] = 0xffffffffffffffff,
};

static int
vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
{
	int error;

	error = vm_get_register(vm, vcpuid, reg, rval);

	return (error);
}
static void
vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
{
	*lhbr = 0;
	*reg = gpr_map[vie->reg];

	/*
	 * 64-bit mode imposes limitations on accessing legacy high byte
	 * registers (lhbr).
	 *
	 * The legacy high-byte registers cannot be addressed if the REX
	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
	 *
	 * If the REX prefix is not present then the values 4, 5, 6 and 7
	 * of the 'ModRM:reg' field address the legacy high-byte registers,
	 * %ah, %ch, %dh and %bh respectively.
	 */
	if (!vie->rex_present) {
		if (vie->reg & 0x4) {
			*lhbr = 1;
			*reg = gpr_map[vie->reg & 0x3];
		}
	}
}

static int
vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
{
	uint64_t val;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &val);

	/*
	 * To obtain the value of a legacy high byte register shift the
	 * base register right by 8 bits (%ah = %rax >> 8).
	 */
	if (lhbr)
		*rval = val >> 8;
	else
		*rval = val;
	return (error);
}

static int
vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
{
	uint64_t origval, val, mask;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &origval);
	if (error == 0) {
		val = byte;
		mask = 0xff;
		if (lhbr) {
			/*
			 * Shift left by 8 to store 'byte' in a legacy high
			 * byte register.
			 */
			val <<= 8;
			mask <<= 8;
		}
		val |= origval & ~mask;
		error = vm_set_register(vm, vcpuid, reg, val);
	}
	return (error);
}

int
vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
    uint64_t val, int size)
{
	int error;
	uint64_t origval;

	switch (size) {
	case 1:
	case 2:
		error = vie_read_register(vm, vcpuid, reg, &origval);
		if (error)
			return (error);
		val &= size2mask[size];
		val |= origval & ~size2mask[size];
		break;
	case 4:
		val &= 0xffffffffUL;
		break;
	case 8:
		break;
	default:
		return (EINVAL);
	}

	error = vm_set_register(vm, vcpuid, reg, val);
	return (error);
}
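/*
 * For illustration: a 2-byte update of a register holding
 * 0xffffffffffffffff with the value 0x1234 produces 0xffffffffffff1234,
 * while a 4-byte update with 0x12345678 produces 0x0000000012345678,
 * matching the x86 convention that 32-bit writes zero-extend to 64 bits.
 */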
#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)

/*
 * Return the status flags that would result from doing (x - y).
 */
#define	GETCC(sz)							\
static u_long								\
getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETCC(8);
GETCC(16);
GETCC(32);
GETCC(64);

static u_long
getcc(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getcc: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getcc8(x, y));
	else if (opsize == 2)
		return (getcc16(x, y));
	else if (opsize == 4)
		return (getcc32(x, y));
	else
		return (getcc64(x, y));
}
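/*
 * For illustration, GETCC(32) expands to roughly:
 *
 *	static u_long
 *	getcc32(uint32_t x, uint32_t y)
 *	{
 *		u_long rflags;
 *
 *		__asm __volatile("sub %2,%1; pushfq; popq %0" :
 *		    "=r" (rflags), "+r" (x) : "m" (y));
 *		return (rflags);
 *	}
 *
 * i.e. the host CPU performs the subtraction itself and the resulting
 * %rflags value is captured with pushfq/popq.
 */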
static int
emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint8_t byte;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x88:
		/*
		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 88/r:	mov r/m8, r8
		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
		 */
		size = 1;	/* override for byte operation */
		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
		if (error == 0)
			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
		break;
	case 0x89:
		/*
		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 89/r:	mov r/m16, r16
		 * 89/r:	mov r/m32, r32
		 * REX.W + 89/r	mov r/m64, r64
		 */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vm, vcpuid, gpa, val, size, arg);
		}
		break;
	case 0x8A:
		/*
		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8A/r:	mov r8, r/m8
		 * REX + 8A/r:	mov r8, r/m8
		 */
		size = 1;	/* override for byte operation */
		error = memread(vm, vcpuid, gpa, &val, size, arg);
		if (error == 0)
			error = vie_write_bytereg(vm, vcpuid, vie, val);
		break;
	case 0x8B:
		/*
		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8B/r:	mov r16, r/m16
		 * 8B/r:	mov r32, r/m32
		 * REX.W 8B/r:	mov r64, r/m64
		 */
		error = memread(vm, vcpuid, gpa, &val, size, arg);
		if (error == 0) {
			reg = gpr_map[vie->reg];
			error = vie_update_register(vm, vcpuid, reg, val, size);
		}
		break;
	case 0xA1:
		/*
		 * MOV from seg:moffset to AX/EAX/RAX
		 * A1:		mov AX, moffs16
		 * A1:		mov EAX, moffs32
		 * REX.W + A1:	mov RAX, moffs64
		 */
		error = memread(vm, vcpuid, gpa, &val, size, arg);
		if (error == 0) {
			reg = VM_REG_GUEST_RAX;
			error = vie_update_register(vm, vcpuid, reg, val, size);
		}
		break;
	case 0xA3:
		/*
		 * MOV from AX/EAX/RAX to seg:moffset
		 * A3:		mov moffs16, AX
		 * A3:		mov moffs32, EAX
		 * REX.W + A3:	mov moffs64, RAX
		 */
		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vm, vcpuid, gpa, val, size, arg);
		}
		break;
	case 0xC6:
		/*
		 * MOV from imm8 to mem (ModRM:r/m)
		 * C6/0		mov r/m8, imm8
		 * REX + C6/0	mov r/m8, imm8
		 */
		size = 1;	/* override for byte operation */
		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
		break;
	case 0xC7:
		/*
		 * MOV from imm16/imm32 to mem (ModRM:r/m)
		 * C7/0		mov r/m16, imm16
		 * C7/0		mov r/m32, imm32
		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
		 */
		val = vie->immediate & size2mask[size];
		error = memwrite(vm, vcpuid, gpa, val, size, arg);
		break;
	default:
		break;
	}

	return (error);
}

static int
emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite,
    void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xB6:
		/*
		 * MOV and zero extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B6/r		movzx r16, r/m8
		 * 0F B6/r		movzx r32, r/m8
		 * REX.W + 0F B6/r	movzx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* zero-extend byte */
		val = (uint8_t)val;

		/* write the result */
		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	case 0xB7:
		/*
		 * MOV and zero extend word from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B7/r		movzx r32, r/m16
		 * REX.W + 0F B7/r	movzx r64, r/m16
		 */
		error = memread(vm, vcpuid, gpa, &val, 2, arg);
		if (error)
			return (error);

		reg = gpr_map[vie->reg];

		/* zero-extend word */
		val = (uint16_t)val;

		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	case 0xBE:
		/*
		 * MOV and sign extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F BE/r		movsx r16, r/m8
		 * 0F BE/r		movsx r32, r/m8
		 * REX.W + 0F BE/r	movsx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* sign extend byte */
		val = (int8_t)val;

		/* write the result */
		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	default:
		break;
	}
	return (error);
}
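/*
 * For illustration: given a memory byte of 0x80, case 0xB6 (movzx) hands
 * vie_update_register() the value 0x0000000000000080 while case 0xBE
 * (movsx) hands it 0xffffffffffffff80; the register write is then
 * masked to vie->opsize.
 */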
/*
 * Helper function to calculate and validate a linear address.
 */
static int
get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
    int opsize, int addrsize, int prot, enum vm_reg_name seg,
    enum vm_reg_name gpr, uint64_t *gla, int *fault)
{
	struct seg_desc desc;
	uint64_t cr0, val, rflags;
	int error;

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
	    __func__, error, seg));

	error = vie_read_register(vm, vcpuid, gpr, &val);
	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
	    error, gpr));

	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
	    addrsize, prot, gla)) {
		if (seg == VM_REG_GUEST_SS)
			vm_inject_ss(vm, vcpuid, 0);
		else
			vm_inject_gp(vm, vcpuid);
		goto guest_fault;
	}

	if (vie_canonical_check(paging->cpu_mode, *gla)) {
		if (seg == VM_REG_GUEST_SS)
			vm_inject_ss(vm, vcpuid, 0);
		else
			vm_inject_gp(vm, vcpuid);
		goto guest_fault;
	}

	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
		vm_inject_ac(vm, vcpuid, 0);
		goto guest_fault;
	}

	*fault = 0;
	return (0);

guest_fault:
	*fault = 1;
	return (0);
}

static int
emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
#ifdef _KERNEL
	struct vm_copyinfo copyinfo[2];
#else
	struct iovec copyinfo[2];
#endif
	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
	uint64_t rcx, rdi, rsi, rflags;
	int error, fault, opsize, seg, repeat;

	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
	val = 0;
	error = 0;

	/*
	 * XXX although the MOVS instruction is only supposed to be used with
	 * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
	 *
	 * Empirically the "repnz" prefix has identical behavior to "rep"
	 * and the zero flag does not make a difference.
	 */
	repeat = vie->repz_present | vie->repnz_present;

	if (repeat) {
		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));

		/*
		 * The count register is %rcx, %ecx or %cx depending on the
		 * address size of the instruction.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
			error = 0;
			goto done;
		}
	}

	/*
	 *	Source		Destination	Comments
	 *	--------------------------------------------
	 * (1)  memory		memory		n/a
	 * (2)  memory		mmio		emulated
	 * (3)  mmio		memory		emulated
	 * (4)  mmio		mmio		emulated
	 *
	 * At this point we don't have sufficient information to distinguish
	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
	 * out because it will succeed only when operating on regular memory.
	 *
	 * XXX the emulation doesn't properly handle the case where 'gpa'
	 * is straddling the boundary between the normal memory and MMIO.
	 */
	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
	error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
	if (error || fault)
		goto done;

	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
	    copyinfo, nitems(copyinfo), &fault);
	if (error == 0) {
		if (fault)
			goto done;	/* Resume guest to handle fault */

		/*
		 * case (2): read from system memory and write to mmio.
		 */
		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
		error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
		if (error)
			goto done;
	} else {
		/*
		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
		 * if 'srcaddr' is in the mmio space.
		 */

		error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
		    &fault);
		if (error || fault)
			goto done;

		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
		if (error == 0) {
			if (fault)
				goto done;	/* Resume guest to handle fault */

			/*
			 * case (3): read from MMIO and write to system memory.
			 *
			 * A MMIO read can have side-effects so we
			 * commit to it only after vm_copy_setup() is
			 * successful. If a page-fault needs to be
			 * injected into the guest then it will happen
			 * before the MMIO read is attempted.
			 */
			error = memread(vm, vcpuid, gpa, &val, opsize, arg);
			if (error)
				goto done;

			vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
			vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
		} else {
			/*
			 * Case (4): read from and write to mmio.
			 *
			 * Commit to the MMIO read/write (with potential
			 * side-effects) only after we are sure that the
			 * instruction is not going to be restarted due
			 * to address translation faults.
			 */
			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
			    PROT_READ, &srcgpa, &fault);
			if (error || fault)
				goto done;

			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
			    PROT_WRITE, &dstgpa, &fault);
			if (error || fault)
				goto done;

			error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
			if (error)
				goto done;

			error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg);
			if (error)
				goto done;
		}
	}

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	if (rflags & PSL_D) {
		rsi -= opsize;
		rdi -= opsize;
	} else {
		rsi += opsize;
		rdi += opsize;
	}

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));

	if (repeat) {
		rcx = rcx - 1;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
		    rcx, vie->addrsize);
		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));

		/*
		 * Repeat the instruction if the count register is not zero.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
			vm_restart_instruction(vm, vcpuid);
	}
done:
	KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
	    __func__, error));
	return (error);
}
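/*
 * For illustration: a "rep movsb" with %rcx = 2 and PSL_D clear makes two
 * trips through this emulation; each trip copies one byte, advances %rsi
 * and %rdi by 1, decrements %rcx and calls vm_restart_instruction()
 * until the count reaches zero.
 */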
static int
emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
	int error, opsize, repeat;
	uint64_t val;
	uint64_t rcx, rdi, rflags;

	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
	repeat = vie->repz_present | vie->repnz_present;

	if (repeat) {
		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));

		/*
		 * The count register is %rcx, %ecx or %cx depending on the
		 * address size of the instruction.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
			return (0);
	}

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
	KASSERT(!error, ("%s: error %d getting rax", __func__, error));

	error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
	if (error)
		return (error);

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	if (rflags & PSL_D)
		rdi -= opsize;
	else
		rdi += opsize;

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));

	if (repeat) {
		rcx = rcx - 1;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
		    rcx, vie->addrsize);
		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));

		/*
		 * Repeat the instruction if the count register is not zero.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
			vm_restart_instruction(vm, vcpuid);
	}

	return (0);
}

static int
emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t result, rflags, rflags2, val1, val2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x23:
		/*
		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
		 * result in reg.
		 *
		 * 23/r		and r16, r/m16
		 * 23/r		and r32, r/m32
		 * REX.W + 23/r	and r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vm, vcpuid, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		result = val1 & val2;
		error = vie_update_register(vm, vcpuid, reg, result, size);
		break;
	case 0x81:
	case 0x83:
		/*
		 * AND mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 81 /4		and r/m16, imm16
		 * 81 /4		and r/m32, imm32
		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
		 *
		 * 83 /4		and r/m16, imm8 sign-extended to 16
		 * 83 /4		and r/m32, imm8 sign-extended to 32
		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val1, size, arg);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		result = val1 & vie->immediate;
		error = memwrite(vm, vcpuid, gpa, result, size, arg);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from 'result'.
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}
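/*
 * For illustration: getcc(size, result, 0) computes the flags of
 * "result - 0", whose ZF, SF and PF reflect 'result' itself; only those
 * three bits are merged above, so OF and CF end up cleared as the AND
 * semantics require.
 */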
static int
emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t result, rflags, rflags2, val1, val2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x0B:
		/*
		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
		 * result in reg.
		 *
		 * 0b/r		or r16, r/m16
		 * 0b/r		or r32, r/m32
		 * REX.W + 0b/r	or r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vm, vcpuid, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		result = val1 | val2;
		error = vie_update_register(vm, vcpuid, reg, result, size);
		break;
	case 0x81:
	case 0x83:
		/*
		 * OR mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 81 /1		or r/m16, imm16
		 * 81 /1		or r/m32, imm32
		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
		 *
		 * 83 /1		or r/m16, imm8 sign-extended to 16
		 * 83 /1		or r/m32, imm8 sign-extended to 32
		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val1, size, arg);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		result = val1 | vie->immediate;
		error = memwrite(vm, vcpuid, gpa, result, size, arg);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from 'result'.
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	uint64_t regop, memop, op1, op2, rflags, rflags2;
	enum vm_reg_name reg;

	size = vie->opsize;
	switch (vie->op.op_byte) {
	case 0x39:
	case 0x3B:
		/*
		 * 39/r		CMP r/m16, r16
		 * 39/r		CMP r/m32, r32
		 * REX.W 39/r	CMP r/m64, r64
		 *
		 * 3B/r		CMP r16, r/m16
		 * 3B/r		CMP r32, r/m32
		 * REX.W + 3B/r	CMP r64, r/m64
		 *
		 * Compare the first operand with the second operand and
		 * set status flags in EFLAGS register. The comparison is
		 * performed by subtracting the second operand from the first
		 * operand and then setting the status flags.
		 */
		/* Get the register operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &regop);
		if (error)
			return (error);

		/* Get the memory operand */
		error = memread(vm, vcpuid, gpa, &memop, size, arg);
		if (error)
			return (error);

		if (vie->op.op_byte == 0x3B) {
			op1 = regop;
			op2 = memop;
		} else {
			op1 = memop;
			op2 = regop;
		}
		rflags2 = getcc(size, op1, op2);
		break;
	case 0x80:
	case 0x81:
	case 0x83:
		/*
		 * 80 /7		cmp r/m8, imm8
		 * REX + 80 /7		cmp r/m8, imm8
		 *
		 * 81 /7		cmp r/m16, imm16
		 * 81 /7		cmp r/m32, imm32
		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
		 *
		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
		 *
		 * Compare mem (ModRM:r/m) with immediate and set
		 * status flags according to the results. The
		 * comparison is performed by subtracting the
		 * immediate from the first operand and then setting
		 * the status flags.
		 *
		 */
		if (vie->op.op_byte == 0x80)
			size = 1;

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &op1, size, arg);
		if (error)
			return (error);

		rflags2 = getcc(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & RFLAGS_STATUS_BITS;

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	uint64_t nval, rflags, rflags2, val1, val2;
	enum vm_reg_name reg;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x2B:
		/*
		 * SUB r/m from r and store the result in r
		 *
		 * 2B/r		SUB r16, r/m16
		 * 2B/r		SUB r32, r/m32
		 * REX.W + 2B/r	SUB r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vm, vcpuid, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		nval = val1 - val2;
		error = vie_update_register(vm, vcpuid, reg, nval, size);
		break;
	default:
		break;
	}

	if (!error) {
		rflags2 = getcc(size, val1, val2);
		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    &rflags);
		if (error)
			return (error);

		rflags &= ~RFLAGS_STATUS_BITS;
		rflags |= rflags2 & RFLAGS_STATUS_BITS;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    rflags, 8);
	}

	return (error);
}
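/*
 * For illustration: a 2-byte SUB of 0x0002 from 0x0001 produces 0xffff,
 * and getcc(2, 0x0001, 0x0002) reports CF and SF set with ZF clear,
 * exactly the flags the guest would see from executing the instruction
 * natively.
 */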
static int
emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
#ifdef _KERNEL
	struct vm_copyinfo copyinfo[2];
#else
	struct iovec copyinfo[2];
#endif
	struct seg_desc ss_desc;
	uint64_t cr0, rflags, rsp, stack_gla, val;
	int error, fault, size, stackaddrsize, pushop;

	val = 0;
	size = vie->opsize;
	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;

	/*
	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
	 */
	if (paging->cpu_mode == CPU_MODE_REAL) {
		stackaddrsize = 2;
	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
		/*
		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
		 * - Stack pointer size is always 64-bits.
		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
		 * - 16-bit PUSH/POP is supported by using the operand size
		 *   override prefix (66H).
		 */
		stackaddrsize = 8;
		size = vie->opsize_override ? 2 : 8;
	} else {
		/*
		 * In protected or compatibility mode the 'B' flag in the
		 * stack-segment descriptor determines the size of the
		 * stack pointer.
		 */
		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
		    __func__, error));
		if (SEG_DESC_DEF32(ss_desc.access))
			stackaddrsize = 4;
		else
			stackaddrsize = 2;
	}

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
	if (pushop) {
		rsp -= size;
	}

	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
	    &stack_gla)) {
		vm_inject_ss(vm, vcpuid, 0);
		return (0);
	}

	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
		vm_inject_ss(vm, vcpuid, 0);
		return (0);
	}

	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
		vm_inject_ac(vm, vcpuid, 0);
		return (0);
	}

	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
	    &fault);
	if (error || fault)
		return (error);

	if (pushop) {
		error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
		if (error == 0)
			vm_copyout(vm, vcpuid, &val, copyinfo, size);
	} else {
		vm_copyin(vm, vcpuid, copyinfo, &val, size);
		error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
		rsp += size;
	}
	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));

	if (error == 0) {
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
		    stackaddrsize);
		KASSERT(error == 0, ("error %d updating rsp", error));
	}
	return (error);
}
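/*
 * For illustration: a PUSH of a 64-bit MMIO word in long mode drops %rsp
 * by 8 before the stack GLA is computed, while the same instruction with
 * a 66H prefix stores only 2 bytes; a POP instead adds 'size' to %rsp
 * after the stack read.
 */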
static int
emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
	int error;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * PUSH is part of the group 5 extended opcodes and is identified
	 * by ModRM:reg = b110.
	 */
	if ((vie->reg & 7) != 6)
		return (EINVAL);

	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
	    memwrite, arg);
	return (error);
}

static int
emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
	int error;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * POP is part of the group 1A extended opcodes and is identified
	 * by ModRM:reg = b000.
	 */
	if ((vie->reg & 7) != 0)
		return (EINVAL);

	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
	    memwrite, arg);
	return (error);
}

static int
emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *memarg)
{
	int error;

	switch (vie->reg & 7) {
	case 0x1:	/* OR */
		error = emulate_or(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case 0x4:	/* AND */
		error = emulate_and(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case 0x7:	/* CMP */
		error = emulate_cmp(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

static int
emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
{
	uint64_t val, rflags;
	int error, bitmask, bitoff;

	/*
	 * 0F BA is a Group 8 extended opcode.
	 *
	 * Currently we only emulate the 'Bit Test' instruction which is
	 * identified by a ModR/M:reg encoding of 100b.
	 */
	if ((vie->reg & 7) != 4)
		return (EINVAL);

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg);
	if (error)
		return (error);

	/*
	 * Intel SDM, Vol 2, Table 3-2:
	 * "Range of Bit Positions Specified by Bit Offset Operands"
	 */
	bitmask = vie->opsize * 8 - 1;
	bitoff = vie->immediate & bitmask;

	/* Copy the bit into the Carry flag in %rflags */
	if (val & (1UL << bitoff))
		rflags |= PSL_C;
	else
		rflags &= ~PSL_C;

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));

	return (0);
}
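/*
 * For illustration: with a 4-byte operand the immediate bit offset is
 * taken modulo 32, so "bt $35, mem" tests bit 3 (35 & 31) of the 32-bit
 * value read from 'gpa'.
 */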
static int
emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
{
	int error;
	uint64_t buf;

	switch (vie->reg & 7) {
	case 0x7:	/* CLFLUSH, CLFLUSHOPT, and SFENCE */
		if (vie->mod == 0x3) {
			/*
			 * SFENCE. Ignore it, VM exit provides enough
			 * barriers on its own.
			 */
			error = 0;
		} else {
			/*
			 * CLFLUSH, CLFLUSHOPT. Only check for access
			 * rights.
			 */
			error = memread(vm, vcpuid, gpa, &buf, 1, memarg);
		}
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *memarg)
{
	int error;

	if (!vie->decoded)
		return (EINVAL);

	switch (vie->op.op_type) {
	case VIE_OP_TYPE_GROUP1:
		error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_POP:
		error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_PUSH:
		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_CMP:
		error = emulate_cmp(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOV:
		error = emulate_mov(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOVSX:
	case VIE_OP_TYPE_MOVZX:
		error = emulate_movx(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOVS:
		error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_STOS:
		error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_AND:
		error = emulate_and(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_OR:
		error = emulate_or(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_SUB:
		error = emulate_sub(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_BITTEST:
		error = emulate_bittest(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_TWOB_GRP15:
		error = emulate_twob_group15(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("%s: invalid size %d", __func__, size));
	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));

	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
		return (0);

	return ((gla & (size - 1)) ? 1 : 0);
}

int
vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
{
	uint64_t mask;

	if (cpu_mode != CPU_MODE_64BIT)
		return (0);

	/*
	 * The value of the bit 47 in the 'gla' should be replicated in the
	 * most significant 16 bits.
	 */
	mask = ~((1UL << 48) - 1);
	if (gla & (1UL << 47))
		return ((gla & mask) != mask);
	else
		return ((gla & mask) != 0);
}
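/*
 * For illustration: 0x00007fffffffffff and 0xffff800000000000 are both
 * canonical (bits 63:48 replicate bit 47), whereas 0x0000800000000000 is
 * not and would fail the check above.
 */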
uint64_t
vie_size2mask(int size)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("vie_size2mask: invalid size %d", size));
	return (size2mask[size]);
}

int
vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
	    ("%s: invalid segment %d", __func__, seg));
	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
	    ("%s: invalid operand size %d", __func__, length));
	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
	    ("%s: invalid prot %#x", __func__, prot));

	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
		glasize = 8;
	} else {
		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
		glasize = 4;
		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		KASSERT(SEG_DESC_PRESENT(desc->access),
		    ("segment %d not present: %#x", seg, desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
		    "descriptor type %#x", seg, type));

		if (prot & PROT_READ) {
			/* #GP on a read access to an exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)			/* code segment */
				return (-1);

			if ((type & 0xA) == 0)		/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

		while (length > 0) {
			offset &= vie_size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}

	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= vie_size2mask(addrsize);
	*gla = (segbase + firstoff) & vie_size2mask(glasize);
	return (0);
}
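/*
 * For illustration: with a 2-byte address size the effective offset
 * wraps at 64KB, so a segment base of 0x10000 and an offset of 0x1ffff
 * yield a linear address of 0x1ffff (0x10000 + 0xffff), not 0x2ffff.
 */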
#ifdef _KERNEL
void
vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
{
	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
	    ("%s: invalid instruction length (%d)", __func__, inst_length));

	bzero(vie, sizeof(struct vie));

	vie->base_register = VM_REG_LAST;
	vie->index_register = VM_REG_LAST;
	vie->segment_register = VM_REG_LAST;

	if (inst_length) {
		bcopy(inst_bytes, vie->inst, inst_length);
		vie->num_valid = inst_length;
	}
}

static int
pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
{
	int error_code = 0;

	if (pte & PG_V)
		error_code |= PGEX_P;
	if (prot & VM_PROT_WRITE)
		error_code |= PGEX_W;
	if (usermode)
		error_code |= PGEX_U;
	if (rsvd)
		error_code |= PGEX_RSV;
	if (prot & VM_PROT_EXECUTE)
		error_code |= PGEX_I;

	return (error_code);
}
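/*
 * For illustration: a user-mode write that faults on a present,
 * read-only page yields PGEX_P | PGEX_W | PGEX_U, the same error code
 * the guest's own MMU would push for that access.
 */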
static void
ptp_release(void **cookie)
{
	if (*cookie != NULL) {
		vm_gpa_release(*cookie);
		*cookie = NULL;
	}
}

static void *
ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
{
	void *ptr;

	ptp_release(cookie);
	ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, VM_PROT_RW, cookie);
	return (ptr);
}

static int
_vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
{
	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
	u_int retries;
	uint64_t *ptpbase, ptpphys, pte, pgsize;
	uint32_t *ptpbase32, pte32;
	void *cookie;

	*guest_fault = 0;

	usermode = (paging->cpl == 3 ? 1 : 0);
	writable = prot & VM_PROT_WRITE;
	cookie = NULL;
	retval = 0;
	retries = 0;
restart:
	ptpphys = paging->cr3;		/* root of the page tables */
	ptp_release(&cookie);
	if (retries++ > 0)
		maybe_yield();

	if (vie_canonical_check(paging->cpu_mode, gla)) {
		/*
		 * XXX assuming a non-stack reference otherwise a stack fault
		 * should be generated.
		 */
		if (!check_only)
			vm_inject_gp(vm, vcpuid);
		goto fault;
	}

	if (paging->paging_mode == PAGING_MODE_FLAT) {
		*gpa = gla;
		goto done;
	}

	if (paging->paging_mode == PAGING_MODE_32) {
		nlevels = 2;
		while (--nlevels >= 0) {
			/* Zero out the lower 12 bits. */
			ptpphys &= ~0xfff;

			ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE,
			    &cookie);

			if (ptpbase32 == NULL)
				goto error;

			ptpshift = PAGE_SHIFT + nlevels * 10;
			ptpindex = (gla >> ptpshift) & 0x3FF;
			pgsize = 1UL << ptpshift;

			pte32 = ptpbase32[ptpindex];

			if ((pte32 & PG_V) == 0 ||
			    (usermode && (pte32 & PG_U) == 0) ||
			    (writable && (pte32 & PG_RW) == 0)) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot,
					    0, pte32);
					vm_inject_pf(vm, vcpuid, pfcode, gla);
				}
				goto fault;
			}

			/*
			 * Emulate the x86 MMU's management of the accessed
			 * and dirty flags. While the accessed flag is set
			 * at every level of the page table, the dirty flag
			 * is only set at the last level providing the guest
			 * physical address.
			 */
			if (!check_only && (pte32 & PG_A) == 0) {
				if (atomic_cmpset_32(&ptpbase32[ptpindex],
				    pte32, pte32 | PG_A) == 0) {
					goto restart;
				}
			}

			/* XXX must be ignored if CR4.PSE=0 */
			if (nlevels > 0 && (pte32 & PG_PS) != 0)
				break;

			ptpphys = pte32;
		}

		/* Set the dirty bit in the page table entry if necessary */
		if (!check_only && writable && (pte32 & PG_M) == 0) {
			if (atomic_cmpset_32(&ptpbase32[ptpindex],
			    pte32, pte32 | PG_M) == 0) {
				goto restart;
			}
		}

		/* Zero out the lower 'ptpshift' bits */
		pte32 >>= ptpshift; pte32 <<= ptpshift;
		*gpa = pte32 | (gla & (pgsize - 1));
		goto done;
	}

	if (paging->paging_mode == PAGING_MODE_PAE) {
		/* Zero out the lower 5 bits and the upper 32 bits */
		ptpphys &= 0xffffffe0UL;

		ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof(*ptpbase) * 4,
		    &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpindex = (gla >> 30) & 0x3;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
			}
			goto fault;
		}

		ptpphys = pte;

		nlevels = 2;
	} else
		nlevels = 4;
	while (--nlevels >= 0) {
		/* Zero out the lower 12 bits and the upper 12 bits */
		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;

		ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gla >> ptpshift) & 0x1FF;
		pgsize = 1UL << ptpshift;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0 ||
		    (usermode && (pte & PG_U) == 0) ||
		    (writable && (pte & PG_RW) == 0)) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
			}
			goto fault;
		}

		/* Set the accessed bit in the page table entry */
		if (!check_only && (pte & PG_A) == 0) {
			if (atomic_cmpset_64(&ptpbase[ptpindex],
			    pte, pte | PG_A) == 0) {
				goto restart;
			}
		}

		if (nlevels > 0 && (pte & PG_PS) != 0) {
			if (pgsize > 1 * GB) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot,
					    1, pte);
					vm_inject_pf(vm, vcpuid, pfcode, gla);
				}
				goto fault;
			}
			break;
		}

		ptpphys = pte;
	}

	/* Set the dirty bit in the page table entry if necessary */
	if (!check_only && writable && (pte & PG_M) == 0) {
		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
			goto restart;
	}

	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
	*gpa = pte | (gla & (pgsize - 1));
done:
	ptp_release(&cookie);
	KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
	    __func__, retval));
	return (retval);
error:
	retval = EFAULT;
	goto done;
fault:
	*guest_fault = 1;
	goto done;
}
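/*
 * For illustration: in 4-level mode the walk above consumes the guest
 * linear address as four 9-bit indices plus a 12-bit page offset
 * (ptpshift = 12 + nlevels * 9), so a 2MB superpage hit at nlevels=1
 * translates with pgsize = 1UL << 21.
 */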
int
vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{

	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
	    false));
}

int
vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{

	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
	    true));
}

int
vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
{
	struct vm_copyinfo copyinfo[2];
	int error, prot;

	if (inst_length > VIE_INST_SIZE)
		panic("vmm_fetch_instruction: invalid length %d", inst_length);

	prot = PROT_READ | PROT_EXEC;
	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
	    copyinfo, nitems(copyinfo), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
	vie->num_valid = inst_length;
	return (0);
}

static int
vie_peek(struct vie *vie, uint8_t *x)
{

	if (vie->num_processed < vie->num_valid) {
		*x = vie->inst[vie->num_processed];
		return (0);
	} else
		return (-1);
}

static void
vie_advance(struct vie *vie)
{

	vie->num_processed++;
}

static bool
segment_override(uint8_t x, int *seg)
{

	switch (x) {
	case 0x2E:
		*seg = VM_REG_GUEST_CS;
		break;
	case 0x36:
		*seg = VM_REG_GUEST_SS;
		break;
	case 0x3E:
		*seg = VM_REG_GUEST_DS;
		break;
	case 0x26:
		*seg = VM_REG_GUEST_ES;
		break;
	case 0x64:
		*seg = VM_REG_GUEST_FS;
		break;
	case 0x65:
		*seg = VM_REG_GUEST_GS;
		break;
	default:
		return (false);
	}
	return (true);
}

static int
decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
{
	uint8_t x;

	while (1) {
		if (vie_peek(vie, &x))
			return (-1);

		if (x == 0x66)
			vie->opsize_override = 1;
		else if (x == 0x67)
			vie->addrsize_override = 1;
		else if (x == 0xF3)
			vie->repz_present = 1;
		else if (x == 0xF2)
			vie->repnz_present = 1;
		else if (segment_override(x, &vie->segment_register))
			vie->segment_override = 1;
		else
			break;

		vie_advance(vie);
	}

	/*
	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
	 * - Only one REX prefix is allowed per instruction.
	 * - The REX prefix must immediately precede the opcode byte or the
	 *   escape opcode byte.
	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
	 *   the mandatory prefix must come before the REX prefix.
	 */
	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
		vie->rex_present = 1;
		vie->rex_w = x & 0x8 ? 1 : 0;
		vie->rex_r = x & 0x4 ? 1 : 0;
		vie->rex_x = x & 0x2 ? 1 : 0;
		vie->rex_b = x & 0x1 ? 1 : 0;
		vie_advance(vie);
	}

	/*
	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
	 */
	if (cpu_mode == CPU_MODE_64BIT) {
		/*
		 * Default address size is 64-bits and default operand size
		 * is 32-bits.
		 */
		vie->addrsize = vie->addrsize_override ? 4 : 8;
		if (vie->rex_w)
			vie->opsize = 8;
		else if (vie->opsize_override)
			vie->opsize = 2;
		else
			vie->opsize = 4;
	} else if (cs_d) {
		/* Default address and operand sizes are 32-bits */
		vie->addrsize = vie->addrsize_override ? 2 : 4;
		vie->opsize = vie->opsize_override ? 2 : 4;
	} else {
		/* Default address and operand sizes are 16-bits */
		vie->addrsize = vie->addrsize_override ? 4 : 2;
		vie->opsize = vie->opsize_override ? 4 : 2;
	}
	return (0);
}
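/*
 * For illustration: in 64-bit mode the bare opcode 89 /r moves 32 bits,
 * 66 89 /r moves 16 bits and REX.W 89 /r moves 64 bits, which is exactly
 * how vie->opsize is derived above.
 */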
static int
decode_two_byte_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	vie->op = two_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);
	return (0);
}

static int
decode_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	vie->op = one_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);

	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
		return (decode_two_byte_opcode(vie));

	return (0);
}

static int
decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
{
	uint8_t x;

	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
		return (0);

	if (cpu_mode == CPU_MODE_REAL)
		return (-1);

	if (vie_peek(vie, &x))
		return (-1);

	vie->mod = (x >> 6) & 0x3;
	vie->rm = (x >> 0) & 0x7;
	vie->reg = (x >> 3) & 0x7;

	/*
	 * A direct addressing mode makes no sense in the context of an EPT
	 * fault. There has to be a memory access involved to cause the
	 * EPT fault.
	 */
	if (vie->mod == VIE_MOD_DIRECT)
		return (-1);

	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
		/*
		 * Table 2-5: Special Cases of REX Encodings
		 *
		 * mod=0, r/m=5 is used in the compatibility mode to
		 * indicate a disp32 without a base register.
		 *
		 * mod!=3, r/m=4 is used in the compatibility mode to
		 * indicate that the SIB byte is present.
		 *
		 * The 'b' bit in the REX prefix is don't care in
		 * this case.
		 */
	} else {
		vie->rm |= (vie->rex_b << 3);
	}

	vie->reg |= (vie->rex_r << 3);

	/* SIB */
	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
		goto done;

	vie->base_register = gpr_map[vie->rm];

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	case VIE_MOD_INDIRECT:
		if (vie->rm == VIE_RM_DISP32) {
			vie->disp_bytes = 4;
			/*
			 * Table 2-7. RIP-Relative Addressing
			 *
			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
			 * whereas in compatibility mode it just implies disp32.
			 */

			if (cpu_mode == CPU_MODE_64BIT)
				vie->base_register = VM_REG_GUEST_RIP;
			else
				vie->base_register = VM_REG_LAST;
		}
		break;
	}

done:
	vie_advance(vie);

	return (0);
}
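/*
 * For illustration: the ModR/M byte 0x51 decodes as mod=01, reg=010,
 * r/m=001, i.e. an indirect reference through %rcx with a disp8
 * displacement and %rdx (ModRM:reg) as the register operand, absent any
 * REX bits.
 */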

	/* SIB */
	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
		goto done;

	vie->base_register = gpr_map[vie->rm];

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	case VIE_MOD_INDIRECT:
		if (vie->rm == VIE_RM_DISP32) {
			vie->disp_bytes = 4;
			/*
			 * Table 2-7. RIP-Relative Addressing
			 *
			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
			 * whereas in compatibility mode it just implies disp32.
			 */

			if (cpu_mode == CPU_MODE_64BIT)
				vie->base_register = VM_REG_GUEST_RIP;
			else
				vie->base_register = VM_REG_LAST;
		}
		break;
	}

done:
	vie_advance(vie);

	return (0);
}

static int
decode_sib(struct vie *vie)
{
	uint8_t x;

	/* Proceed only if SIB byte is present */
	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
		return (0);

	if (vie_peek(vie, &x))
		return (-1);

	/* De-construct the SIB byte */
	vie->ss = (x >> 6) & 0x3;
	vie->index = (x >> 3) & 0x7;
	vie->base = (x >> 0) & 0x7;

	/* Apply the REX prefix modifiers */
	vie->index |= vie->rex_x << 3;
	vie->base |= vie->rex_b << 3;

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	}

	if (vie->mod == VIE_MOD_INDIRECT &&
	    (vie->base == 5 || vie->base == 13)) {
		/*
		 * Special case when base register is unused if mod = 0
		 * and base = %rbp or %r13.
		 *
		 * Documented in:
		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
		 * Table 2-5: Special Cases of REX Encodings
		 */
		vie->disp_bytes = 4;
	} else {
		vie->base_register = gpr_map[vie->base];
	}

	/*
	 * All encodings of 'index' are valid except for %rsp (4).
	 *
	 * Documented in:
	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
	 * Table 2-5: Special Cases of REX Encodings
	 */
	if (vie->index != 4)
		vie->index_register = gpr_map[vie->index];

	/* 'scale' makes sense only in the context of an index register */
	if (vie->index_register < VM_REG_LAST)
		vie->scale = 1 << vie->ss;

	vie_advance(vie);

	return (0);
}

static int
decode_displacement(struct vie *vie)
{
	int n, i;
	uint8_t x;

	union {
		char	buf[4];
		int8_t	signed8;
		int32_t	signed32;
	} u;

	if ((n = vie->disp_bytes) == 0)
		return (0);

	if (n != 1 && n != 4)
		panic("decode_displacement: invalid disp_bytes %d", n);

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	if (n == 1)
		vie->displacement = u.signed8;		/* sign-extended */
	else
		vie->displacement = u.signed32;		/* sign-extended */

	return (0);
}

static int
decode_immediate(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[4];
		int8_t	signed8;
		int16_t	signed16;
		int32_t	signed32;
	} u;

	/* Figure out immediate operand size (if any) */
	if (vie->op.op_flags & VIE_OP_F_IMM) {
		/*
		 * Section 2.2.1.5 "Immediates", Intel SDM:
		 * In 64-bit mode the typical size of immediate operands
		 * remains 32-bits.  When the operand size is 64-bits, the
		 * processor sign-extends all immediates to 64-bits prior
		 * to their use.
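		 *
		 * For instance (an illustrative encoding, not from the
		 * SDM): 48 C7 00 78 56 34 12, movq $0x12345678,(%rax),
		 * still carries only a 4-byte immediate even though REX.W
		 * makes the operand size 8 bytes, so the logic below reads
		 * 4 bytes and relies on the int32_t field of the union for
		 * the sign extension.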
		 */
		if (vie->opsize == 4 || vie->opsize == 8)
			vie->imm_bytes = 4;
		else
			vie->imm_bytes = 2;
	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
		vie->imm_bytes = 1;
	}

	if ((n = vie->imm_bytes) == 0)
		return (0);

	KASSERT(n == 1 || n == 2 || n == 4,
	    ("%s: invalid number of immediate bytes: %d", __func__, n));

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	/* sign-extend the immediate value before use */
	if (n == 1)
		vie->immediate = u.signed8;
	else if (n == 2)
		vie->immediate = u.signed16;
	else
		vie->immediate = u.signed32;

	return (0);
}

static int
decode_moffset(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[8];
		uint64_t u64;
	} u;

	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
		return (0);

	/*
	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
	 * The memory offset size follows the address-size of the instruction.
	 */
	n = vie->addrsize;
	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));

	u.u64 = 0;
	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}
	vie->displacement = u.u64;
	return (0);
}

/*
 * Verify that the 'guest linear address' provided as collateral of the nested
 * page table fault matches with our instruction decoding.
 */
static int
verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie,
    enum vm_cpu_mode cpu_mode)
{
	int error;
	uint64_t base, segbase, idx, gla2;
	enum vm_reg_name seg;
	struct seg_desc desc;

	/* Skip 'gla' verification */
	if (gla == VIE_INVALID_GLA)
		return (0);

	base = 0;
	if (vie->base_register != VM_REG_LAST) {
		error = vm_get_register(vm, cpuid, vie->base_register, &base);
		if (error) {
			printf("verify_gla: error %d getting base reg %d\n",
			    error, vie->base_register);
			return (-1);
		}

		/*
		 * RIP-relative addressing starts from the following
		 * instruction
		 */
		if (vie->base_register == VM_REG_GUEST_RIP)
			base += vie->num_processed;
	}

	idx = 0;
	if (vie->index_register != VM_REG_LAST) {
		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
		if (error) {
			printf("verify_gla: error %d getting index reg %d\n",
			    error, vie->index_register);
			return (-1);
		}
	}

	/*
	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
	 *
	 * In 64-bit mode, segmentation is generally (but not
	 * completely) disabled.  The exceptions are the FS and GS
	 * segments.
	 *
	 * In legacy IA-32 mode, when the ESP or EBP register is used
	 * as the base, the SS segment is the default segment.  For
	 * other data references, except when relative to the stack or
	 * a string destination, the DS segment is the default.  These
	 * can be overridden to allow other segments to be accessed.
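	 *
	 * As a hypothetical check (register values invented purely for
	 * illustration): for mov %eax,0x10(%rbx,%rcx,4) with a flat data
	 * segment, the recomputation below reduces to
	 *
	 *	gla2 = 0 + %rbx + 4 * %rcx + 0x10
	 *
	 * and must match the 'gla' reported with the nested page fault.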
	 */
	if (vie->segment_override)
		seg = vie->segment_register;
	else if (vie->base_register == VM_REG_GUEST_RSP ||
	    vie->base_register == VM_REG_GUEST_RBP)
		seg = VM_REG_GUEST_SS;
	else
		seg = VM_REG_GUEST_DS;
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		error = vm_get_seg_desc(vm, cpuid, seg, &desc);
		if (error) {
			printf("verify_gla: error %d getting segment"
			    " descriptor %d\n", error, seg);
			return (-1);
		}
		segbase = desc.base;
	}

	gla2 = segbase + base + vie->scale * idx + vie->displacement;
	gla2 &= size2mask[vie->addrsize];
	if (gla != gla2) {
		printf("verify_gla mismatch: segbase(0x%0lx), "
		    "base(0x%0lx), scale(%d), index(0x%0lx), "
		    "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
		    segbase, base, vie->scale, idx, vie->displacement,
		    gla, gla2);
		return (-1);
	}

	return (0);
}

int
vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
    enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
{

	if (decode_prefixes(vie, cpu_mode, cs_d))
		return (-1);

	if (decode_opcode(vie))
		return (-1);

	if (decode_modrm(vie, cpu_mode))
		return (-1);

	if (decode_sib(vie))
		return (-1);

	if (decode_displacement(vie))
		return (-1);

	if (decode_immediate(vie))
		return (-1);

	if (decode_moffset(vie))
		return (-1);

	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
		if (verify_gla(vm, cpuid, gla, vie, cpu_mode))
			return (-1);
	}

	vie->decoded = 1;	/* success */

	return (0);
}
#endif	/* _KERNEL */
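
/*
 * A minimal usage sketch of the decode path above (hypothetical caller,
 * not part of this file): on a nested page table fault the exit handler
 * first fetches the faulting instruction bytes at %rip and then decodes
 * them before attempting emulation.
 *
 *	struct vie vie;			// zeroed/initialized by the caller
 *	int fault;
 *
 *	if (vmm_fetch_instruction(vm, vcpuid, &paging, rip, inst_length,
 *	    &vie, &fault) || fault)
 *		return;			// retry the exit or reflect the fault
 *	if (vmm_decode_instruction(vm, vcpuid, gla, paging.cpu_mode, cs_d,
 *	    &vie))
 *		return;			// instruction not decodable
 */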