1 /*- 2 * Copyright (c) 2012 Sandvine, Inc. 3 * Copyright (c) 2012 NetApp, Inc. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 * 27 * $FreeBSD$ 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #ifdef _KERNEL 34 #include <sys/param.h> 35 #include <sys/pcpu.h> 36 #include <sys/systm.h> 37 #include <sys/proc.h> 38 39 #include <vm/vm.h> 40 #include <vm/pmap.h> 41 42 #include <machine/vmparam.h> 43 #include <machine/vmm.h> 44 #else /* !_KERNEL */ 45 #include <sys/types.h> 46 #include <sys/errno.h> 47 #include <sys/_iovec.h> 48 49 #include <machine/vmm.h> 50 51 #include <assert.h> 52 #include <vmmapi.h> 53 #define KASSERT(exp,msg) assert((exp)) 54 #endif /* _KERNEL */ 55 56 #include <machine/vmm_instruction_emul.h> 57 #include <x86/psl.h> 58 #include <x86/specialreg.h> 59 60 /* struct vie_op.op_type */ 61 enum { 62 VIE_OP_TYPE_NONE = 0, 63 VIE_OP_TYPE_MOV, 64 VIE_OP_TYPE_MOVSX, 65 VIE_OP_TYPE_MOVZX, 66 VIE_OP_TYPE_AND, 67 VIE_OP_TYPE_OR, 68 VIE_OP_TYPE_SUB, 69 VIE_OP_TYPE_TWO_BYTE, 70 VIE_OP_TYPE_PUSH, 71 VIE_OP_TYPE_CMP, 72 VIE_OP_TYPE_POP, 73 VIE_OP_TYPE_MOVS, 74 VIE_OP_TYPE_GROUP1, 75 VIE_OP_TYPE_STOS, 76 VIE_OP_TYPE_LAST 77 }; 78 79 /* struct vie_op.op_flags */ 80 #define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */ 81 #define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ 82 #define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */ 83 #define VIE_OP_F_NO_MODRM (1 << 3) 84 #define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) 85 86 static const struct vie_op two_byte_opcodes[256] = { 87 [0xB6] = { 88 .op_byte = 0xB6, 89 .op_type = VIE_OP_TYPE_MOVZX, 90 }, 91 [0xB7] = { 92 .op_byte = 0xB7, 93 .op_type = VIE_OP_TYPE_MOVZX, 94 }, 95 [0xBE] = { 96 .op_byte = 0xBE, 97 .op_type = VIE_OP_TYPE_MOVSX, 98 }, 99 }; 100 101 static const struct vie_op one_byte_opcodes[256] = { 102 [0x0F] = { 103 .op_byte = 0x0F, 104 .op_type = VIE_OP_TYPE_TWO_BYTE 105 }, 106 [0x2B] = { 107 .op_byte = 0x2B, 108 .op_type = VIE_OP_TYPE_SUB, 109 }, 110 [0x3B] = { 111 .op_byte = 0x3B, 112 .op_type = VIE_OP_TYPE_CMP, 113 }, 114 [0x88] = { 115 .op_byte = 0x88, 116 .op_type = VIE_OP_TYPE_MOV, 117 }, 118 [0x89] = { 119 .op_byte = 0x89, 120 .op_type = VIE_OP_TYPE_MOV, 121 }, 122 [0x8A] = { 123 .op_byte = 0x8A, 124 
.op_type = VIE_OP_TYPE_MOV, 125 }, 126 [0x8B] = { 127 .op_byte = 0x8B, 128 .op_type = VIE_OP_TYPE_MOV, 129 }, 130 [0xA1] = { 131 .op_byte = 0xA1, 132 .op_type = VIE_OP_TYPE_MOV, 133 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, 134 }, 135 [0xA3] = { 136 .op_byte = 0xA3, 137 .op_type = VIE_OP_TYPE_MOV, 138 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, 139 }, 140 [0xA4] = { 141 .op_byte = 0xA4, 142 .op_type = VIE_OP_TYPE_MOVS, 143 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 144 }, 145 [0xA5] = { 146 .op_byte = 0xA5, 147 .op_type = VIE_OP_TYPE_MOVS, 148 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 149 }, 150 [0xAA] = { 151 .op_byte = 0xAA, 152 .op_type = VIE_OP_TYPE_STOS, 153 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 154 }, 155 [0xAB] = { 156 .op_byte = 0xAB, 157 .op_type = VIE_OP_TYPE_STOS, 158 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 159 }, 160 [0xC6] = { 161 /* XXX Group 11 extended opcode - not just MOV */ 162 .op_byte = 0xC6, 163 .op_type = VIE_OP_TYPE_MOV, 164 .op_flags = VIE_OP_F_IMM8, 165 }, 166 [0xC7] = { 167 .op_byte = 0xC7, 168 .op_type = VIE_OP_TYPE_MOV, 169 .op_flags = VIE_OP_F_IMM, 170 }, 171 [0x23] = { 172 .op_byte = 0x23, 173 .op_type = VIE_OP_TYPE_AND, 174 }, 175 [0x81] = { 176 /* XXX Group 1 extended opcode */ 177 .op_byte = 0x81, 178 .op_type = VIE_OP_TYPE_GROUP1, 179 .op_flags = VIE_OP_F_IMM, 180 }, 181 [0x83] = { 182 /* XXX Group 1 extended opcode */ 183 .op_byte = 0x83, 184 .op_type = VIE_OP_TYPE_GROUP1, 185 .op_flags = VIE_OP_F_IMM8, 186 }, 187 [0x8F] = { 188 /* XXX Group 1A extended opcode - not just POP */ 189 .op_byte = 0x8F, 190 .op_type = VIE_OP_TYPE_POP, 191 }, 192 [0xFF] = { 193 /* XXX Group 5 extended opcode - not just PUSH */ 194 .op_byte = 0xFF, 195 .op_type = VIE_OP_TYPE_PUSH, 196 } 197 }; 198 199 /* struct vie.mod */ 200 #define VIE_MOD_INDIRECT 0 201 #define VIE_MOD_INDIRECT_DISP8 1 202 #define VIE_MOD_INDIRECT_DISP32 2 203 #define VIE_MOD_DIRECT 3 204 205 /* struct vie.rm */ 206 #define VIE_RM_SIB 4 207 #define VIE_RM_DISP32 5 208 209 #define GB (1024 * 1024 * 1024) 210 211 static enum vm_reg_name gpr_map[16] = { 212 VM_REG_GUEST_RAX, 213 VM_REG_GUEST_RCX, 214 VM_REG_GUEST_RDX, 215 VM_REG_GUEST_RBX, 216 VM_REG_GUEST_RSP, 217 VM_REG_GUEST_RBP, 218 VM_REG_GUEST_RSI, 219 VM_REG_GUEST_RDI, 220 VM_REG_GUEST_R8, 221 VM_REG_GUEST_R9, 222 VM_REG_GUEST_R10, 223 VM_REG_GUEST_R11, 224 VM_REG_GUEST_R12, 225 VM_REG_GUEST_R13, 226 VM_REG_GUEST_R14, 227 VM_REG_GUEST_R15 228 }; 229 230 static uint64_t size2mask[] = { 231 [1] = 0xff, 232 [2] = 0xffff, 233 [4] = 0xffffffff, 234 [8] = 0xffffffffffffffff, 235 }; 236 237 static int 238 vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval) 239 { 240 int error; 241 242 error = vm_get_register(vm, vcpuid, reg, rval); 243 244 return (error); 245 } 246 247 static void 248 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) 249 { 250 *lhbr = 0; 251 *reg = gpr_map[vie->reg]; 252 253 /* 254 * 64-bit mode imposes limitations on accessing legacy high byte 255 * registers (lhbr). 256 * 257 * The legacy high-byte registers cannot be addressed if the REX 258 * prefix is present. In this case the values 4, 5, 6 and 7 of the 259 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. 260 * 261 * If the REX prefix is not present then the values 4, 5, 6 and 7 262 * of the 'ModRM:reg' field address the legacy high-byte registers, 263 * %ah, %ch, %dh and %bh respectively. 
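 *
 * For example, ModRM:reg = 4 refers to %ah when there is no REX
 * prefix but to %spl when any REX prefix is present.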
264 */ 265 if (!vie->rex_present) { 266 if (vie->reg & 0x4) { 267 *lhbr = 1; 268 *reg = gpr_map[vie->reg & 0x3]; 269 } 270 } 271 } 272 273 static int 274 vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) 275 { 276 uint64_t val; 277 int error, lhbr; 278 enum vm_reg_name reg; 279 280 vie_calc_bytereg(vie, &reg, &lhbr); 281 error = vm_get_register(vm, vcpuid, reg, &val); 282 283 /* 284 * To obtain the value of a legacy high byte register shift the 285 * base register right by 8 bits (%ah = %rax >> 8). 286 */ 287 if (lhbr) 288 *rval = val >> 8; 289 else 290 *rval = val; 291 return (error); 292 } 293 294 static int 295 vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte) 296 { 297 uint64_t origval, val, mask; 298 int error, lhbr; 299 enum vm_reg_name reg; 300 301 vie_calc_bytereg(vie, &reg, &lhbr); 302 error = vm_get_register(vm, vcpuid, reg, &origval); 303 if (error == 0) { 304 val = byte; 305 mask = 0xff; 306 if (lhbr) { 307 /* 308 * Shift left by 8 to store 'byte' in a legacy high 309 * byte register. 310 */ 311 val <<= 8; 312 mask <<= 8; 313 } 314 val |= origval & ~mask; 315 error = vm_set_register(vm, vcpuid, reg, val); 316 } 317 return (error); 318 } 319 320 int 321 vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, 322 uint64_t val, int size) 323 { 324 int error; 325 uint64_t origval; 326 327 switch (size) { 328 case 1: 329 case 2: 330 error = vie_read_register(vm, vcpuid, reg, &origval); 331 if (error) 332 return (error); 333 val &= size2mask[size]; 334 val |= origval & ~size2mask[size]; 335 break; 336 case 4: 337 val &= 0xffffffffUL; 338 break; 339 case 8: 340 break; 341 default: 342 return (EINVAL); 343 } 344 345 error = vm_set_register(vm, vcpuid, reg, val); 346 return (error); 347 } 348 349 #define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V) 350 351 /* 352 * Return the status flags that would result from doing (x - y).
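 *
 * This is computed by executing a SUB of the same operand width on
 * the host and capturing the host %rflags immediately afterwards
 * with pushfq/popq (see the inline assembly in GETCC() below).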
353 */ 354 #define GETCC(sz) \ 355 static u_long \ 356 getcc##sz(uint##sz##_t x, uint##sz##_t y) \ 357 { \ 358 u_long rflags; \ 359 \ 360 __asm __volatile("sub %2,%1; pushfq; popq %0" : \ 361 "=r" (rflags), "+r" (x) : "m" (y)); \ 362 return (rflags); \ 363 } struct __hack 364 365 GETCC(8); 366 GETCC(16); 367 GETCC(32); 368 GETCC(64); 369 370 static u_long 371 getcc(int opsize, uint64_t x, uint64_t y) 372 { 373 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, 374 ("getcc: invalid operand size %d", opsize)); 375 376 if (opsize == 1) 377 return (getcc8(x, y)); 378 else if (opsize == 2) 379 return (getcc16(x, y)); 380 else if (opsize == 4) 381 return (getcc32(x, y)); 382 else 383 return (getcc64(x, y)); 384 } 385 386 static int 387 emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 388 mem_region_read_t memread, mem_region_write_t memwrite, void *arg) 389 { 390 int error, size; 391 enum vm_reg_name reg; 392 uint8_t byte; 393 uint64_t val; 394 395 size = vie->opsize; 396 error = EINVAL; 397 398 switch (vie->op.op_byte) { 399 case 0x88: 400 /* 401 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) 402 * 88/r: mov r/m8, r8 403 * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) 404 */ 405 size = 1; /* override for byte operation */ 406 error = vie_read_bytereg(vm, vcpuid, vie, &byte); 407 if (error == 0) 408 error = memwrite(vm, vcpuid, gpa, byte, size, arg); 409 break; 410 case 0x89: 411 /* 412 * MOV from reg (ModRM:reg) to mem (ModRM:r/m) 413 * 89/r: mov r/m16, r16 414 * 89/r: mov r/m32, r32 415 * REX.W + 89/r mov r/m64, r64 416 */ 417 reg = gpr_map[vie->reg]; 418 error = vie_read_register(vm, vcpuid, reg, &val); 419 if (error == 0) { 420 val &= size2mask[size]; 421 error = memwrite(vm, vcpuid, gpa, val, size, arg); 422 } 423 break; 424 case 0x8A: 425 /* 426 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) 427 * 8A/r: mov r8, r/m8 428 * REX + 8A/r: mov r8, r/m8 429 */ 430 size = 1; /* override for byte operation */ 431 error = memread(vm, vcpuid, gpa, &val, size, arg); 432 if (error == 0) 433 error = vie_write_bytereg(vm, vcpuid, vie, val); 434 break; 435 case 0x8B: 436 /* 437 * MOV from mem (ModRM:r/m) to reg (ModRM:reg) 438 * 8B/r: mov r16, r/m16 439 * 8B/r: mov r32, r/m32 440 * REX.W 8B/r: mov r64, r/m64 441 */ 442 error = memread(vm, vcpuid, gpa, &val, size, arg); 443 if (error == 0) { 444 reg = gpr_map[vie->reg]; 445 error = vie_update_register(vm, vcpuid, reg, val, size); 446 } 447 break; 448 case 0xA1: 449 /* 450 * MOV from seg:moffset to AX/EAX/RAX 451 * A1: mov AX, moffs16 452 * A1: mov EAX, moffs32 453 * REX.W + A1: mov RAX, moffs64 454 */ 455 error = memread(vm, vcpuid, gpa, &val, size, arg); 456 if (error == 0) { 457 reg = VM_REG_GUEST_RAX; 458 error = vie_update_register(vm, vcpuid, reg, val, size); 459 } 460 break; 461 case 0xA3: 462 /* 463 * MOV from AX/EAX/RAX to seg:moffset 464 * A3: mov moffs16, AX 465 * A3: mov moffs32, EAX 466 * REX.W + A3: mov moffs64, RAX 467 */ 468 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); 469 if (error == 0) { 470 val &= size2mask[size]; 471 error = memwrite(vm, vcpuid, gpa, val, size, arg); 472 } 473 break; 474 case 0xC6: 475 /* 476 * MOV from imm8 to mem (ModRM:r/m) 477 * C6/0 mov r/m8, imm8 478 * REX + C6/0 mov r/m8, imm8 479 */ 480 size = 1; /* override for byte operation */ 481 error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg); 482 break; 483 case 0xC7: 484 /* 485 * MOV from imm16/imm32 to mem (ModRM:r/m) 486 * C7/0 mov r/m16, imm16 487 * C7/0 mov r/m32, imm32 488 * REX.W + 
C7/0 mov r/m64, imm32 (sign-extended to 64-bits) 489 */ 490 val = vie->immediate & size2mask[size]; 491 error = memwrite(vm, vcpuid, gpa, val, size, arg); 492 break; 493 default: 494 break; 495 } 496 497 return (error); 498 } 499 500 static int 501 emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 502 mem_region_read_t memread, mem_region_write_t memwrite, 503 void *arg) 504 { 505 int error, size; 506 enum vm_reg_name reg; 507 uint64_t val; 508 509 size = vie->opsize; 510 error = EINVAL; 511 512 switch (vie->op.op_byte) { 513 case 0xB6: 514 /* 515 * MOV and zero extend byte from mem (ModRM:r/m) to 516 * reg (ModRM:reg). 517 * 518 * 0F B6/r movzx r16, r/m8 519 * 0F B6/r movzx r32, r/m8 520 * REX.W + 0F B6/r movzx r64, r/m8 521 */ 522 523 /* get the first operand */ 524 error = memread(vm, vcpuid, gpa, &val, 1, arg); 525 if (error) 526 break; 527 528 /* get the second operand */ 529 reg = gpr_map[vie->reg]; 530 531 /* zero-extend byte */ 532 val = (uint8_t)val; 533 534 /* write the result */ 535 error = vie_update_register(vm, vcpuid, reg, val, size); 536 break; 537 case 0xB7: 538 /* 539 * MOV and zero extend word from mem (ModRM:r/m) to 540 * reg (ModRM:reg). 541 * 542 * 0F B7/r movzx r32, r/m16 543 * REX.W + 0F B7/r movzx r64, r/m16 544 */ 545 error = memread(vm, vcpuid, gpa, &val, 2, arg); 546 if (error) 547 return (error); 548 549 reg = gpr_map[vie->reg]; 550 551 /* zero-extend word */ 552 val = (uint16_t)val; 553 554 error = vie_update_register(vm, vcpuid, reg, val, size); 555 break; 556 case 0xBE: 557 /* 558 * MOV and sign extend byte from mem (ModRM:r/m) to 559 * reg (ModRM:reg). 560 * 561 * 0F BE/r movsx r16, r/m8 562 * 0F BE/r movsx r32, r/m8 563 * REX.W + 0F BE/r movsx r64, r/m8 564 */ 565 566 /* get the first operand */ 567 error = memread(vm, vcpuid, gpa, &val, 1, arg); 568 if (error) 569 break; 570 571 /* get the second operand */ 572 reg = gpr_map[vie->reg]; 573 574 /* sign extend byte */ 575 val = (int8_t)val; 576 577 /* write the result */ 578 error = vie_update_register(vm, vcpuid, reg, val, size); 579 break; 580 default: 581 break; 582 } 583 return (error); 584 } 585 586 /* 587 * Helper function to calculate and validate a linear address. 588 * 589 * Returns 0 on success and 1 if an exception was injected into the guest. 
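 *
 * The injected exception is #SS for stack-segment accesses and #GP
 * otherwise, or #AC if the access fails the alignment check,
 * matching the checks performed below.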
590 */ 591 static int 592 get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, 593 int opsize, int addrsize, int prot, enum vm_reg_name seg, 594 enum vm_reg_name gpr, uint64_t *gla) 595 { 596 struct seg_desc desc; 597 uint64_t cr0, val, rflags; 598 int error; 599 600 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); 601 KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); 602 603 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 604 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 605 606 error = vm_get_seg_desc(vm, vcpuid, seg, &desc); 607 KASSERT(error == 0, ("%s: error %d getting segment descriptor %d", 608 __func__, error, seg)); 609 610 error = vie_read_register(vm, vcpuid, gpr, &val); 611 KASSERT(error == 0, ("%s: error %d getting register %d", __func__, 612 error, gpr)); 613 614 if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize, 615 addrsize, prot, gla)) { 616 if (seg == VM_REG_GUEST_SS) 617 vm_inject_ss(vm, vcpuid, 0); 618 else 619 vm_inject_gp(vm, vcpuid); 620 return (1); 621 } 622 623 if (vie_canonical_check(paging->cpu_mode, *gla)) { 624 if (seg == VM_REG_GUEST_SS) 625 vm_inject_ss(vm, vcpuid, 0); 626 else 627 vm_inject_gp(vm, vcpuid); 628 return (1); 629 } 630 631 if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { 632 vm_inject_ac(vm, vcpuid, 0); 633 return (1); 634 } 635 636 return (0); 637 } 638 639 static int 640 emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 641 struct vm_guest_paging *paging, mem_region_read_t memread, 642 mem_region_write_t memwrite, void *arg) 643 { 644 #ifdef _KERNEL 645 struct vm_copyinfo copyinfo[2]; 646 #else 647 struct iovec copyinfo[2]; 648 #endif 649 uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; 650 uint64_t rcx, rdi, rsi, rflags; 651 int error, opsize, seg, repeat; 652 653 opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize; 654 val = 0; 655 error = 0; 656 657 /* 658 * XXX although the MOVS instruction is only supposed to be used with 659 * the "rep" prefix some guests like FreeBSD will use "repnz" instead. 660 * 661 * Empirically the "repnz" prefix has identical behavior to "rep" 662 * and the zero flag does not make a difference. 663 */ 664 repeat = vie->repz_present | vie->repnz_present; 665 666 if (repeat) { 667 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); 668 KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); 669 670 /* 671 * The count register is %rcx, %ecx or %cx depending on the 672 * address size of the instruction. 673 */ 674 if ((rcx & vie_size2mask(vie->addrsize)) == 0) 675 return (0); 676 } 677 678 /* 679 * Source Destination Comments 680 * -------------------------------------------- 681 * (1) memory memory n/a 682 * (2) memory mmio emulated 683 * (3) mmio memory emulated 684 * (4) mmio mmio emulated 685 * 686 * At this point we don't have sufficient information to distinguish 687 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this 688 * out because it will succeed only when operating on regular memory. 689 * 690 * XXX the emulation doesn't properly handle the case where 'gpa' 691 * is straddling the boundary between the normal memory and MMIO. 692 */ 693 694 seg = vie->segment_override ? 
vie->segment_register : VM_REG_GUEST_DS; 695 error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, 696 PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr); 697 if (error) 698 goto done; 699 700 error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ, 701 copyinfo, nitems(copyinfo)); 702 if (error == 0) { 703 /* 704 * case (2): read from system memory and write to mmio. 705 */ 706 vm_copyin(vm, vcpuid, copyinfo, &val, opsize); 707 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 708 error = memwrite(vm, vcpuid, gpa, val, opsize, arg); 709 if (error) 710 goto done; 711 } else if (error > 0) { 712 /* 713 * Resume guest execution to handle fault. 714 */ 715 goto done; 716 } else { 717 /* 718 * 'vm_copy_setup()' is expected to fail for cases (3) and (4) 719 * if 'srcaddr' is in the mmio space. 720 */ 721 722 error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, 723 PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr); 724 if (error) 725 goto done; 726 727 error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, 728 PROT_WRITE, copyinfo, nitems(copyinfo)); 729 if (error == 0) { 730 /* 731 * case (3): read from MMIO and write to system memory. 732 * 733 * A MMIO read can have side-effects so we 734 * commit to it only after vm_copy_setup() is 735 * successful. If a page-fault needs to be 736 * injected into the guest then it will happen 737 * before the MMIO read is attempted. 738 */ 739 error = memread(vm, vcpuid, gpa, &val, opsize, arg); 740 if (error) 741 goto done; 742 743 vm_copyout(vm, vcpuid, &val, copyinfo, opsize); 744 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 745 } else if (error > 0) { 746 /* 747 * Resume guest execution to handle fault. 748 */ 749 goto done; 750 } else { 751 /* 752 * Case (4): read from and write to mmio. 753 */ 754 error = vm_gla2gpa(vm, vcpuid, paging, srcaddr, 755 PROT_READ, &srcgpa); 756 if (error) 757 goto done; 758 error = memread(vm, vcpuid, srcgpa, &val, opsize, arg); 759 if (error) 760 goto done; 761 762 error = vm_gla2gpa(vm, vcpuid, paging, dstaddr, 763 PROT_WRITE, &dstgpa); 764 if (error) 765 goto done; 766 error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg); 767 if (error) 768 goto done; 769 } 770 } 771 772 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi); 773 KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); 774 775 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); 776 KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); 777 778 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 779 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 780 781 if (rflags & PSL_D) { 782 rsi -= opsize; 783 rdi -= opsize; 784 } else { 785 rsi += opsize; 786 rdi += opsize; 787 } 788 789 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi, 790 vie->addrsize); 791 KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error)); 792 793 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, 794 vie->addrsize); 795 KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); 796 797 if (repeat) { 798 rcx = rcx - 1; 799 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, 800 rcx, vie->addrsize); 801 KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); 802 803 /* 804 * Repeat the instruction if the count register is not zero. 
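 * vm_restart_instruction() causes this MOVS to be executed again, so
 * the emulation makes forward progress one element per invocation.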
805 */ 806 if ((rcx & vie_size2mask(vie->addrsize)) != 0) 807 vm_restart_instruction(vm, vcpuid); 808 } 809 done: 810 if (error < 0) 811 return (EFAULT); 812 else 813 return (0); 814 } 815 816 static int 817 emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 818 struct vm_guest_paging *paging, mem_region_read_t memread, 819 mem_region_write_t memwrite, void *arg) 820 { 821 int error, opsize, repeat; 822 uint64_t val; 823 uint64_t rcx, rdi, rflags; 824 825 opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize; 826 repeat = vie->repz_present | vie->repnz_present; 827 828 if (repeat) { 829 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); 830 KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); 831 832 /* 833 * The count register is %rcx, %ecx or %cx depending on the 834 * address size of the instruction. 835 */ 836 if ((rcx & vie_size2mask(vie->addrsize)) == 0) 837 return (0); 838 } 839 840 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); 841 KASSERT(!error, ("%s: error %d getting rax", __func__, error)); 842 843 error = memwrite(vm, vcpuid, gpa, val, opsize, arg); 844 if (error) 845 return (error); 846 847 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); 848 KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); 849 850 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 851 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 852 853 if (rflags & PSL_D) 854 rdi -= opsize; 855 else 856 rdi += opsize; 857 858 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, 859 vie->addrsize); 860 KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); 861 862 if (repeat) { 863 rcx = rcx - 1; 864 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, 865 rcx, vie->addrsize); 866 KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); 867 868 /* 869 * Repeat the instruction if the count register is not zero. 870 */ 871 if ((rcx & vie_size2mask(vie->addrsize)) != 0) 872 vm_restart_instruction(vm, vcpuid); 873 } 874 875 return (0); 876 } 877 878 static int 879 emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 880 mem_region_read_t memread, mem_region_write_t memwrite, void *arg) 881 { 882 int error, size; 883 enum vm_reg_name reg; 884 uint64_t result, rflags, rflags2, val1, val2; 885 886 size = vie->opsize; 887 error = EINVAL; 888 889 switch (vie->op.op_byte) { 890 case 0x23: 891 /* 892 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the 893 * result in reg. 894 * 895 * 23/r and r16, r/m16 896 * 23/r and r32, r/m32 897 * REX.W + 23/r and r64, r/m64 898 */ 899 900 /* get the first operand */ 901 reg = gpr_map[vie->reg]; 902 error = vie_read_register(vm, vcpuid, reg, &val1); 903 if (error) 904 break; 905 906 /* get the second operand */ 907 error = memread(vm, vcpuid, gpa, &val2, size, arg); 908 if (error) 909 break; 910 911 /* perform the operation and write the result */ 912 result = val1 & val2; 913 error = vie_update_register(vm, vcpuid, reg, result, size); 914 break; 915 case 0x81: 916 case 0x83: 917 /* 918 * AND mem (ModRM:r/m) with immediate and store the 919 * result in mem. 
920 * 921 * 81 /4 and r/m16, imm16 922 * 81 /4 and r/m32, imm32 923 * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 924 * 925 * 83 /4 and r/m16, imm8 sign-extended to 16 926 * 83 /4 and r/m32, imm8 sign-extended to 32 927 * REX.W + 83/4 and r/m64, imm8 sign-extended to 64 928 */ 929 930 /* get the first operand */ 931 error = memread(vm, vcpuid, gpa, &val1, size, arg); 932 if (error) 933 break; 934 935 /* 936 * perform the operation with the pre-fetched immediate 937 * operand and write the result 938 */ 939 result = val1 & vie->immediate; 940 error = memwrite(vm, vcpuid, gpa, result, size, arg); 941 break; 942 default: 943 break; 944 } 945 if (error) 946 return (error); 947 948 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 949 if (error) 950 return (error); 951 952 /* 953 * OF and CF are cleared; the SF, ZF and PF flags are set according 954 * to the result; AF is undefined. 955 * 956 * The updated status flags are obtained by subtracting 0 from 'result'. 957 */ 958 rflags2 = getcc(size, result, 0); 959 rflags &= ~RFLAGS_STATUS_BITS; 960 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); 961 962 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); 963 return (error); 964 } 965 966 static int 967 emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 968 mem_region_read_t memread, mem_region_write_t memwrite, void *arg) 969 { 970 int error, size; 971 uint64_t val1, result, rflags, rflags2; 972 973 size = vie->opsize; 974 error = EINVAL; 975 976 switch (vie->op.op_byte) { 977 case 0x81: 978 case 0x83: 979 /* 980 * OR mem (ModRM:r/m) with immediate and store the 981 * result in mem. 982 * 983 * 81 /1 or r/m16, imm16 984 * 81 /1 or r/m32, imm32 985 * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64 986 * 987 * 83 /1 or r/m16, imm8 sign-extended to 16 988 * 83 /1 or r/m32, imm8 sign-extended to 32 989 * REX.W + 83/1 or r/m64, imm8 sign-extended to 64 990 */ 991 992 /* get the first operand */ 993 error = memread(vm, vcpuid, gpa, &val1, size, arg); 994 if (error) 995 break; 996 997 /* 998 * perform the operation with the pre-fetched immediate 999 * operand and write the result 1000 */ 1001 result = val1 | vie->immediate; 1002 error = memwrite(vm, vcpuid, gpa, result, size, arg); 1003 break; 1004 default: 1005 break; 1006 } 1007 if (error) 1008 return (error); 1009 1010 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1011 if (error) 1012 return (error); 1013 1014 /* 1015 * OF and CF are cleared; the SF, ZF and PF flags are set according 1016 * to the result; AF is undefined. 1017 * 1018 * The updated status flags are obtained by subtracting 0 from 'result'. 1019 */ 1020 rflags2 = getcc(size, result, 0); 1021 rflags &= ~RFLAGS_STATUS_BITS; 1022 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); 1023 1024 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); 1025 return (error); 1026 } 1027 1028 static int 1029 emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 1030 mem_region_read_t memread, mem_region_write_t memwrite, void *arg) 1031 { 1032 int error, size; 1033 uint64_t op1, op2, rflags, rflags2; 1034 enum vm_reg_name reg; 1035 1036 size = vie->opsize; 1037 switch (vie->op.op_byte) { 1038 case 0x3B: 1039 /* 1040 * 3B/r CMP r16, r/m16 1041 * 3B/r CMP r32, r/m32 1042 * REX.W + 3B/r CMP r64, r/m64 1043 * 1044 * Compare first operand (reg) with second operand (r/m) and 1045 * set status flags in EFLAGS register. 
The comparison is 1046 * performed by subtracting the second operand from the first 1047 * operand and then setting the status flags. 1048 */ 1049 1050 /* Get the first operand */ 1051 reg = gpr_map[vie->reg]; 1052 error = vie_read_register(vm, vcpuid, reg, &op1); 1053 if (error) 1054 return (error); 1055 1056 /* Get the second operand */ 1057 error = memread(vm, vcpuid, gpa, &op2, size, arg); 1058 if (error) 1059 return (error); 1060 1061 rflags2 = getcc(size, op1, op2); 1062 break; 1063 case 0x81: 1064 case 0x83: 1065 /* 1066 * 81 /7 cmp r/m16, imm16 1067 * 81 /7 cmp r/m32, imm32 1068 * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64 1069 * 1070 * 83 /7 cmp r/m16, imm8 sign-extended to 16 1071 * 83 /7 cmp r/m32, imm8 sign-extended to 32 1072 * REX.W + 83 /7 cmp r/m64, imm8 sign-extended to 64 1073 * 1074 * Compare mem (ModRM:r/m) with immediate and set 1075 * status flags according to the results. The 1076 * comparison is performed by subtracting the 1077 * immediate from the first operand and then setting 1078 * the status flags. 1079 * 1080 */ 1081 1082 /* get the first operand */ 1083 error = memread(vm, vcpuid, gpa, &op1, size, arg); 1084 if (error) 1085 return (error); 1086 1087 rflags2 = getcc(size, op1, vie->immediate); 1088 break; 1089 default: 1090 return (EINVAL); 1091 } 1092 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1093 if (error) 1094 return (error); 1095 rflags &= ~RFLAGS_STATUS_BITS; 1096 rflags |= rflags2 & RFLAGS_STATUS_BITS; 1097 1098 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); 1099 return (error); 1100 } 1101 1102 static int 1103 emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 1104 mem_region_read_t memread, mem_region_write_t memwrite, void *arg) 1105 { 1106 int error, size; 1107 uint64_t nval, rflags, rflags2, val1, val2; 1108 enum vm_reg_name reg; 1109 1110 size = vie->opsize; 1111 error = EINVAL; 1112 1113 switch (vie->op.op_byte) { 1114 case 0x2B: 1115 /* 1116 * SUB r/m from r and store the result in r 1117 * 1118 * 2B/r SUB r16, r/m16 1119 * 2B/r SUB r32, r/m32 1120 * REX.W + 2B/r SUB r64, r/m64 1121 */ 1122 1123 /* get the first operand */ 1124 reg = gpr_map[vie->reg]; 1125 error = vie_read_register(vm, vcpuid, reg, &val1); 1126 if (error) 1127 break; 1128 1129 /* get the second operand */ 1130 error = memread(vm, vcpuid, gpa, &val2, size, arg); 1131 if (error) 1132 break; 1133 1134 /* perform the operation and write the result */ 1135 nval = val1 - val2; 1136 error = vie_update_register(vm, vcpuid, reg, nval, size); 1137 break; 1138 default: 1139 break; 1140 } 1141 1142 if (!error) { 1143 rflags2 = getcc(size, val1, val2); 1144 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 1145 &rflags); 1146 if (error) 1147 return (error); 1148 1149 rflags &= ~RFLAGS_STATUS_BITS; 1150 rflags |= rflags2 & RFLAGS_STATUS_BITS; 1151 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 1152 rflags, 8); 1153 } 1154 1155 return (error); 1156 } 1157 1158 static int 1159 emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, 1160 struct vm_guest_paging *paging, mem_region_read_t memread, 1161 mem_region_write_t memwrite, void *arg) 1162 { 1163 #ifdef _KERNEL 1164 struct vm_copyinfo copyinfo[2]; 1165 #else 1166 struct iovec copyinfo[2]; 1167 #endif 1168 struct seg_desc ss_desc; 1169 uint64_t cr0, rflags, rsp, stack_gla, val; 1170 int error, size, stackaddrsize, pushop; 1171 1172 val = 0; 1173 size = vie->opsize; 1174 pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 
1 : 0; 1175 1176 /* 1177 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1 1178 */ 1179 if (paging->cpu_mode == CPU_MODE_REAL) { 1180 stackaddrsize = 2; 1181 } else if (paging->cpu_mode == CPU_MODE_64BIT) { 1182 /* 1183 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 1184 * - Stack pointer size is always 64-bits. 1185 * - PUSH/POP of 32-bit values is not possible in 64-bit mode. 1186 * - 16-bit PUSH/POP is supported by using the operand size 1187 * override prefix (66H). 1188 */ 1189 stackaddrsize = 8; 1190 size = vie->opsize_override ? 2 : 8; 1191 } else { 1192 /* 1193 * In protected or compatibility mode the 'B' flag in the 1194 * stack-segment descriptor determines the size of the 1195 * stack pointer. 1196 */ 1197 error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc); 1198 KASSERT(error == 0, ("%s: error %d getting SS descriptor", 1199 __func__, error)); 1200 if (SEG_DESC_DEF32(ss_desc.access)) 1201 stackaddrsize = 4; 1202 else 1203 stackaddrsize = 2; 1204 } 1205 1206 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); 1207 KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); 1208 1209 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1210 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 1211 1212 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); 1213 KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); 1214 if (pushop) { 1215 rsp -= size; 1216 } 1217 1218 if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, 1219 rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ, 1220 &stack_gla)) { 1221 vm_inject_ss(vm, vcpuid, 0); 1222 return (0); 1223 } 1224 1225 if (vie_canonical_check(paging->cpu_mode, stack_gla)) { 1226 vm_inject_ss(vm, vcpuid, 0); 1227 return (0); 1228 } 1229 1230 if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { 1231 vm_inject_ac(vm, vcpuid, 0); 1232 return (0); 1233 } 1234 1235 error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, 1236 pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo)); 1237 if (error == -1) { 1238 /* 1239 * XXX cannot return a negative error value here because it 1240 * ends up being the return value of the VM_RUN() ioctl and 1241 * is interpreted as a pseudo-error (e.g. ERESTART). 1242 */ 1243 return (EFAULT); 1244 } else if (error == 1) { 1245 /* Resume guest execution to handle page fault */ 1246 return (0); 1247 } 1248 1249 if (pushop) { 1250 error = memread(vm, vcpuid, mmio_gpa, &val, size, arg); 1251 if (error == 0) 1252 vm_copyout(vm, vcpuid, &val, copyinfo, size); 1253 } else { 1254 vm_copyin(vm, vcpuid, copyinfo, &val, size); 1255 error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg); 1256 rsp += size; 1257 } 1258 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 1259 1260 if (error == 0) { 1261 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp, 1262 stackaddrsize); 1263 KASSERT(error == 0, ("error %d updating rsp", error)); 1264 } 1265 return (error); 1266 } 1267 1268 static int 1269 emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, 1270 struct vm_guest_paging *paging, mem_region_read_t memread, 1271 mem_region_write_t memwrite, void *arg) 1272 { 1273 int error; 1274 1275 /* 1276 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. 1277 * 1278 * PUSH is part of the group 5 extended opcodes and is identified 1279 * by ModRM:reg = b110.
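 *
 * The other /reg encodings of opcode 0xFF (INC, DEC, CALL, JMP) are
 * not emulated here and are rejected below with EINVAL.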
1280 */ 1281 if ((vie->reg & 7) != 6) 1282 return (EINVAL); 1283 1284 error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, 1285 memwrite, arg); 1286 return (error); 1287 } 1288 1289 static int 1290 emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, 1291 struct vm_guest_paging *paging, mem_region_read_t memread, 1292 mem_region_write_t memwrite, void *arg) 1293 { 1294 int error; 1295 1296 /* 1297 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. 1298 * 1299 * POP is part of the group 1A extended opcodes and is identified 1300 * by ModRM:reg = b000. 1301 */ 1302 if ((vie->reg & 7) != 0) 1303 return (EINVAL); 1304 1305 error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, 1306 memwrite, arg); 1307 return (error); 1308 } 1309 1310 static int 1311 emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 1312 struct vm_guest_paging *paging, mem_region_read_t memread, 1313 mem_region_write_t memwrite, void *memarg) 1314 { 1315 int error; 1316 1317 switch (vie->reg & 7) { 1318 case 0x1: /* OR */ 1319 error = emulate_or(vm, vcpuid, gpa, vie, 1320 memread, memwrite, memarg); 1321 break; 1322 case 0x4: /* AND */ 1323 error = emulate_and(vm, vcpuid, gpa, vie, 1324 memread, memwrite, memarg); 1325 break; 1326 case 0x7: /* CMP */ 1327 error = emulate_cmp(vm, vcpuid, gpa, vie, 1328 memread, memwrite, memarg); 1329 break; 1330 default: 1331 error = EINVAL; 1332 break; 1333 } 1334 1335 return (error); 1336 } 1337 1338 int 1339 vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 1340 struct vm_guest_paging *paging, mem_region_read_t memread, 1341 mem_region_write_t memwrite, void *memarg) 1342 { 1343 int error; 1344 1345 if (!vie->decoded) 1346 return (EINVAL); 1347 1348 switch (vie->op.op_type) { 1349 case VIE_OP_TYPE_GROUP1: 1350 error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread, 1351 memwrite, memarg); 1352 break; 1353 case VIE_OP_TYPE_POP: 1354 error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread, 1355 memwrite, memarg); 1356 break; 1357 case VIE_OP_TYPE_PUSH: 1358 error = emulate_push(vm, vcpuid, gpa, vie, paging, memread, 1359 memwrite, memarg); 1360 break; 1361 case VIE_OP_TYPE_CMP: 1362 error = emulate_cmp(vm, vcpuid, gpa, vie, 1363 memread, memwrite, memarg); 1364 break; 1365 case VIE_OP_TYPE_MOV: 1366 error = emulate_mov(vm, vcpuid, gpa, vie, 1367 memread, memwrite, memarg); 1368 break; 1369 case VIE_OP_TYPE_MOVSX: 1370 case VIE_OP_TYPE_MOVZX: 1371 error = emulate_movx(vm, vcpuid, gpa, vie, 1372 memread, memwrite, memarg); 1373 break; 1374 case VIE_OP_TYPE_MOVS: 1375 error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread, 1376 memwrite, memarg); 1377 break; 1378 case VIE_OP_TYPE_STOS: 1379 error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread, 1380 memwrite, memarg); 1381 break; 1382 case VIE_OP_TYPE_AND: 1383 error = emulate_and(vm, vcpuid, gpa, vie, 1384 memread, memwrite, memarg); 1385 break; 1386 case VIE_OP_TYPE_OR: 1387 error = emulate_or(vm, vcpuid, gpa, vie, 1388 memread, memwrite, memarg); 1389 break; 1390 case VIE_OP_TYPE_SUB: 1391 error = emulate_sub(vm, vcpuid, gpa, vie, 1392 memread, memwrite, memarg); 1393 break; 1394 default: 1395 error = EINVAL; 1396 break; 1397 } 1398 1399 return (error); 1400 } 1401 1402 int 1403 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) 1404 { 1405 KASSERT(size == 1 || size == 2 || size == 4 || size == 8, 1406 ("%s: invalid size %d", __func__, size)); 1407 KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", 
__func__, cpl)); 1408 1409 if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) 1410 return (0); 1411 1412 return ((gla & (size - 1)) ? 1 : 0); 1413 } 1414 1415 int 1416 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) 1417 { 1418 uint64_t mask; 1419 1420 if (cpu_mode != CPU_MODE_64BIT) 1421 return (0); 1422 1423 /* 1424 * The value of bit 47 in the 'gla' should be replicated in the 1425 * most significant 16 bits. 1426 */ 1427 mask = ~((1UL << 48) - 1); 1428 if (gla & (1UL << 47)) 1429 return ((gla & mask) != mask); 1430 else 1431 return ((gla & mask) != 0); 1432 } 1433 1434 uint64_t 1435 vie_size2mask(int size) 1436 { 1437 KASSERT(size == 1 || size == 2 || size == 4 || size == 8, 1438 ("vie_size2mask: invalid size %d", size)); 1439 return (size2mask[size]); 1440 } 1441 1442 int 1443 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, 1444 struct seg_desc *desc, uint64_t offset, int length, int addrsize, 1445 int prot, uint64_t *gla) 1446 { 1447 uint64_t firstoff, low_limit, high_limit, segbase; 1448 int glasize, type; 1449 1450 KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, 1451 ("%s: invalid segment %d", __func__, seg)); 1452 KASSERT(length == 1 || length == 2 || length == 4 || length == 8, 1453 ("%s: invalid operand size %d", __func__, length)); 1454 KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, 1455 ("%s: invalid prot %#x", __func__, prot)); 1456 1457 firstoff = offset; 1458 if (cpu_mode == CPU_MODE_64BIT) { 1459 KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " 1460 "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); 1461 glasize = 8; 1462 } else { 1463 KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " 1464 "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); 1465 glasize = 4; 1466 /* 1467 * If the segment selector is loaded with a NULL selector 1468 * then the descriptor is unusable and attempting to use 1469 * it results in a #GP(0). 1470 */ 1471 if (SEG_DESC_UNUSABLE(desc->access)) 1472 return (-1); 1473 1474 /* 1475 * The processor generates a #NP exception when a segment 1476 * register is loaded with a selector that points to a 1477 * descriptor that is not present. If this was the case then 1478 * it would have been checked before the VM-exit. 1479 */ 1480 KASSERT(SEG_DESC_PRESENT(desc->access), 1481 ("segment %d not present: %#x", seg, desc->access)); 1482 1483 /* 1484 * The descriptor type must indicate a code/data segment. 1485 */ 1486 type = SEG_DESC_TYPE(desc->access); 1487 KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " 1488 "descriptor type %#x", seg, type)); 1489 1490 if (prot & PROT_READ) { 1491 /* #GP on a read access to an exec-only code segment */ 1492 if ((type & 0xA) == 0x8) 1493 return (-1); 1494 } 1495 1496 if (prot & PROT_WRITE) { 1497 /* 1498 * #GP on a write access to a code segment or a 1499 * read-only data segment. 1500 */ 1501 if (type & 0x8) /* code segment */ 1502 return (-1); 1503 1504 if ((type & 0xA) == 0) /* read-only data seg */ 1505 return (-1); 1506 } 1507 1508 /* 1509 * 'desc->limit' is fully expanded taking granularity into 1510 * account. 1511 */ 1512 if ((type & 0xC) == 0x4) { 1513 /* expand-down data segment */ 1514 low_limit = desc->limit + 1; 1515 high_limit = SEG_DESC_DEF32(desc->access) ?
1516 0xffffffff : 0xffff; 1517 } else { 1518 /* code segment or expand-up data segment */ 1519 low_limit = 0; 1520 high_limit = desc->limit; 1521 } 1522 1523 while (length > 0) { 1524 offset &= vie_size2mask(addrsize); 1525 if (offset < low_limit || offset > high_limit) 1526 return (-1); 1527 offset++; 1528 length--; 1529 } 1530 } 1531 1532 /* 1533 * In 64-bit mode all segments except %fs and %gs have a segment 1534 * base address of 0. 1535 */ 1536 if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && 1537 seg != VM_REG_GUEST_GS) { 1538 segbase = 0; 1539 } else { 1540 segbase = desc->base; 1541 } 1542 1543 /* 1544 * Truncate 'firstoff' to the effective address size before adding 1545 * it to the segment base. 1546 */ 1547 firstoff &= vie_size2mask(addrsize); 1548 *gla = (segbase + firstoff) & vie_size2mask(glasize); 1549 return (0); 1550 } 1551 1552 #ifdef _KERNEL 1553 void 1554 vie_init(struct vie *vie, const char *inst_bytes, int inst_length) 1555 { 1556 KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE, 1557 ("%s: invalid instruction length (%d)", __func__, inst_length)); 1558 1559 bzero(vie, sizeof(struct vie)); 1560 1561 vie->base_register = VM_REG_LAST; 1562 vie->index_register = VM_REG_LAST; 1563 vie->segment_register = VM_REG_LAST; 1564 1565 if (inst_length) { 1566 bcopy(inst_bytes, vie->inst, inst_length); 1567 vie->num_valid = inst_length; 1568 } 1569 } 1570 1571 static int 1572 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) 1573 { 1574 int error_code = 0; 1575 1576 if (pte & PG_V) 1577 error_code |= PGEX_P; 1578 if (prot & VM_PROT_WRITE) 1579 error_code |= PGEX_W; 1580 if (usermode) 1581 error_code |= PGEX_U; 1582 if (rsvd) 1583 error_code |= PGEX_RSV; 1584 if (prot & VM_PROT_EXECUTE) 1585 error_code |= PGEX_I; 1586 1587 return (error_code); 1588 } 1589 1590 static void 1591 ptp_release(void **cookie) 1592 { 1593 if (*cookie != NULL) { 1594 vm_gpa_release(*cookie); 1595 *cookie = NULL; 1596 } 1597 } 1598 1599 static void * 1600 ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie) 1601 { 1602 void *ptr; 1603 1604 ptp_release(cookie); 1605 ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie); 1606 return (ptr); 1607 } 1608 1609 int 1610 vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 1611 uint64_t gla, int prot, uint64_t *gpa) 1612 { 1613 int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; 1614 u_int retries; 1615 uint64_t *ptpbase, ptpphys, pte, pgsize; 1616 uint32_t *ptpbase32, pte32; 1617 void *cookie; 1618 1619 usermode = (paging->cpl == 3 ? 1 : 0); 1620 writable = prot & VM_PROT_WRITE; 1621 cookie = NULL; 1622 retval = 0; 1623 retries = 0; 1624 restart: 1625 ptpphys = paging->cr3; /* root of the page tables */ 1626 ptp_release(&cookie); 1627 if (retries++ > 0) 1628 maybe_yield(); 1629 1630 if (vie_canonical_check(paging->cpu_mode, gla)) { 1631 /* 1632 * XXX assuming a non-stack reference otherwise a stack fault 1633 * should be generated. 1634 */ 1635 vm_inject_gp(vm, vcpuid); 1636 goto fault; 1637 } 1638 1639 if (paging->paging_mode == PAGING_MODE_FLAT) { 1640 *gpa = gla; 1641 goto done; 1642 } 1643 1644 if (paging->paging_mode == PAGING_MODE_32) { 1645 nlevels = 2; 1646 while (--nlevels >= 0) { 1647 /* Zero out the lower 12 bits. 
*/ 1648 ptpphys &= ~0xfff; 1649 1650 ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); 1651 1652 if (ptpbase32 == NULL) 1653 goto error; 1654 1655 ptpshift = PAGE_SHIFT + nlevels * 10; 1656 ptpindex = (gla >> ptpshift) & 0x3FF; 1657 pgsize = 1UL << ptpshift; 1658 1659 pte32 = ptpbase32[ptpindex]; 1660 1661 if ((pte32 & PG_V) == 0 || 1662 (usermode && (pte32 & PG_U) == 0) || 1663 (writable && (pte32 & PG_RW) == 0)) { 1664 pfcode = pf_error_code(usermode, prot, 0, 1665 pte32); 1666 vm_inject_pf(vm, vcpuid, pfcode, gla); 1667 goto fault; 1668 } 1669 1670 /* 1671 * Emulate the x86 MMU's management of the accessed 1672 * and dirty flags. While the accessed flag is set 1673 * at every level of the page table, the dirty flag 1674 * is only set at the last level providing the guest 1675 * physical address. 1676 */ 1677 if ((pte32 & PG_A) == 0) { 1678 if (atomic_cmpset_32(&ptpbase32[ptpindex], 1679 pte32, pte32 | PG_A) == 0) { 1680 goto restart; 1681 } 1682 } 1683 1684 /* XXX must be ignored if CR4.PSE=0 */ 1685 if (nlevels > 0 && (pte32 & PG_PS) != 0) 1686 break; 1687 1688 ptpphys = pte32; 1689 } 1690 1691 /* Set the dirty bit in the page table entry if necessary */ 1692 if (writable && (pte32 & PG_M) == 0) { 1693 if (atomic_cmpset_32(&ptpbase32[ptpindex], 1694 pte32, pte32 | PG_M) == 0) { 1695 goto restart; 1696 } 1697 } 1698 1699 /* Zero out the lower 'ptpshift' bits */ 1700 pte32 >>= ptpshift; pte32 <<= ptpshift; 1701 *gpa = pte32 | (gla & (pgsize - 1)); 1702 goto done; 1703 } 1704 1705 if (paging->paging_mode == PAGING_MODE_PAE) { 1706 /* Zero out the lower 5 bits and the upper 32 bits */ 1707 ptpphys &= 0xffffffe0UL; 1708 1709 ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie); 1710 if (ptpbase == NULL) 1711 goto error; 1712 1713 ptpindex = (gla >> 30) & 0x3; 1714 1715 pte = ptpbase[ptpindex]; 1716 1717 if ((pte & PG_V) == 0) { 1718 pfcode = pf_error_code(usermode, prot, 0, pte); 1719 vm_inject_pf(vm, vcpuid, pfcode, gla); 1720 goto fault; 1721 } 1722 1723 ptpphys = pte; 1724 1725 nlevels = 2; 1726 } else 1727 nlevels = 4; 1728 while (--nlevels >= 0) { 1729 /* Zero out the lower 12 bits and the upper 12 bits */ 1730 ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; 1731 1732 ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); 1733 if (ptpbase == NULL) 1734 goto error; 1735 1736 ptpshift = PAGE_SHIFT + nlevels * 9; 1737 ptpindex = (gla >> ptpshift) & 0x1FF; 1738 pgsize = 1UL << ptpshift; 1739 1740 pte = ptpbase[ptpindex]; 1741 1742 if ((pte & PG_V) == 0 || 1743 (usermode && (pte & PG_U) == 0) || 1744 (writable && (pte & PG_RW) == 0)) { 1745 pfcode = pf_error_code(usermode, prot, 0, pte); 1746 vm_inject_pf(vm, vcpuid, pfcode, gla); 1747 goto fault; 1748 } 1749 1750 /* Set the accessed bit in the page table entry */ 1751 if ((pte & PG_A) == 0) { 1752 if (atomic_cmpset_64(&ptpbase[ptpindex], 1753 pte, pte | PG_A) == 0) { 1754 goto restart; 1755 } 1756 } 1757 1758 if (nlevels > 0 && (pte & PG_PS) != 0) { 1759 if (pgsize > 1 * GB) { 1760 pfcode = pf_error_code(usermode, prot, 1, pte); 1761 vm_inject_pf(vm, vcpuid, pfcode, gla); 1762 goto fault; 1763 } 1764 break; 1765 } 1766 1767 ptpphys = pte; 1768 } 1769 1770 /* Set the dirty bit in the page table entry if necessary */ 1771 if (writable && (pte & PG_M) == 0) { 1772 if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) 1773 goto restart; 1774 } 1775 1776 /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ 1777 pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; 1778 *gpa = pte | (gla & (pgsize - 1)); 1779 
done: 1780 ptp_release(&cookie); 1781 return (retval); 1782 error: 1783 retval = -1; 1784 goto done; 1785 fault: 1786 retval = 1; 1787 goto done; 1788 } 1789 1790 int 1791 vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 1792 uint64_t rip, int inst_length, struct vie *vie) 1793 { 1794 struct vm_copyinfo copyinfo[2]; 1795 int error, prot; 1796 1797 if (inst_length > VIE_INST_SIZE) 1798 panic("vmm_fetch_instruction: invalid length %d", inst_length); 1799 1800 prot = PROT_READ | PROT_EXEC; 1801 error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot, 1802 copyinfo, nitems(copyinfo)); 1803 if (error == 0) { 1804 vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length); 1805 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 1806 vie->num_valid = inst_length; 1807 } 1808 return (error); 1809 } 1810 1811 static int 1812 vie_peek(struct vie *vie, uint8_t *x) 1813 { 1814 1815 if (vie->num_processed < vie->num_valid) { 1816 *x = vie->inst[vie->num_processed]; 1817 return (0); 1818 } else 1819 return (-1); 1820 } 1821 1822 static void 1823 vie_advance(struct vie *vie) 1824 { 1825 1826 vie->num_processed++; 1827 } 1828 1829 static bool 1830 segment_override(uint8_t x, int *seg) 1831 { 1832 1833 switch (x) { 1834 case 0x2E: 1835 *seg = VM_REG_GUEST_CS; 1836 break; 1837 case 0x36: 1838 *seg = VM_REG_GUEST_SS; 1839 break; 1840 case 0x3E: 1841 *seg = VM_REG_GUEST_DS; 1842 break; 1843 case 0x26: 1844 *seg = VM_REG_GUEST_ES; 1845 break; 1846 case 0x64: 1847 *seg = VM_REG_GUEST_FS; 1848 break; 1849 case 0x65: 1850 *seg = VM_REG_GUEST_GS; 1851 break; 1852 default: 1853 return (false); 1854 } 1855 return (true); 1856 } 1857 1858 static int 1859 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) 1860 { 1861 uint8_t x; 1862 1863 while (1) { 1864 if (vie_peek(vie, &x)) 1865 return (-1); 1866 1867 if (x == 0x66) 1868 vie->opsize_override = 1; 1869 else if (x == 0x67) 1870 vie->addrsize_override = 1; 1871 else if (x == 0xF3) 1872 vie->repz_present = 1; 1873 else if (x == 0xF2) 1874 vie->repnz_present = 1; 1875 else if (segment_override(x, &vie->segment_register)) 1876 vie->segment_override = 1; 1877 else 1878 break; 1879 1880 vie_advance(vie); 1881 } 1882 1883 /* 1884 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: 1885 * - Only one REX prefix is allowed per instruction. 1886 * - The REX prefix must immediately precede the opcode byte or the 1887 * escape opcode byte. 1888 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) 1889 * the mandatory prefix must come before the REX prefix. 1890 */ 1891 if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { 1892 vie->rex_present = 1; 1893 vie->rex_w = x & 0x8 ? 1 : 0; 1894 vie->rex_r = x & 0x4 ? 1 : 0; 1895 vie->rex_x = x & 0x2 ? 1 : 0; 1896 vie->rex_b = x & 0x1 ? 1 : 0; 1897 vie_advance(vie); 1898 } 1899 1900 /* 1901 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 1902 */ 1903 if (cpu_mode == CPU_MODE_64BIT) { 1904 /* 1905 * Default address size is 64-bits and default operand size 1906 * is 32-bits. 1907 */ 1908 vie->addrsize = vie->addrsize_override ? 4 : 8; 1909 if (vie->rex_w) 1910 vie->opsize = 8; 1911 else if (vie->opsize_override) 1912 vie->opsize = 2; 1913 else 1914 vie->opsize = 4; 1915 } else if (cs_d) { 1916 /* Default address and operand sizes are 32-bits */ 1917 vie->addrsize = vie->addrsize_override ? 2 : 4; 1918 vie->opsize = vie->opsize_override ? 
2 : 4; 1919 } else { 1920 /* Default address and operand sizes are 16-bits */ 1921 vie->addrsize = vie->addrsize_override ? 4 : 2; 1922 vie->opsize = vie->opsize_override ? 4 : 2; 1923 } 1924 return (0); 1925 } 1926 1927 static int 1928 decode_two_byte_opcode(struct vie *vie) 1929 { 1930 uint8_t x; 1931 1932 if (vie_peek(vie, &x)) 1933 return (-1); 1934 1935 vie->op = two_byte_opcodes[x]; 1936 1937 if (vie->op.op_type == VIE_OP_TYPE_NONE) 1938 return (-1); 1939 1940 vie_advance(vie); 1941 return (0); 1942 } 1943 1944 static int 1945 decode_opcode(struct vie *vie) 1946 { 1947 uint8_t x; 1948 1949 if (vie_peek(vie, &x)) 1950 return (-1); 1951 1952 vie->op = one_byte_opcodes[x]; 1953 1954 if (vie->op.op_type == VIE_OP_TYPE_NONE) 1955 return (-1); 1956 1957 vie_advance(vie); 1958 1959 if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) 1960 return (decode_two_byte_opcode(vie)); 1961 1962 return (0); 1963 } 1964 1965 static int 1966 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) 1967 { 1968 uint8_t x; 1969 1970 if (vie->op.op_flags & VIE_OP_F_NO_MODRM) 1971 return (0); 1972 1973 if (cpu_mode == CPU_MODE_REAL) 1974 return (-1); 1975 1976 if (vie_peek(vie, &x)) 1977 return (-1); 1978 1979 vie->mod = (x >> 6) & 0x3; 1980 vie->rm = (x >> 0) & 0x7; 1981 vie->reg = (x >> 3) & 0x7; 1982 1983 /* 1984 * A direct addressing mode makes no sense in the context of an EPT 1985 * fault. There has to be a memory access involved to cause the 1986 * EPT fault. 1987 */ 1988 if (vie->mod == VIE_MOD_DIRECT) 1989 return (-1); 1990 1991 if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || 1992 (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { 1993 /* 1994 * Table 2-5: Special Cases of REX Encodings 1995 * 1996 * mod=0, r/m=5 is used in the compatibility mode to 1997 * indicate a disp32 without a base register. 1998 * 1999 * mod!=3, r/m=4 is used in the compatibility mode to 2000 * indicate that the SIB byte is present. 2001 * 2002 * The 'b' bit in the REX prefix is don't care in 2003 * this case. 2004 */ 2005 } else { 2006 vie->rm |= (vie->rex_b << 3); 2007 } 2008 2009 vie->reg |= (vie->rex_r << 3); 2010 2011 /* SIB */ 2012 if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) 2013 goto done; 2014 2015 vie->base_register = gpr_map[vie->rm]; 2016 2017 switch (vie->mod) { 2018 case VIE_MOD_INDIRECT_DISP8: 2019 vie->disp_bytes = 1; 2020 break; 2021 case VIE_MOD_INDIRECT_DISP32: 2022 vie->disp_bytes = 4; 2023 break; 2024 case VIE_MOD_INDIRECT: 2025 if (vie->rm == VIE_RM_DISP32) { 2026 vie->disp_bytes = 4; 2027 /* 2028 * Table 2-7. RIP-Relative Addressing 2029 * 2030 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32 2031 * whereas in compatibility mode it just implies disp32. 
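 *
 * The RIP value used is that of the next instruction; verify_gla()
 * accounts for this by adding 'vie->num_valid' to the base register
 * value.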
2032 */ 2033 2034 if (cpu_mode == CPU_MODE_64BIT) 2035 vie->base_register = VM_REG_GUEST_RIP; 2036 else 2037 vie->base_register = VM_REG_LAST; 2038 } 2039 break; 2040 } 2041 2042 done: 2043 vie_advance(vie); 2044 2045 return (0); 2046 } 2047 2048 static int 2049 decode_sib(struct vie *vie) 2050 { 2051 uint8_t x; 2052 2053 /* Proceed only if SIB byte is present */ 2054 if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) 2055 return (0); 2056 2057 if (vie_peek(vie, &x)) 2058 return (-1); 2059 2060 /* De-construct the SIB byte */ 2061 vie->ss = (x >> 6) & 0x3; 2062 vie->index = (x >> 3) & 0x7; 2063 vie->base = (x >> 0) & 0x7; 2064 2065 /* Apply the REX prefix modifiers */ 2066 vie->index |= vie->rex_x << 3; 2067 vie->base |= vie->rex_b << 3; 2068 2069 switch (vie->mod) { 2070 case VIE_MOD_INDIRECT_DISP8: 2071 vie->disp_bytes = 1; 2072 break; 2073 case VIE_MOD_INDIRECT_DISP32: 2074 vie->disp_bytes = 4; 2075 break; 2076 } 2077 2078 if (vie->mod == VIE_MOD_INDIRECT && 2079 (vie->base == 5 || vie->base == 13)) { 2080 /* 2081 * Special case when base register is unused if mod = 0 2082 * and base = %rbp or %r13. 2083 * 2084 * Documented in: 2085 * Table 2-3: 32-bit Addressing Forms with the SIB Byte 2086 * Table 2-5: Special Cases of REX Encodings 2087 */ 2088 vie->disp_bytes = 4; 2089 } else { 2090 vie->base_register = gpr_map[vie->base]; 2091 } 2092 2093 /* 2094 * All encodings of 'index' are valid except for %rsp (4). 2095 * 2096 * Documented in: 2097 * Table 2-3: 32-bit Addressing Forms with the SIB Byte 2098 * Table 2-5: Special Cases of REX Encodings 2099 */ 2100 if (vie->index != 4) 2101 vie->index_register = gpr_map[vie->index]; 2102 2103 /* 'scale' makes sense only in the context of an index register */ 2104 if (vie->index_register < VM_REG_LAST) 2105 vie->scale = 1 << vie->ss; 2106 2107 vie_advance(vie); 2108 2109 return (0); 2110 } 2111 2112 static int 2113 decode_displacement(struct vie *vie) 2114 { 2115 int n, i; 2116 uint8_t x; 2117 2118 union { 2119 char buf[4]; 2120 int8_t signed8; 2121 int32_t signed32; 2122 } u; 2123 2124 if ((n = vie->disp_bytes) == 0) 2125 return (0); 2126 2127 if (n != 1 && n != 4) 2128 panic("decode_displacement: invalid disp_bytes %d", n); 2129 2130 for (i = 0; i < n; i++) { 2131 if (vie_peek(vie, &x)) 2132 return (-1); 2133 2134 u.buf[i] = x; 2135 vie_advance(vie); 2136 } 2137 2138 if (n == 1) 2139 vie->displacement = u.signed8; /* sign-extended */ 2140 else 2141 vie->displacement = u.signed32; /* sign-extended */ 2142 2143 return (0); 2144 } 2145 2146 static int 2147 decode_immediate(struct vie *vie) 2148 { 2149 int i, n; 2150 uint8_t x; 2151 union { 2152 char buf[4]; 2153 int8_t signed8; 2154 int16_t signed16; 2155 int32_t signed32; 2156 } u; 2157 2158 /* Figure out immediate operand size (if any) */ 2159 if (vie->op.op_flags & VIE_OP_F_IMM) { 2160 /* 2161 * Section 2.2.1.5 "Immediates", Intel SDM: 2162 * In 64-bit mode the typical size of immediate operands 2163 * remains 32-bits. When the operand size is 64-bits, the 2164 * processor sign-extends all immediates to 64-bits prior 2165 * to their use.
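 *
 * Hence at most 4 immediate bytes are fetched here even for a 64-bit
 * operand size; the sign-extension to 'vie->immediate' is done below.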
2166 */ 2167 if (vie->opsize == 4 || vie->opsize == 8) 2168 vie->imm_bytes = 4; 2169 else 2170 vie->imm_bytes = 2; 2171 } else if (vie->op.op_flags & VIE_OP_F_IMM8) { 2172 vie->imm_bytes = 1; 2173 } 2174 2175 if ((n = vie->imm_bytes) == 0) 2176 return (0); 2177 2178 KASSERT(n == 1 || n == 2 || n == 4, 2179 ("%s: invalid number of immediate bytes: %d", __func__, n)); 2180 2181 for (i = 0; i < n; i++) { 2182 if (vie_peek(vie, &x)) 2183 return (-1); 2184 2185 u.buf[i] = x; 2186 vie_advance(vie); 2187 } 2188 2189 /* sign-extend the immediate value before use */ 2190 if (n == 1) 2191 vie->immediate = u.signed8; 2192 else if (n == 2) 2193 vie->immediate = u.signed16; 2194 else 2195 vie->immediate = u.signed32; 2196 2197 return (0); 2198 } 2199 2200 static int 2201 decode_moffset(struct vie *vie) 2202 { 2203 int i, n; 2204 uint8_t x; 2205 union { 2206 char buf[8]; 2207 uint64_t u64; 2208 } u; 2209 2210 if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) 2211 return (0); 2212 2213 /* 2214 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: 2215 * The memory offset size follows the address-size of the instruction. 2216 */ 2217 n = vie->addrsize; 2218 KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); 2219 2220 u.u64 = 0; 2221 for (i = 0; i < n; i++) { 2222 if (vie_peek(vie, &x)) 2223 return (-1); 2224 2225 u.buf[i] = x; 2226 vie_advance(vie); 2227 } 2228 vie->displacement = u.u64; 2229 return (0); 2230 } 2231 2232 /* 2233 * Verify that all the bytes in the instruction buffer were consumed. 2234 */ 2235 static int 2236 verify_inst_length(struct vie *vie) 2237 { 2238 2239 if (vie->num_processed) 2240 return (0); 2241 else 2242 return (-1); 2243 } 2244 2245 /* 2246 * Verify that the 'guest linear address' provided as collateral of the nested 2247 * page table fault matches with our instruction decoding. 
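 *
 * This recomputes base + scale * index + displacement from the decoded
 * fields and compares it with the 'gla' reported by the hardware.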
2248 */ 2249 static int 2250 verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) 2251 { 2252 int error; 2253 uint64_t base, idx, gla2; 2254 2255 /* Skip 'gla' verification */ 2256 if (gla == VIE_INVALID_GLA) 2257 return (0); 2258 2259 base = 0; 2260 if (vie->base_register != VM_REG_LAST) { 2261 error = vm_get_register(vm, cpuid, vie->base_register, &base); 2262 if (error) { 2263 printf("verify_gla: error %d getting base reg %d\n", 2264 error, vie->base_register); 2265 return (-1); 2266 } 2267 2268 /* 2269 * RIP-relative addressing starts from the following 2270 * instruction 2271 */ 2272 if (vie->base_register == VM_REG_GUEST_RIP) 2273 base += vie->num_valid; 2274 } 2275 2276 idx = 0; 2277 if (vie->index_register != VM_REG_LAST) { 2278 error = vm_get_register(vm, cpuid, vie->index_register, &idx); 2279 if (error) { 2280 printf("verify_gla: error %d getting index reg %d\n", 2281 error, vie->index_register); 2282 return (-1); 2283 } 2284 } 2285 2286 /* XXX assuming that the base address of the segment is 0 */ 2287 gla2 = base + vie->scale * idx + vie->displacement; 2288 gla2 &= size2mask[vie->addrsize]; 2289 if (gla != gla2) { 2290 printf("verify_gla mismatch: " 2291 "base(0x%0lx), scale(%d), index(0x%0lx), " 2292 "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", 2293 base, vie->scale, idx, vie->displacement, gla, gla2); 2294 return (-1); 2295 } 2296 2297 return (0); 2298 } 2299 2300 int 2301 vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, 2302 enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) 2303 { 2304 2305 if (decode_prefixes(vie, cpu_mode, cs_d)) 2306 return (-1); 2307 2308 if (decode_opcode(vie)) 2309 return (-1); 2310 2311 if (decode_modrm(vie, cpu_mode)) 2312 return (-1); 2313 2314 if (decode_sib(vie)) 2315 return (-1); 2316 2317 if (decode_displacement(vie)) 2318 return (-1); 2319 2320 if (decode_immediate(vie)) 2321 return (-1); 2322 2323 if (decode_moffset(vie)) 2324 return (-1); 2325 2326 if (verify_inst_length(vie)) 2327 return (-1); 2328 2329 if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) { 2330 if (verify_gla(vm, cpuid, gla, vie)) 2331 return (-1); 2332 } 2333 2334 vie->decoded = 1; /* success */ 2335 2336 return (0); 2337 } 2338 #endif /* _KERNEL */ 2339
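/*
 * Illustrative sketch: the entry points above are typically combined by
 * a caller roughly as follows when a nested page table fault requires
 * MMIO emulation (the actual call sites live outside this file, so the
 * exact sequence may differ):
 *
 *	vmm_fetch_instruction(vm, vcpuid, &paging, rip, inst_length, &vie);
 *	vmm_decode_instruction(vm, vcpuid, gla, paging.cpu_mode, cs_d, &vie);
 *	vmm_emulate_instruction(vm, vcpuid, gpa, &vie, &paging,
 *	    memread, memwrite, memarg);
 *
 * where 'memread' and 'memwrite' are caller-supplied callbacks that
 * access the registers of the emulated device.
 */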