/*-
 * Copyright (c) 2012 Sandvine, Inc.
 * Copyright (c) 2012 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL
#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#else	/* !_KERNEL */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/_iovec.h>

#include <machine/vmm.h>

#include <assert.h>
#include <vmmapi.h>
#define	KASSERT(exp,msg)	assert((exp))
#endif	/* _KERNEL */

#include <machine/vmm_instruction_emul.h>
#include <x86/psl.h>
#include <x86/specialreg.h>

/* struct vie_op.op_type */
enum {
	VIE_OP_TYPE_NONE = 0,
	VIE_OP_TYPE_MOV,
	VIE_OP_TYPE_MOVSX,
	VIE_OP_TYPE_MOVZX,
	VIE_OP_TYPE_AND,
	VIE_OP_TYPE_OR,
	VIE_OP_TYPE_SUB,
	VIE_OP_TYPE_TWO_BYTE,
	VIE_OP_TYPE_PUSH,
	VIE_OP_TYPE_CMP,
	VIE_OP_TYPE_POP,
	VIE_OP_TYPE_MOVS,
	VIE_OP_TYPE_GROUP1,
	VIE_OP_TYPE_LAST
};

/* struct vie_op.op_flags */
#define	VIE_OP_F_IMM		(1 << 0)	/* 16/32-bit immediate operand */
#define	VIE_OP_F_IMM8		(1 << 1)	/* 8-bit immediate operand */
#define	VIE_OP_F_MOFFSET	(1 << 2)	/* 16/32/64-bit immediate moffset */
#define	VIE_OP_F_NO_MODRM	(1 << 3)
#define	VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)

static const struct vie_op two_byte_opcodes[256] = {
	[0xB6] = {
		.op_byte = 0xB6,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xB7] = {
		.op_byte = 0xB7,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xBE] = {
		.op_byte = 0xBE,
		.op_type = VIE_OP_TYPE_MOVSX,
	},
};

static const struct vie_op one_byte_opcodes[256] = {
	[0x0F] = {
		.op_byte = 0x0F,
		.op_type = VIE_OP_TYPE_TWO_BYTE
	},
	[0x2B] = {
		.op_byte = 0x2B,
		.op_type = VIE_OP_TYPE_SUB,
	},
	[0x3B] = {
		.op_byte = 0x3B,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x88] = {
		.op_byte = 0x88,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x89] = {
		.op_byte = 0x89,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8A] = {
		.op_byte = 0x8A,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8B] = {
		.op_byte = 0x8B,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0xA1] = {
		.op_byte = 0xA1,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA3] = {
		.op_byte = 0xA3,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA4] = {
		.op_byte = 0xA4,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xA5] = {
		.op_byte = 0xA5,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xC6] = {
		/* XXX Group 11 extended opcode - not just MOV */
		.op_byte = 0xC6,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xC7] = {
		.op_byte = 0xC7,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x23] = {
		.op_byte = 0x23,
		.op_type = VIE_OP_TYPE_AND,
	},
	[0x81] = {
		/* XXX Group 1 extended opcode */
		.op_byte = 0x81,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x83] = {
		/* XXX Group 1 extended opcode */
		.op_byte = 0x83,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x8F] = {
		/* XXX Group 1A extended opcode - not just POP */
		.op_byte = 0x8F,
		.op_type = VIE_OP_TYPE_POP,
	},
	[0xFF] = {
		/* XXX Group 5 extended opcode - not just PUSH */
		.op_byte = 0xFF,
		.op_type = VIE_OP_TYPE_PUSH,
	}
};

/* struct vie.mod */
#define	VIE_MOD_INDIRECT	0
#define	VIE_MOD_INDIRECT_DISP8	1
#define	VIE_MOD_INDIRECT_DISP32	2
#define	VIE_MOD_DIRECT		3

/* struct vie.rm */
#define	VIE_RM_SIB		4
#define	VIE_RM_DISP32		5

#define	GB			(1024 * 1024 * 1024)

static enum vm_reg_name gpr_map[16] = {
	VM_REG_GUEST_RAX,
	VM_REG_GUEST_RCX,
	VM_REG_GUEST_RDX,
	VM_REG_GUEST_RBX,
	VM_REG_GUEST_RSP,
	VM_REG_GUEST_RBP,
	VM_REG_GUEST_RSI,
	VM_REG_GUEST_RDI,
	VM_REG_GUEST_R8,
	VM_REG_GUEST_R9,
	VM_REG_GUEST_R10,
	VM_REG_GUEST_R11,
	VM_REG_GUEST_R12,
	VM_REG_GUEST_R13,
	VM_REG_GUEST_R14,
	VM_REG_GUEST_R15
};

static uint64_t size2mask[] = {
	[1] = 0xff,
	[2] = 0xffff,
	[4] = 0xffffffff,
	[8] = 0xffffffffffffffff,
};

static int
vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
{
	int error;

	error = vm_get_register(vm, vcpuid, reg, rval);

	return (error);
}

static void
vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
{
	*lhbr = 0;
	*reg = gpr_map[vie->reg];

	/*
	 * 64-bit mode imposes limitations on accessing legacy high byte
	 * registers (lhbr).
	 *
	 * The legacy high-byte registers cannot be addressed if the REX
	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
	 *
	 * If the REX prefix is not present then the values 4, 5, 6 and 7
	 * of the 'ModRM:reg' field address the legacy high-byte registers,
	 * %ah, %ch, %dh and %bh respectively.
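	 *
	 * For example, a 'ModRM:reg' value of 5 selects %ch (bits 15:8 of
	 * %rcx, i.e. gpr_map[5 & 0x3]) when no REX prefix is present, but
	 * selects %bpl when any REX prefix is present.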
	 */
	if (!vie->rex_present) {
		if (vie->reg & 0x4) {
			*lhbr = 1;
			*reg = gpr_map[vie->reg & 0x3];
		}
	}
}

static int
vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
{
	uint64_t val;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &val);

	/*
	 * To obtain the value of a legacy high byte register shift the
	 * base register right by 8 bits (%ah = %rax >> 8).
	 */
	if (lhbr)
		*rval = val >> 8;
	else
		*rval = val;
	return (error);
}

static int
vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
{
	uint64_t origval, val, mask;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &origval);
	if (error == 0) {
		val = byte;
		mask = 0xff;
		if (lhbr) {
			/*
			 * Shift left by 8 to store 'byte' in a legacy high
			 * byte register.
			 */
			val <<= 8;
			mask <<= 8;
		}
		val |= origval & ~mask;
		error = vm_set_register(vm, vcpuid, reg, val);
	}
	return (error);
}

int
vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
    uint64_t val, int size)
{
	int error;
	uint64_t origval;

	switch (size) {
	case 1:
	case 2:
		error = vie_read_register(vm, vcpuid, reg, &origval);
		if (error)
			return (error);
		val &= size2mask[size];
		val |= origval & ~size2mask[size];
		break;
	case 4:
		val &= 0xffffffffUL;
		break;
	case 8:
		break;
	default:
		return (EINVAL);
	}

	error = vm_set_register(vm, vcpuid, reg, val);
	return (error);
}

#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)

/*
 * Return the status flags that would result from doing (x - y).
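 *
 * The GETCC(sz) macro below instantiates a helper, getcc<sz>(), that
 * performs the subtraction on <sz>-bit operands in inline assembly and
 * captures the resulting %rflags with pushfq/popq.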
 */
#define	GETCC(sz)							\
static u_long								\
getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETCC(8);
GETCC(16);
GETCC(32);
GETCC(64);

static u_long
getcc(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getcc: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getcc8(x, y));
	else if (opsize == 2)
		return (getcc16(x, y));
	else if (opsize == 4)
		return (getcc32(x, y));
	else
		return (getcc64(x, y));
}

static int
emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint8_t byte;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x88:
		/*
		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 88/r:	mov r/m8, r8
		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
		 */
		size = 1;	/* override for byte operation */
		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
		if (error == 0)
			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
		break;
	case 0x89:
		/*
		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 89/r:	mov r/m16, r16
		 * 89/r:	mov r/m32, r32
		 * REX.W + 89/r	mov r/m64, r64
		 */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vm, vcpuid, gpa, val, size, arg);
		}
		break;
	case 0x8A:
		/*
		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8A/r:	mov r8, r/m8
		 * REX + 8A/r:	mov r8, r/m8
		 */
		size = 1;	/* override for byte operation */
		error = memread(vm, vcpuid, gpa, &val, size, arg);
		if (error == 0)
			error = vie_write_bytereg(vm, vcpuid, vie, val);
		break;
	case 0x8B:
		/*
		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8B/r:	mov r16, r/m16
		 * 8B/r:	mov r32, r/m32
		 * REX.W 8B/r:	mov r64, r/m64
		 */
		error = memread(vm, vcpuid, gpa, &val, size, arg);
		if (error == 0) {
			reg = gpr_map[vie->reg];
			error = vie_update_register(vm, vcpuid, reg, val, size);
		}
		break;
	case 0xA1:
		/*
		 * MOV from seg:moffset to AX/EAX/RAX
		 * A1:		mov AX, moffs16
		 * A1:		mov EAX, moffs32
		 * REX.W + A1:	mov RAX, moffs64
		 */
		error = memread(vm, vcpuid, gpa, &val, size, arg);
		if (error == 0) {
			reg = VM_REG_GUEST_RAX;
			error = vie_update_register(vm, vcpuid, reg, val, size);
		}
		break;
	case 0xA3:
		/*
		 * MOV from AX/EAX/RAX to seg:moffset
		 * A3:		mov moffs16, AX
		 * A3:		mov moffs32, EAX
		 * REX.W + A3:	mov moffs64, RAX
		 */
		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vm, vcpuid, gpa, val, size, arg);
		}
		break;
	case 0xC6:
		/*
		 * MOV from imm8 to mem (ModRM:r/m)
		 * C6/0		mov r/m8, imm8
		 * REX + C6/0	mov r/m8, imm8
		 */
		size = 1;	/* override for byte operation */
		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
		break;
	case 0xC7:
		/*
		 * MOV from imm16/imm32 to mem (ModRM:r/m)
		 * C7/0		mov r/m16, imm16
		 * C7/0		mov r/m32, imm32
		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
		 */
		val = vie->immediate & size2mask[size];
		error = memwrite(vm, vcpuid, gpa, val, size, arg);
		break;
	default:
		break;
	}

	return (error);
}

static int
emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite,
    void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xB6:
		/*
		 * MOV and zero extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B6/r		movzx r16, r/m8
		 * 0F B6/r		movzx r32, r/m8
		 * REX.W + 0F B6/r	movzx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* zero-extend byte */
		val = (uint8_t)val;

		/* write the result */
		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	case 0xB7:
		/*
		 * MOV and zero extend word from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B7/r		movzx r32, r/m16
		 * REX.W + 0F B7/r	movzx r64, r/m16
		 */
		error = memread(vm, vcpuid, gpa, &val, 2, arg);
		if (error)
			return (error);

		reg = gpr_map[vie->reg];

		/* zero-extend word */
		val = (uint16_t)val;

		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	case 0xBE:
		/*
		 * MOV and sign extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F BE/r		movsx r16, r/m8
		 * 0F BE/r		movsx r32, r/m8
		 * REX.W + 0F BE/r	movsx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* sign extend byte */
		val = (int8_t)val;

		/* write the result */
		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	default:
		break;
	}
	return (error);
}

/*
 * Helper function to calculate and validate a linear address.
 *
 * Returns 0 on success and 1 if an exception was injected into the guest.
 */
static int
get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
    int opsize, int addrsize, int prot, enum vm_reg_name seg,
    enum vm_reg_name gpr, uint64_t *gla)
{
	struct seg_desc desc;
	uint64_t cr0, val, rflags;
	int error;

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
	    __func__, error, seg));

	error = vie_read_register(vm, vcpuid, gpr, &val);
	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
	    error, gpr));

	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
	    addrsize, prot, gla)) {
		if (seg == VM_REG_GUEST_SS)
			vm_inject_ss(vm, vcpuid, 0);
		else
			vm_inject_gp(vm, vcpuid);
		return (1);
	}

	if (vie_canonical_check(paging->cpu_mode, *gla)) {
		if (seg == VM_REG_GUEST_SS)
			vm_inject_ss(vm, vcpuid, 0);
		else
			vm_inject_gp(vm, vcpuid);
		return (1);
	}

	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
		vm_inject_ac(vm, vcpuid, 0);
		return (1);
	}

	return (0);
}

static int
emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
#ifdef _KERNEL
	struct vm_copyinfo copyinfo[2];
#else
	struct iovec copyinfo[2];
#endif
	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
	uint64_t rcx, rdi, rsi, rflags;
	int error, opsize, seg, repeat;

	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
	val = 0;
	error = 0;

	/*
	 * XXX although the MOVS instruction is only supposed to be used with
	 * the "rep" prefix, some guests like FreeBSD will use "repnz" instead.
	 *
	 * Empirically the "repnz" prefix has identical behavior to "rep"
	 * and the zero flag does not make a difference.
	 */
	repeat = vie->repz_present | vie->repnz_present;

	if (repeat) {
		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));

		/*
		 * The count register is %rcx, %ecx or %cx depending on the
		 * address size of the instruction.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
			return (0);
	}

	/*
	 *	Source		Destination	Comments
	 *	--------------------------------------------
	 * (1)	memory		memory		n/a
	 * (2)	memory		mmio		emulated
	 * (3)	mmio		memory		emulated
	 * (4)	mmio		mmio		emulated
	 *
	 * At this point we don't have sufficient information to distinguish
	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
	 * out because it will succeed only when operating on regular memory.
	 *
	 * XXX the emulation doesn't properly handle the case where 'gpa'
	 * is straddling the boundary between the normal memory and MMIO.
	 */

	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
	error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr);
	if (error)
		goto done;

	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
	    copyinfo, nitems(copyinfo));
	if (error == 0) {
		/*
		 * case (2): read from system memory and write to mmio.
		 */
		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
		error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
		if (error)
			goto done;
	} else if (error > 0) {
		/*
		 * Resume guest execution to handle fault.
		 */
		goto done;
	} else {
		/*
		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
		 * if 'srcaddr' is in the mmio space.
		 */

		error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr);
		if (error)
			goto done;

		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
		    PROT_WRITE, copyinfo, nitems(copyinfo));
		if (error == 0) {
			/*
			 * case (3): read from MMIO and write to system memory.
			 *
			 * A MMIO read can have side-effects so we
			 * commit to it only after vm_copy_setup() is
			 * successful. If a page-fault needs to be
			 * injected into the guest then it will happen
			 * before the MMIO read is attempted.
			 */
			error = memread(vm, vcpuid, gpa, &val, opsize, arg);
			if (error)
				goto done;

			vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
			vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
		} else if (error > 0) {
			/*
			 * Resume guest execution to handle fault.
			 */
			goto done;
		} else {
			/*
			 * Case (4): read from and write to mmio.
			 */
			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
			    PROT_READ, &srcgpa);
			if (error)
				goto done;
			error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
			if (error)
				goto done;

			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
			    PROT_WRITE, &dstgpa);
			if (error)
				goto done;
			error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg);
			if (error)
				goto done;
		}
	}

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	if (rflags & PSL_D) {
		rsi -= opsize;
		rdi -= opsize;
	} else {
		rsi += opsize;
		rdi += opsize;
	}

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));

	if (repeat) {
		rcx = rcx - 1;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
		    rcx, vie->addrsize);
		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));

		/*
		 * Repeat the instruction if the count register is not zero.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
			vm_restart_instruction(vm, vcpuid);
	}
done:
	if (error < 0)
		return (EFAULT);
	else
		return (0);
}

static int
emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t result, rflags, rflags2, val1, val2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x23:
		/*
		 * AND reg (ModRM:reg) with mem (ModRM:r/m) and store the
		 * result in reg.
		 *
		 * 23/r		and r16, r/m16
		 * 23/r		and r32, r/m32
		 * REX.W + 23/r	and r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vm, vcpuid, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		result = val1 & val2;
		error = vie_update_register(vm, vcpuid, reg, result, size);
		break;
	case 0x81:
	case 0x83:
		/*
		 * AND mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 81 /4		and r/m16, imm16
		 * 81 /4		and r/m32, imm32
		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
		 *
		 * 83 /4		and r/m16, imm8 sign-extended to 16
		 * 83 /4		and r/m32, imm8 sign-extended to 32
		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val1, size, arg);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		result = val1 & vie->immediate;
		error = memwrite(vm, vcpuid, gpa, result, size, arg);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from 'result'.
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	uint64_t val1, result, rflags, rflags2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x81:
	case 0x83:
		/*
		 * OR mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 81 /1		or r/m16, imm16
		 * 81 /1		or r/m32, imm32
		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
		 *
		 * 83 /1		or r/m16, imm8 sign-extended to 16
		 * 83 /1		or r/m32, imm8 sign-extended to 32
		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val1, size, arg);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		result = val1 | vie->immediate;
		error = memwrite(vm, vcpuid, gpa, result, size, arg);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from 'result'.
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	uint64_t op1, op2, rflags, rflags2;
	enum vm_reg_name reg;

	size = vie->opsize;
	switch (vie->op.op_byte) {
	case 0x3B:
		/*
		 * 3B/r		CMP r16, r/m16
		 * 3B/r		CMP r32, r/m32
		 * REX.W + 3B/r	CMP r64, r/m64
		 *
		 * Compare first operand (reg) with second operand (r/m) and
		 * set status flags in EFLAGS register. The comparison is
		 * performed by subtracting the second operand from the first
		 * operand and then setting the status flags.
		 */

		/* Get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &op1);
		if (error)
			return (error);

		/* Get the second operand */
		error = memread(vm, vcpuid, gpa, &op2, size, arg);
		if (error)
			return (error);

		rflags2 = getcc(size, op1, op2);
		break;
	case 0x81:
	case 0x83:
		/*
		 * 81 /7		cmp r/m16, imm16
		 * 81 /7		cmp r/m32, imm32
		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
		 *
		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
		 *
		 * Compare mem (ModRM:r/m) with immediate and set
		 * status flags according to the results. The
		 * comparison is performed by subtracting the
		 * immediate from the first operand and then setting
		 * the status flags.
		 *
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &op1, size, arg);
		if (error)
			return (error);

		rflags2 = getcc(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & RFLAGS_STATUS_BITS;

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	uint64_t nval, rflags, rflags2, val1, val2;
	enum vm_reg_name reg;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x2B:
		/*
		 * SUB r/m from r and store the result in r
		 *
		 * 2B/r		SUB r16, r/m16
		 * 2B/r		SUB r32, r/m32
		 * REX.W + 2B/r	SUB r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vm, vcpuid, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		nval = val1 - val2;
		error = vie_update_register(vm, vcpuid, reg, nval, size);
		break;
	default:
		break;
	}

	if (!error) {
		rflags2 = getcc(size, val1, val2);
		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    &rflags);
		if (error)
			return (error);

		rflags &= ~RFLAGS_STATUS_BITS;
		rflags |= rflags2 & RFLAGS_STATUS_BITS;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    rflags, 8);
	}

	return (error);
}

static int
emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
#ifdef _KERNEL
	struct vm_copyinfo copyinfo[2];
#else
	struct iovec copyinfo[2];
#endif
	struct seg_desc ss_desc;
	uint64_t cr0, rflags, rsp, stack_gla, val;
	int error, size, stackaddrsize, pushop;

	val = 0;
	size = vie->opsize;
	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;

	/*
	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
	 */
	if (paging->cpu_mode == CPU_MODE_REAL) {
		stackaddrsize = 2;
	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
		/*
		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
		 * - Stack pointer size is always 64-bits.
		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
		 * - 16-bit PUSH/POP is supported by using the operand size
		 *   override prefix (66H).
		 */
		stackaddrsize = 8;
		size = vie->opsize_override ? 2 : 8;
	} else {
		/*
		 * In protected or compatibility mode the 'B' flag in the
		 * stack-segment descriptor determines the size of the
		 * stack pointer.
		 */
		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
		    __func__, error));
		if (SEG_DESC_DEF32(ss_desc.access))
			stackaddrsize = 4;
		else
			stackaddrsize = 2;
	}

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
	if (pushop) {
		rsp -= size;
	}

	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
	    &stack_gla)) {
		vm_inject_ss(vm, vcpuid, 0);
		return (0);
	}

	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
		vm_inject_ss(vm, vcpuid, 0);
		return (0);
	}

	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
		vm_inject_ac(vm, vcpuid, 0);
		return (0);
	}

	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo));
	if (error == -1) {
		/*
		 * XXX cannot return a negative error value here because it
		 * ends up being the return value of the VM_RUN() ioctl and
		 * is interpreted as a pseudo-error (e.g. ERESTART).
		 */
		return (EFAULT);
	} else if (error == 1) {
		/* Resume guest execution to handle page fault */
		return (0);
	}

	if (pushop) {
		error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
		if (error == 0)
			vm_copyout(vm, vcpuid, &val, copyinfo, size);
	} else {
		vm_copyin(vm, vcpuid, copyinfo, &val, size);
		error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
		rsp += size;
	}
	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));

	if (error == 0) {
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
		    stackaddrsize);
		KASSERT(error == 0, ("error %d updating rsp", error));
	}
	return (error);
}

static int
emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
	int error;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * PUSH is part of the group 5 extended opcodes and is identified
	 * by ModRM:reg = b110.
	 */
	if ((vie->reg & 7) != 6)
		return (EINVAL);

	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
	    memwrite, arg);
	return (error);
}

static int
emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
	int error;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * POP is part of the group 1A extended opcodes and is identified
	 * by ModRM:reg = b000.
	 */
	if ((vie->reg & 7) != 0)
		return (EINVAL);

	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
	    memwrite, arg);
	return (error);
}

static int
emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *memarg)
{
	int error;

	switch (vie->reg & 7) {
	case 0x1:	/* OR */
		error = emulate_or(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case 0x4:	/* AND */
		error = emulate_and(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case 0x7:	/* CMP */
		error = emulate_cmp(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *memarg)
{
	int error;

	if (!vie->decoded)
		return (EINVAL);

	switch (vie->op.op_type) {
	case VIE_OP_TYPE_GROUP1:
		error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_POP:
		error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_PUSH:
		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_CMP:
		error = emulate_cmp(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOV:
		error = emulate_mov(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOVSX:
	case VIE_OP_TYPE_MOVZX:
		error = emulate_movx(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOVS:
		error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_AND:
		error = emulate_and(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_OR:
		error = emulate_or(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_SUB:
		error = emulate_sub(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("%s: invalid size %d", __func__, size));
	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));

	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
		return (0);

	return ((gla & (size - 1)) ? 1 : 0);
}

int
vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
{
	uint64_t mask;

	if (cpu_mode != CPU_MODE_64BIT)
		return (0);

	/*
	 * The value of bit 47 in the 'gla' should be replicated in the
	 * most significant 16 bits.
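	 *
	 * For example, 0xffff800000000000 is canonical (bit 47 and bits
	 * 63:48 are all 1s) whereas 0x0000800000000000 is not (bit 47 is
	 * 1 but bits 63:48 are 0s).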
	 */
	mask = ~((1UL << 48) - 1);
	if (gla & (1UL << 47))
		return ((gla & mask) != mask);
	else
		return ((gla & mask) != 0);
}

uint64_t
vie_size2mask(int size)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("vie_size2mask: invalid size %d", size));
	return (size2mask[size]);
}

int
vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
	    ("%s: invalid segment %d", __func__, seg));
	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
	    ("%s: invalid operand size %d", __func__, length));
	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
	    ("%s: invalid prot %#x", __func__, prot));

	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
		glasize = 8;
	} else {
		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
		glasize = 4;
		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		KASSERT(SEG_DESC_PRESENT(desc->access),
		    ("segment %d not present: %#x", seg, desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
		    "descriptor type %#x", seg, type));

		if (prot & PROT_READ) {
			/* #GP on a read access to an exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)			/* code segment */
				return (-1);

			if ((type & 0xA) == 0)		/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

		while (length > 0) {
			offset &= vie_size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}

	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
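	 * The %fs and %gs bases (programmed by the guest via the FS.base
	 * and GS.base MSRs) still contribute to the linear address, so
	 * they are taken from the segment descriptor below.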
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= vie_size2mask(addrsize);
	*gla = (segbase + firstoff) & vie_size2mask(glasize);
	return (0);
}

#ifdef _KERNEL
void
vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
{
	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
	    ("%s: invalid instruction length (%d)", __func__, inst_length));

	bzero(vie, sizeof(struct vie));

	vie->base_register = VM_REG_LAST;
	vie->index_register = VM_REG_LAST;
	vie->segment_register = VM_REG_LAST;

	if (inst_length) {
		bcopy(inst_bytes, vie->inst, inst_length);
		vie->num_valid = inst_length;
	}
}

static int
pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
{
	int error_code = 0;

	if (pte & PG_V)
		error_code |= PGEX_P;
	if (prot & VM_PROT_WRITE)
		error_code |= PGEX_W;
	if (usermode)
		error_code |= PGEX_U;
	if (rsvd)
		error_code |= PGEX_RSV;
	if (prot & VM_PROT_EXECUTE)
		error_code |= PGEX_I;

	return (error_code);
}

static void
ptp_release(void **cookie)
{
	if (*cookie != NULL) {
		vm_gpa_release(*cookie);
		*cookie = NULL;
	}
}

static void *
ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie)
{
	void *ptr;

	ptp_release(cookie);
	ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie);
	return (ptr);
}

int
vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa)
{
	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
	u_int retries;
	uint64_t *ptpbase, ptpphys, pte, pgsize;
	uint32_t *ptpbase32, pte32;
	void *cookie;

	usermode = (paging->cpl == 3 ? 1 : 0);
	writable = prot & VM_PROT_WRITE;
	cookie = NULL;
	retval = 0;
	retries = 0;
restart:
	ptpphys = paging->cr3;		/* root of the page tables */
	ptp_release(&cookie);
	if (retries++ > 0)
		maybe_yield();

	if (vie_canonical_check(paging->cpu_mode, gla)) {
		/*
		 * XXX assuming a non-stack reference otherwise a stack fault
		 * should be generated.
		 */
		vm_inject_gp(vm, vcpuid);
		goto fault;
	}

	if (paging->paging_mode == PAGING_MODE_FLAT) {
		*gpa = gla;
		goto done;
	}

	if (paging->paging_mode == PAGING_MODE_32) {
		nlevels = 2;
		while (--nlevels >= 0) {
			/* Zero out the lower 12 bits. */
			ptpphys &= ~0xfff;

			ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);

			if (ptpbase32 == NULL)
				goto error;

			ptpshift = PAGE_SHIFT + nlevels * 10;
			ptpindex = (gla >> ptpshift) & 0x3FF;
			pgsize = 1UL << ptpshift;

			pte32 = ptpbase32[ptpindex];

			if ((pte32 & PG_V) == 0 ||
			    (usermode && (pte32 & PG_U) == 0) ||
			    (writable && (pte32 & PG_RW) == 0)) {
				pfcode = pf_error_code(usermode, prot, 0,
				    pte32);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
				goto fault;
			}

			/*
			 * Emulate the x86 MMU's management of the accessed
			 * and dirty flags. While the accessed flag is set
			 * at every level of the page table, the dirty flag
			 * is only set at the last level providing the guest
			 * physical address.
			 */
			if ((pte32 & PG_A) == 0) {
				if (atomic_cmpset_32(&ptpbase32[ptpindex],
				    pte32, pte32 | PG_A) == 0) {
					goto restart;
				}
			}

			/* XXX must be ignored if CR4.PSE=0 */
			if (nlevels > 0 && (pte32 & PG_PS) != 0)
				break;

			ptpphys = pte32;
		}

		/* Set the dirty bit in the page table entry if necessary */
		if (writable && (pte32 & PG_M) == 0) {
			if (atomic_cmpset_32(&ptpbase32[ptpindex],
			    pte32, pte32 | PG_M) == 0) {
				goto restart;
			}
		}

		/* Zero out the lower 'ptpshift' bits */
		pte32 >>= ptpshift; pte32 <<= ptpshift;
		*gpa = pte32 | (gla & (pgsize - 1));
		goto done;
	}

	if (paging->paging_mode == PAGING_MODE_PAE) {
		/* Zero out the lower 5 bits and the upper 32 bits */
		ptpphys &= 0xffffffe0UL;

		ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpindex = (gla >> 30) & 0x3;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0) {
			pfcode = pf_error_code(usermode, prot, 0, pte);
			vm_inject_pf(vm, vcpuid, pfcode, gla);
			goto fault;
		}

		ptpphys = pte;

		nlevels = 2;
	} else
		nlevels = 4;
	while (--nlevels >= 0) {
		/* Zero out the lower 12 bits and the upper 12 bits */
		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;

		ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gla >> ptpshift) & 0x1FF;
		pgsize = 1UL << ptpshift;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0 ||
		    (usermode && (pte & PG_U) == 0) ||
		    (writable && (pte & PG_RW) == 0)) {
			pfcode = pf_error_code(usermode, prot, 0, pte);
			vm_inject_pf(vm, vcpuid, pfcode, gla);
			goto fault;
		}

		/* Set the accessed bit in the page table entry */
		if ((pte & PG_A) == 0) {
			if (atomic_cmpset_64(&ptpbase[ptpindex],
			    pte, pte | PG_A) == 0) {
				goto restart;
			}
		}

		if (nlevels > 0 && (pte & PG_PS) != 0) {
			if (pgsize > 1 * GB) {
				pfcode = pf_error_code(usermode, prot, 1, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
				goto fault;
			}
			break;
		}

		ptpphys = pte;
	}

	/* Set the dirty bit in the page table entry if necessary */
	if (writable && (pte & PG_M) == 0) {
		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
			goto restart;
	}

	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
	*gpa = pte | (gla & (pgsize - 1));
done:
	ptp_release(&cookie);
	return (retval);
error:
	retval = -1;
	goto done;
fault:
	retval = 1;
	goto done;
}

int
vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t rip, int inst_length, struct vie *vie)
{
	struct vm_copyinfo copyinfo[2];
	int error, prot;

	if (inst_length > VIE_INST_SIZE)
		panic("vmm_fetch_instruction: invalid length %d", inst_length);

	prot = PROT_READ | PROT_EXEC;
	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
	    copyinfo, nitems(copyinfo));
	if (error == 0) {
		vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
		vie->num_valid = inst_length;
	}
	return (error);
}
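
/*
 * The decoding helpers below consume vie->inst as a byte stream:
 * vie_peek() returns the next unconsumed byte without advancing and
 * vie_advance() moves the cursor (vie->num_processed) past it.
 */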
static int
vie_peek(struct vie *vie, uint8_t *x)
{

	if (vie->num_processed < vie->num_valid) {
		*x = vie->inst[vie->num_processed];
		return (0);
	} else
		return (-1);
}

static void
vie_advance(struct vie *vie)
{

	vie->num_processed++;
}

static bool
segment_override(uint8_t x, int *seg)
{

	switch (x) {
	case 0x2E:
		*seg = VM_REG_GUEST_CS;
		break;
	case 0x36:
		*seg = VM_REG_GUEST_SS;
		break;
	case 0x3E:
		*seg = VM_REG_GUEST_DS;
		break;
	case 0x26:
		*seg = VM_REG_GUEST_ES;
		break;
	case 0x64:
		*seg = VM_REG_GUEST_FS;
		break;
	case 0x65:
		*seg = VM_REG_GUEST_GS;
		break;
	default:
		return (false);
	}
	return (true);
}

static int
decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
{
	uint8_t x;

	while (1) {
		if (vie_peek(vie, &x))
			return (-1);

		if (x == 0x66)
			vie->opsize_override = 1;
		else if (x == 0x67)
			vie->addrsize_override = 1;
		else if (x == 0xF3)
			vie->repz_present = 1;
		else if (x == 0xF2)
			vie->repnz_present = 1;
		else if (segment_override(x, &vie->segment_register))
			vie->segment_override = 1;
		else
			break;

		vie_advance(vie);
	}

	/*
	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
	 * - Only one REX prefix is allowed per instruction.
	 * - The REX prefix must immediately precede the opcode byte or the
	 *   escape opcode byte.
	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
	 *   the mandatory prefix must come before the REX prefix.
	 */
	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
		vie->rex_present = 1;
		vie->rex_w = x & 0x8 ? 1 : 0;
		vie->rex_r = x & 0x4 ? 1 : 0;
		vie->rex_x = x & 0x2 ? 1 : 0;
		vie->rex_b = x & 0x1 ? 1 : 0;
		vie_advance(vie);
	}

	/*
	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
	 */
	if (cpu_mode == CPU_MODE_64BIT) {
		/*
		 * Default address size is 64-bits and default operand size
		 * is 32-bits.
		 */
		vie->addrsize = vie->addrsize_override ? 4 : 8;
		if (vie->rex_w)
			vie->opsize = 8;
		else if (vie->opsize_override)
			vie->opsize = 2;
		else
			vie->opsize = 4;
	} else if (cs_d) {
		/* Default address and operand sizes are 32-bits */
		vie->addrsize = vie->addrsize_override ? 2 : 4;
		vie->opsize = vie->opsize_override ? 2 : 4;
	} else {
		/* Default address and operand sizes are 16-bits */
		vie->addrsize = vie->addrsize_override ? 4 : 2;
		vie->opsize = vie->opsize_override ? 4 : 2;
	}
	return (0);
}

static int
decode_two_byte_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	vie->op = two_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);
	return (0);
}

static int
decode_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	vie->op = one_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);

	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
		return (decode_two_byte_opcode(vie));

	return (0);
}

static int
decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
{
	uint8_t x;

	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
		return (0);

	if (cpu_mode == CPU_MODE_REAL)
		return (-1);

	if (vie_peek(vie, &x))
		return (-1);

	vie->mod = (x >> 6) & 0x3;
	vie->rm = (x >> 0) & 0x7;
	vie->reg = (x >> 3) & 0x7;

	/*
	 * A direct addressing mode makes no sense in the context of an EPT
	 * fault. There has to be a memory access involved to cause the
	 * EPT fault.
	 */
	if (vie->mod == VIE_MOD_DIRECT)
		return (-1);

	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
		/*
		 * Table 2-5: Special Cases of REX Encodings
		 *
		 * mod=0, r/m=5 is used in the compatibility mode to
		 * indicate a disp32 without a base register.
		 *
		 * mod!=3, r/m=4 is used in the compatibility mode to
		 * indicate that the SIB byte is present.
		 *
		 * The 'b' bit in the REX prefix is don't care in
		 * this case.
		 */
	} else {
		vie->rm |= (vie->rex_b << 3);
	}

	vie->reg |= (vie->rex_r << 3);

	/* SIB */
	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
		goto done;

	vie->base_register = gpr_map[vie->rm];

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	case VIE_MOD_INDIRECT:
		if (vie->rm == VIE_RM_DISP32) {
			vie->disp_bytes = 4;
			/*
			 * Table 2-7. RIP-Relative Addressing
			 *
			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
			 * whereas in compatibility mode it just implies disp32.
			 */

			if (cpu_mode == CPU_MODE_64BIT)
				vie->base_register = VM_REG_GUEST_RIP;
			else
				vie->base_register = VM_REG_LAST;
		}
		break;
	}

done:
	vie_advance(vie);

	return (0);
}

static int
decode_sib(struct vie *vie)
{
	uint8_t x;

	/* Proceed only if SIB byte is present */
	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
		return (0);

	if (vie_peek(vie, &x))
		return (-1);

	/* De-construct the SIB byte */
	vie->ss = (x >> 6) & 0x3;
	vie->index = (x >> 3) & 0x7;
	vie->base = (x >> 0) & 0x7;

	/* Apply the REX prefix modifiers */
	vie->index |= vie->rex_x << 3;
	vie->base |= vie->rex_b << 3;

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	}

	if (vie->mod == VIE_MOD_INDIRECT &&
	    (vie->base == 5 || vie->base == 13)) {
		/*
		 * Special case when base register is unused if mod = 0
		 * and base = %rbp or %r13.
		 *
		 * Documented in:
		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
		 * Table 2-5: Special Cases of REX Encodings
		 */
		vie->disp_bytes = 4;
	} else {
		vie->base_register = gpr_map[vie->base];
	}

	/*
	 * All encodings of 'index' are valid except for %rsp (4).
	 *
	 * Documented in:
	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
	 * Table 2-5: Special Cases of REX Encodings
	 */
	if (vie->index != 4)
		vie->index_register = gpr_map[vie->index];

	/* 'scale' makes sense only in the context of an index register */
	if (vie->index_register < VM_REG_LAST)
		vie->scale = 1 << vie->ss;

	vie_advance(vie);

	return (0);
}

static int
decode_displacement(struct vie *vie)
{
	int n, i;
	uint8_t x;

	union {
		char	buf[4];
		int8_t	signed8;
		int32_t	signed32;
	} u;

	if ((n = vie->disp_bytes) == 0)
		return (0);

	if (n != 1 && n != 4)
		panic("decode_displacement: invalid disp_bytes %d", n);

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	if (n == 1)
		vie->displacement = u.signed8;		/* sign-extended */
	else
		vie->displacement = u.signed32;		/* sign-extended */

	return (0);
}

static int
decode_immediate(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[4];
		int8_t	signed8;
		int16_t	signed16;
		int32_t	signed32;
	} u;

	/* Figure out immediate operand size (if any) */
	if (vie->op.op_flags & VIE_OP_F_IMM) {
		/*
		 * Section 2.2.1.5 "Immediates", Intel SDM:
		 * In 64-bit mode the typical size of immediate operands
		 * remains 32-bits. When the operand size is 64-bits, the
		 * processor sign-extends all immediates to 64-bits prior
		 * to their use.
		 */
		if (vie->opsize == 4 || vie->opsize == 8)
			vie->imm_bytes = 4;
		else
			vie->imm_bytes = 2;
	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
		vie->imm_bytes = 1;
	}

	if ((n = vie->imm_bytes) == 0)
		return (0);

	KASSERT(n == 1 || n == 2 || n == 4,
	    ("%s: invalid number of immediate bytes: %d", __func__, n));

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	/* sign-extend the immediate value before use */
	if (n == 1)
		vie->immediate = u.signed8;
	else if (n == 2)
		vie->immediate = u.signed16;
	else
		vie->immediate = u.signed32;

	return (0);
}

static int
decode_moffset(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[8];
		uint64_t u64;
	} u;

	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
		return (0);

	/*
	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
	 * The memory offset size follows the address-size of the instruction.
	 */
	n = vie->addrsize;
	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));

	u.u64 = 0;
	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}
	vie->displacement = u.u64;
	return (0);
}

/*
 * Verify that all the bytes in the instruction buffer were consumed.
 */
static int
verify_inst_length(struct vie *vie)
{

	if (vie->num_processed)
		return (0);
	else
		return (-1);
}

/*
 * Verify that the 'guest linear address' provided as collateral of the nested
 * page table fault matches our instruction decoding.
 */
static int
verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
{
	int error;
	uint64_t base, idx, gla2;

	/* Skip 'gla' verification */
	if (gla == VIE_INVALID_GLA)
		return (0);

	base = 0;
	if (vie->base_register != VM_REG_LAST) {
		error = vm_get_register(vm, cpuid, vie->base_register, &base);
		if (error) {
			printf("verify_gla: error %d getting base reg %d\n",
			    error, vie->base_register);
			return (-1);
		}

		/*
		 * RIP-relative addressing starts from the following
		 * instruction
		 */
		if (vie->base_register == VM_REG_GUEST_RIP)
			base += vie->num_valid;
	}

	idx = 0;
	if (vie->index_register != VM_REG_LAST) {
		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
		if (error) {
			printf("verify_gla: error %d getting index reg %d\n",
			    error, vie->index_register);
			return (-1);
		}
	}

	/* XXX assuming that the base address of the segment is 0 */
	gla2 = base + vie->scale * idx + vie->displacement;
	gla2 &= size2mask[vie->addrsize];
	if (gla != gla2) {
		printf("verify_gla mismatch: "
		    "base(0x%0lx), scale(%d), index(0x%0lx), "
		    "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
		    base, vie->scale, idx, vie->displacement, gla, gla2);
		return (-1);
	}

	return (0);
}

int
vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
    enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
{

	if (decode_prefixes(vie, cpu_mode, cs_d))
		return (-1);

	if (decode_opcode(vie))
		return (-1);

	if (decode_modrm(vie, cpu_mode))
		return (-1);

	if (decode_sib(vie))
		return (-1);

	if (decode_displacement(vie))
		return (-1);

	if (decode_immediate(vie))
		return (-1);

	if (decode_moffset(vie))
		return (-1);

	if (verify_inst_length(vie))
		return (-1);

	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
		if (verify_gla(vm, cpuid, gla, vie))
			return (-1);
	}

	vie->decoded = 1;	/* success */

	return (0);
}
#endif	/* _KERNEL */