1 /*- 2 * Copyright (c) 2012 Sandvine, Inc. 3 * Copyright (c) 2012 NetApp, Inc. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 * 27 * $FreeBSD$ 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #ifdef _KERNEL 34 #include <sys/param.h> 35 #include <sys/pcpu.h> 36 #include <sys/systm.h> 37 #include <sys/proc.h> 38 39 #include <vm/vm.h> 40 #include <vm/pmap.h> 41 42 #include <machine/vmparam.h> 43 #include <machine/vmm.h> 44 #else /* !_KERNEL */ 45 #include <sys/types.h> 46 #include <sys/errno.h> 47 #include <sys/_iovec.h> 48 49 #include <machine/vmm.h> 50 51 #include <assert.h> 52 #include <vmmapi.h> 53 #define KASSERT(exp,msg) assert((exp)) 54 #endif /* _KERNEL */ 55 56 #include <machine/vmm_instruction_emul.h> 57 #include <x86/psl.h> 58 #include <x86/specialreg.h> 59 60 /* struct vie_op.op_type */ 61 enum { 62 VIE_OP_TYPE_NONE = 0, 63 VIE_OP_TYPE_MOV, 64 VIE_OP_TYPE_MOVSX, 65 VIE_OP_TYPE_MOVZX, 66 VIE_OP_TYPE_AND, 67 VIE_OP_TYPE_OR, 68 VIE_OP_TYPE_SUB, 69 VIE_OP_TYPE_TWO_BYTE, 70 VIE_OP_TYPE_PUSH, 71 VIE_OP_TYPE_CMP, 72 VIE_OP_TYPE_POP, 73 VIE_OP_TYPE_MOVS, 74 VIE_OP_TYPE_LAST 75 }; 76 77 /* struct vie_op.op_flags */ 78 #define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */ 79 #define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ 80 #define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */ 81 #define VIE_OP_F_NO_MODRM (1 << 3) 82 #define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) 83 84 static const struct vie_op two_byte_opcodes[256] = { 85 [0xB6] = { 86 .op_byte = 0xB6, 87 .op_type = VIE_OP_TYPE_MOVZX, 88 }, 89 [0xB7] = { 90 .op_byte = 0xB7, 91 .op_type = VIE_OP_TYPE_MOVZX, 92 }, 93 [0xBE] = { 94 .op_byte = 0xBE, 95 .op_type = VIE_OP_TYPE_MOVSX, 96 }, 97 }; 98 99 static const struct vie_op one_byte_opcodes[256] = { 100 [0x0F] = { 101 .op_byte = 0x0F, 102 .op_type = VIE_OP_TYPE_TWO_BYTE 103 }, 104 [0x2B] = { 105 .op_byte = 0x2B, 106 .op_type = VIE_OP_TYPE_SUB, 107 }, 108 [0x3B] = { 109 .op_byte = 0x3B, 110 .op_type = VIE_OP_TYPE_CMP, 111 }, 112 [0x88] = { 113 .op_byte = 0x88, 114 .op_type = VIE_OP_TYPE_MOV, 115 }, 116 [0x89] = { 117 .op_byte = 0x89, 118 .op_type = VIE_OP_TYPE_MOV, 119 }, 120 [0x8A] = { 121 .op_byte = 0x8A, 122 .op_type = VIE_OP_TYPE_MOV, 123 }, 124 [0x8B] = { 125 .op_byte = 0x8B, 126 .op_type = VIE_OP_TYPE_MOV, 127 }, 128 [0xA1] = { 129 .op_byte = 0xA1, 130 .op_type = VIE_OP_TYPE_MOV, 131 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, 132 }, 133 [0xA3] = { 134 .op_byte = 0xA3, 135 .op_type = VIE_OP_TYPE_MOV, 136 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, 137 }, 138 [0xA4] = { 139 .op_byte = 0xA4, 140 .op_type = VIE_OP_TYPE_MOVS, 141 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 142 }, 143 [0xA5] = { 144 .op_byte = 0xA5, 145 .op_type = VIE_OP_TYPE_MOVS, 146 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 147 }, 148 [0xC6] = { 149 /* XXX Group 11 extended opcode - not just MOV */ 150 .op_byte = 0xC6, 151 .op_type = VIE_OP_TYPE_MOV, 152 .op_flags = VIE_OP_F_IMM8, 153 }, 154 [0xC7] = { 155 .op_byte = 0xC7, 156 .op_type = VIE_OP_TYPE_MOV, 157 .op_flags = VIE_OP_F_IMM, 158 }, 159 [0x23] = { 160 .op_byte = 0x23, 161 .op_type = VIE_OP_TYPE_AND, 162 }, 163 [0x81] = { 164 /* XXX Group 1 extended opcode - not just AND */ 165 .op_byte = 0x81, 166 .op_type = VIE_OP_TYPE_AND, 167 .op_flags = VIE_OP_F_IMM, 168 }, 169 [0x83] = { 170 /* XXX Group 1 extended opcode - not just OR */ 171 .op_byte = 0x83, 172 .op_type = VIE_OP_TYPE_OR, 173 .op_flags = VIE_OP_F_IMM8, 174 }, 175 [0x8F] = { 176 /* XXX Group 1A extended opcode - not just POP */ 177 .op_byte = 0x8F, 178 .op_type = VIE_OP_TYPE_POP, 179 }, 180 [0xFF] = { 181 /* XXX Group 5 extended opcode - not just PUSH */ 182 .op_byte = 0xFF, 183 .op_type = VIE_OP_TYPE_PUSH, 184 } 185 }; 186 187 /* struct vie.mod */ 188 #define VIE_MOD_INDIRECT 0 189 #define VIE_MOD_INDIRECT_DISP8 1 190 #define VIE_MOD_INDIRECT_DISP32 2 191 #define VIE_MOD_DIRECT 3 192 193 /* struct vie.rm */ 194 #define VIE_RM_SIB 4 195 #define VIE_RM_DISP32 5 196 197 #define GB (1024 * 1024 * 1024) 198 199 static enum vm_reg_name gpr_map[16] = { 200 VM_REG_GUEST_RAX, 201 VM_REG_GUEST_RCX, 202 VM_REG_GUEST_RDX, 203 VM_REG_GUEST_RBX, 204 VM_REG_GUEST_RSP, 205 VM_REG_GUEST_RBP, 206 VM_REG_GUEST_RSI, 207 VM_REG_GUEST_RDI, 208 VM_REG_GUEST_R8, 209 VM_REG_GUEST_R9, 210 VM_REG_GUEST_R10, 211 VM_REG_GUEST_R11, 212 VM_REG_GUEST_R12, 213 VM_REG_GUEST_R13, 214 VM_REG_GUEST_R14, 215 VM_REG_GUEST_R15 216 }; 217 218 static uint64_t size2mask[] = { 219 [1] = 0xff, 220 [2] = 0xffff, 221 [4] = 0xffffffff, 222 [8] = 0xffffffffffffffff, 223 }; 224 225 static int 226 vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval) 227 { 228 int error; 229 230 error = vm_get_register(vm, vcpuid, reg, rval); 231 232 return (error); 233 } 234 235 static void 236 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) 237 { 238 *lhbr = 0; 239 *reg = gpr_map[vie->reg]; 240 241 /* 242 * 64-bit mode imposes limitations on accessing legacy high byte 243 * registers (lhbr). 244 * 245 * The legacy high-byte registers cannot be addressed if the REX 246 * prefix is present. In this case the values 4, 5, 6 and 7 of the 247 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. 248 * 249 * If the REX prefix is not present then the values 4, 5, 6 and 7 250 * of the 'ModRM:reg' field address the legacy high-byte registers, 251 * %ah, %ch, %dh and %bh respectively. 252 */ 253 if (!vie->rex_present) { 254 if (vie->reg & 0x4) { 255 *lhbr = 1; 256 *reg = gpr_map[vie->reg & 0x3]; 257 } 258 } 259 } 260 261 static int 262 vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) 263 { 264 uint64_t val; 265 int error, lhbr; 266 enum vm_reg_name reg; 267 268 vie_calc_bytereg(vie, ®, &lhbr); 269 error = vm_get_register(vm, vcpuid, reg, &val); 270 271 /* 272 * To obtain the value of a legacy high byte register shift the 273 * base register right by 8 bits (%ah = %rax >> 8). 274 */ 275 if (lhbr) 276 *rval = val >> 8; 277 else 278 *rval = val; 279 return (error); 280 } 281 282 static int 283 vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte) 284 { 285 uint64_t origval, val, mask; 286 int error, lhbr; 287 enum vm_reg_name reg; 288 289 vie_calc_bytereg(vie, ®, &lhbr); 290 error = vm_get_register(vm, vcpuid, reg, &origval); 291 if (error == 0) { 292 val = byte; 293 mask = 0xff; 294 if (lhbr) { 295 /* 296 * Shift left by 8 to store 'byte' in a legacy high 297 * byte register. 298 */ 299 val <<= 8; 300 mask <<= 8; 301 } 302 val |= origval & ~mask; 303 error = vm_set_register(vm, vcpuid, reg, val); 304 } 305 return (error); 306 } 307 308 int 309 vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, 310 uint64_t val, int size) 311 { 312 int error; 313 uint64_t origval; 314 315 switch (size) { 316 case 1: 317 case 2: 318 error = vie_read_register(vm, vcpuid, reg, &origval); 319 if (error) 320 return (error); 321 val &= size2mask[size]; 322 val |= origval & ~size2mask[size]; 323 break; 324 case 4: 325 val &= 0xffffffffUL; 326 break; 327 case 8: 328 break; 329 default: 330 return (EINVAL); 331 } 332 333 error = vm_set_register(vm, vcpuid, reg, val); 334 return (error); 335 } 336 337 #define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V) 338 339 /* 340 * Return the status flags that would result from doing (x - y). 341 */ 342 #define GETCC(sz) \ 343 static u_long \ 344 getcc##sz(uint##sz##_t x, uint##sz##_t y) \ 345 { \ 346 u_long rflags; \ 347 \ 348 __asm __volatile("sub %2,%1; pushfq; popq %0" : \ 349 "=r" (rflags), "+r" (x) : "m" (y)); \ 350 return (rflags); \ 351 } struct __hack 352 353 GETCC(8); 354 GETCC(16); 355 GETCC(32); 356 GETCC(64); 357 358 static u_long 359 getcc(int opsize, uint64_t x, uint64_t y) 360 { 361 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, 362 ("getcc: invalid operand size %d", opsize)); 363 364 if (opsize == 1) 365 return (getcc8(x, y)); 366 else if (opsize == 2) 367 return (getcc16(x, y)); 368 else if (opsize == 4) 369 return (getcc32(x, y)); 370 else 371 return (getcc64(x, y)); 372 } 373 374 static int 375 emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 376 mem_region_read_t memread, mem_region_write_t memwrite, void *arg) 377 { 378 int error, size; 379 enum vm_reg_name reg; 380 uint8_t byte; 381 uint64_t val; 382 383 size = vie->opsize; 384 error = EINVAL; 385 386 switch (vie->op.op_byte) { 387 case 0x88: 388 /* 389 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) 390 * 88/r: mov r/m8, r8 391 * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) 392 */ 393 size = 1; /* override for byte operation */ 394 error = vie_read_bytereg(vm, vcpuid, vie, &byte); 395 if (error == 0) 396 error = memwrite(vm, vcpuid, gpa, byte, size, arg); 397 break; 398 case 0x89: 399 /* 400 * MOV from reg (ModRM:reg) to mem (ModRM:r/m) 401 * 89/r: mov r/m16, r16 402 * 89/r: mov r/m32, r32 403 * REX.W + 89/r mov r/m64, r64 404 */ 405 reg = gpr_map[vie->reg]; 406 error = vie_read_register(vm, vcpuid, reg, &val); 407 if (error == 0) { 408 val &= size2mask[size]; 409 error = memwrite(vm, vcpuid, gpa, val, size, arg); 410 } 411 break; 412 case 0x8A: 413 /* 414 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) 415 * 8A/r: mov r8, r/m8 416 * REX + 8A/r: mov r8, r/m8 417 */ 418 size = 1; /* override for byte operation */ 419 error = memread(vm, vcpuid, gpa, &val, size, arg); 420 if (error == 0) 421 error = vie_write_bytereg(vm, vcpuid, vie, val); 422 break; 423 case 0x8B: 424 /* 425 * MOV from mem (ModRM:r/m) to reg (ModRM:reg) 426 * 8B/r: mov r16, r/m16 427 * 8B/r: mov r32, r/m32 428 * REX.W 8B/r: mov r64, r/m64 429 */ 430 error = memread(vm, vcpuid, gpa, &val, size, arg); 431 if (error == 0) { 432 reg = gpr_map[vie->reg]; 433 error = vie_update_register(vm, vcpuid, reg, val, size); 434 } 435 break; 436 case 0xA1: 437 /* 438 * MOV from seg:moffset to AX/EAX/RAX 439 * A1: mov AX, moffs16 440 * A1: mov EAX, moffs32 441 * REX.W + A1: mov RAX, moffs64 442 */ 443 error = memread(vm, vcpuid, gpa, &val, size, arg); 444 if (error == 0) { 445 reg = VM_REG_GUEST_RAX; 446 error = vie_update_register(vm, vcpuid, reg, val, size); 447 } 448 break; 449 case 0xA3: 450 /* 451 * MOV from AX/EAX/RAX to seg:moffset 452 * A3: mov moffs16, AX 453 * A3: mov moffs32, EAX 454 * REX.W + A3: mov moffs64, RAX 455 */ 456 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); 457 if (error == 0) { 458 val &= size2mask[size]; 459 error = memwrite(vm, vcpuid, gpa, val, size, arg); 460 } 461 break; 462 case 0xC6: 463 /* 464 * MOV from imm8 to mem (ModRM:r/m) 465 * C6/0 mov r/m8, imm8 466 * REX + C6/0 mov r/m8, imm8 467 */ 468 size = 1; /* override for byte operation */ 469 error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg); 470 break; 471 case 0xC7: 472 /* 473 * MOV from imm16/imm32 to mem (ModRM:r/m) 474 * C7/0 mov r/m16, imm16 475 * C7/0 mov r/m32, imm32 476 * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) 477 */ 478 val = vie->immediate & size2mask[size]; 479 error = memwrite(vm, vcpuid, gpa, val, size, arg); 480 break; 481 default: 482 break; 483 } 484 485 return (error); 486 } 487 488 static int 489 emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 490 mem_region_read_t memread, mem_region_write_t memwrite, 491 void *arg) 492 { 493 int error, size; 494 enum vm_reg_name reg; 495 uint64_t val; 496 497 size = vie->opsize; 498 error = EINVAL; 499 500 switch (vie->op.op_byte) { 501 case 0xB6: 502 /* 503 * MOV and zero extend byte from mem (ModRM:r/m) to 504 * reg (ModRM:reg). 505 * 506 * 0F B6/r movzx r16, r/m8 507 * 0F B6/r movzx r32, r/m8 508 * REX.W + 0F B6/r movzx r64, r/m8 509 */ 510 511 /* get the first operand */ 512 error = memread(vm, vcpuid, gpa, &val, 1, arg); 513 if (error) 514 break; 515 516 /* get the second operand */ 517 reg = gpr_map[vie->reg]; 518 519 /* zero-extend byte */ 520 val = (uint8_t)val; 521 522 /* write the result */ 523 error = vie_update_register(vm, vcpuid, reg, val, size); 524 break; 525 case 0xB7: 526 /* 527 * MOV and zero extend word from mem (ModRM:r/m) to 528 * reg (ModRM:reg). 529 * 530 * 0F B7/r movzx r32, r/m16 531 * REX.W + 0F B7/r movzx r64, r/m16 532 */ 533 error = memread(vm, vcpuid, gpa, &val, 2, arg); 534 if (error) 535 return (error); 536 537 reg = gpr_map[vie->reg]; 538 539 /* zero-extend word */ 540 val = (uint16_t)val; 541 542 error = vie_update_register(vm, vcpuid, reg, val, size); 543 break; 544 case 0xBE: 545 /* 546 * MOV and sign extend byte from mem (ModRM:r/m) to 547 * reg (ModRM:reg). 548 * 549 * 0F BE/r movsx r16, r/m8 550 * 0F BE/r movsx r32, r/m8 551 * REX.W + 0F BE/r movsx r64, r/m8 552 */ 553 554 /* get the first operand */ 555 error = memread(vm, vcpuid, gpa, &val, 1, arg); 556 if (error) 557 break; 558 559 /* get the second operand */ 560 reg = gpr_map[vie->reg]; 561 562 /* sign extend byte */ 563 val = (int8_t)val; 564 565 /* write the result */ 566 error = vie_update_register(vm, vcpuid, reg, val, size); 567 break; 568 default: 569 break; 570 } 571 return (error); 572 } 573 574 /* 575 * Helper function to calculate and validate a linear address. 576 * 577 * Returns 0 on success and 1 if an exception was injected into the guest. 578 */ 579 static int 580 get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, 581 int opsize, int addrsize, int prot, enum vm_reg_name seg, 582 enum vm_reg_name gpr, uint64_t *gla) 583 { 584 struct seg_desc desc; 585 uint64_t cr0, val, rflags; 586 int error; 587 588 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); 589 KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); 590 591 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 592 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 593 594 error = vm_get_seg_desc(vm, vcpuid, seg, &desc); 595 KASSERT(error == 0, ("%s: error %d getting segment descriptor %d", 596 __func__, error, seg)); 597 598 error = vie_read_register(vm, vcpuid, gpr, &val); 599 KASSERT(error == 0, ("%s: error %d getting register %d", __func__, 600 error, gpr)); 601 602 if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize, 603 addrsize, prot, gla)) { 604 if (seg == VM_REG_GUEST_SS) 605 vm_inject_ss(vm, vcpuid, 0); 606 else 607 vm_inject_gp(vm, vcpuid); 608 return (1); 609 } 610 611 if (vie_canonical_check(paging->cpu_mode, *gla)) { 612 if (seg == VM_REG_GUEST_SS) 613 vm_inject_ss(vm, vcpuid, 0); 614 else 615 vm_inject_gp(vm, vcpuid); 616 return (1); 617 } 618 619 if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { 620 vm_inject_ac(vm, vcpuid, 0); 621 return (1); 622 } 623 624 return (0); 625 } 626 627 static int 628 emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 629 struct vm_guest_paging *paging, mem_region_read_t memread, 630 mem_region_write_t memwrite, void *arg) 631 { 632 #ifdef _KERNEL 633 struct vm_copyinfo copyinfo[2]; 634 #else 635 struct iovec copyinfo[2]; 636 #endif 637 uint64_t dstaddr, srcaddr, val; 638 uint64_t rcx, rdi, rsi, rflags; 639 int error, opsize, seg, repeat; 640 641 opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize; 642 val = 0; 643 error = 0; 644 645 /* 646 * XXX although the MOVS instruction is only supposed to be used with 647 * the "rep" prefix some guests like FreeBSD will use "repnz" instead. 648 * 649 * Empirically the "repnz" prefix has identical behavior to "rep" 650 * and the zero flag does not make a difference. 651 */ 652 repeat = vie->repz_present | vie->repnz_present; 653 654 if (repeat) { 655 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); 656 KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); 657 658 /* 659 * The count register is %rcx, %ecx or %cx depending on the 660 * address size of the instruction. 661 */ 662 if ((rcx & vie_size2mask(vie->addrsize)) == 0) 663 return (0); 664 } 665 666 /* 667 * Source Destination Comments 668 * -------------------------------------------- 669 * (1) memory memory n/a 670 * (2) memory mmio emulated 671 * (3) mmio memory emulated 672 * (4) mmio mmio not emulated 673 * 674 * At this point we don't have sufficient information to distinguish 675 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this 676 * out because it will succeed only when operating on regular memory. 677 * 678 * XXX the emulation doesn't properly handle the case where 'gpa' 679 * is straddling the boundary between the normal memory and MMIO. 680 */ 681 682 seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS; 683 error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, 684 PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr); 685 if (error) 686 goto done; 687 688 error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ, 689 copyinfo, nitems(copyinfo)); 690 if (error == 0) { 691 /* 692 * case (2): read from system memory and write to mmio. 693 */ 694 vm_copyin(vm, vcpuid, copyinfo, &val, opsize); 695 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 696 error = memwrite(vm, vcpuid, gpa, val, opsize, arg); 697 goto done; 698 } else if (error > 0) { 699 /* 700 * Resume guest execution to handle fault. 701 */ 702 goto done; 703 } else { 704 /* 705 * 'vm_copy_setup()' is expected to fail for cases (3) and (4) 706 * if 'srcaddr' is in the mmio space. 707 */ 708 } 709 710 error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, 711 PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr); 712 if (error) 713 goto done; 714 715 error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, 716 PROT_WRITE, copyinfo, nitems(copyinfo)); 717 if (error == 0) { 718 /* 719 * case (3): read from MMIO and write to system memory. 720 * 721 * A MMIO read can have side-effects so we commit to it 722 * only after vm_copy_setup() is successful. If a page-fault 723 * needs to be injected into the guest then it will happen 724 * before the MMIO read is attempted. 725 */ 726 error = memread(vm, vcpuid, gpa, &val, opsize, arg); 727 if (error) 728 goto done; 729 730 vm_copyout(vm, vcpuid, &val, copyinfo, opsize); 731 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 732 } else if (error > 0) { 733 /* 734 * Resume guest execution to handle fault. 735 */ 736 goto done; 737 } else { 738 goto done; 739 } 740 741 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi); 742 KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); 743 744 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); 745 KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); 746 747 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 748 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 749 750 if (rflags & PSL_D) { 751 rsi -= opsize; 752 rdi -= opsize; 753 } else { 754 rsi += opsize; 755 rdi += opsize; 756 } 757 758 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi, 759 vie->addrsize); 760 KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error)); 761 762 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, 763 vie->addrsize); 764 KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); 765 766 if (repeat) { 767 rcx = rcx - 1; 768 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, 769 rcx, vie->addrsize); 770 KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); 771 772 /* 773 * Repeat the instruction if the count register is not zero. 774 */ 775 if ((rcx & vie_size2mask(vie->addrsize)) != 0) 776 vm_restart_instruction(vm, vcpuid); 777 } 778 done: 779 if (error < 0) 780 return (EFAULT); 781 else 782 return (0); 783 } 784 785 static int 786 emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 787 mem_region_read_t memread, mem_region_write_t memwrite, void *arg) 788 { 789 int error, size; 790 enum vm_reg_name reg; 791 uint64_t result, rflags, rflags2, val1, val2; 792 793 size = vie->opsize; 794 error = EINVAL; 795 796 switch (vie->op.op_byte) { 797 case 0x23: 798 /* 799 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the 800 * result in reg. 801 * 802 * 23/r and r16, r/m16 803 * 23/r and r32, r/m32 804 * REX.W + 23/r and r64, r/m64 805 */ 806 807 /* get the first operand */ 808 reg = gpr_map[vie->reg]; 809 error = vie_read_register(vm, vcpuid, reg, &val1); 810 if (error) 811 break; 812 813 /* get the second operand */ 814 error = memread(vm, vcpuid, gpa, &val2, size, arg); 815 if (error) 816 break; 817 818 /* perform the operation and write the result */ 819 result = val1 & val2; 820 error = vie_update_register(vm, vcpuid, reg, result, size); 821 break; 822 case 0x81: 823 /* 824 * AND/OR mem (ModRM:r/m) with immediate and store the 825 * result in mem. 826 * 827 * AND: i = 4 828 * OR: i = 1 829 * 81 /i op r/m16, imm16 830 * 81 /i op r/m32, imm32 831 * REX.W + 81 /i op r/m64, imm32 sign-extended to 64 832 * 833 */ 834 835 /* get the first operand */ 836 error = memread(vm, vcpuid, gpa, &val1, size, arg); 837 if (error) 838 break; 839 840 /* 841 * perform the operation with the pre-fetched immediate 842 * operand and write the result 843 */ 844 switch (vie->reg & 7) { 845 case 0x4: 846 /* modrm:reg == b100, AND */ 847 result = val1 & vie->immediate; 848 break; 849 case 0x1: 850 /* modrm:reg == b001, OR */ 851 result = val1 | vie->immediate; 852 break; 853 default: 854 error = EINVAL; 855 break; 856 } 857 if (error) 858 break; 859 860 error = memwrite(vm, vcpuid, gpa, result, size, arg); 861 break; 862 default: 863 break; 864 } 865 if (error) 866 return (error); 867 868 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 869 if (error) 870 return (error); 871 872 /* 873 * OF and CF are cleared; the SF, ZF and PF flags are set according 874 * to the result; AF is undefined. 875 * 876 * The updated status flags are obtained by subtracting 0 from 'result'. 877 */ 878 rflags2 = getcc(size, result, 0); 879 rflags &= ~RFLAGS_STATUS_BITS; 880 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); 881 882 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); 883 return (error); 884 } 885 886 static int 887 emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 888 mem_region_read_t memread, mem_region_write_t memwrite, void *arg) 889 { 890 int error, size; 891 uint64_t val1, result, rflags, rflags2; 892 893 size = vie->opsize; 894 error = EINVAL; 895 896 switch (vie->op.op_byte) { 897 case 0x83: 898 /* 899 * OR mem (ModRM:r/m) with immediate and store the 900 * result in mem. 901 * 902 * 83 /1 OR r/m16, imm8 sign-extended to 16 903 * 83 /1 OR r/m32, imm8 sign-extended to 32 904 * REX.W + 83/1 OR r/m64, imm8 sign-extended to 64 905 * 906 * Currently, only the OR operation of the 0x83 opcode 907 * is implemented (ModRM:reg = b001). 908 */ 909 if ((vie->reg & 7) != 1) 910 break; 911 912 /* get the first operand */ 913 error = memread(vm, vcpuid, gpa, &val1, size, arg); 914 if (error) 915 break; 916 917 /* 918 * perform the operation with the pre-fetched immediate 919 * operand and write the result 920 */ 921 result = val1 | vie->immediate; 922 error = memwrite(vm, vcpuid, gpa, result, size, arg); 923 break; 924 default: 925 break; 926 } 927 if (error) 928 return (error); 929 930 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 931 if (error) 932 return (error); 933 934 /* 935 * OF and CF are cleared; the SF, ZF and PF flags are set according 936 * to the result; AF is undefined. 937 * 938 * The updated status flags are obtained by subtracting 0 from 'result'. 939 */ 940 rflags2 = getcc(size, result, 0); 941 rflags &= ~RFLAGS_STATUS_BITS; 942 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); 943 944 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); 945 return (error); 946 } 947 948 static int 949 emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 950 mem_region_read_t memread, mem_region_write_t memwrite, void *arg) 951 { 952 int error, size; 953 uint64_t op1, op2, rflags, rflags2; 954 enum vm_reg_name reg; 955 956 size = vie->opsize; 957 switch (vie->op.op_byte) { 958 case 0x3B: 959 /* 960 * 3B/r CMP r16, r/m16 961 * 3B/r CMP r32, r/m32 962 * REX.W + 3B/r CMP r64, r/m64 963 * 964 * Compare first operand (reg) with second operand (r/m) and 965 * set status flags in EFLAGS register. The comparison is 966 * performed by subtracting the second operand from the first 967 * operand and then setting the status flags. 968 */ 969 970 /* Get the first operand */ 971 reg = gpr_map[vie->reg]; 972 error = vie_read_register(vm, vcpuid, reg, &op1); 973 if (error) 974 return (error); 975 976 /* Get the second operand */ 977 error = memread(vm, vcpuid, gpa, &op2, size, arg); 978 if (error) 979 return (error); 980 981 break; 982 default: 983 return (EINVAL); 984 } 985 rflags2 = getcc(size, op1, op2); 986 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 987 if (error) 988 return (error); 989 rflags &= ~RFLAGS_STATUS_BITS; 990 rflags |= rflags2 & RFLAGS_STATUS_BITS; 991 992 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); 993 return (error); 994 } 995 996 static int 997 emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 998 mem_region_read_t memread, mem_region_write_t memwrite, void *arg) 999 { 1000 int error, size; 1001 uint64_t nval, rflags, rflags2, val1, val2; 1002 enum vm_reg_name reg; 1003 1004 size = vie->opsize; 1005 error = EINVAL; 1006 1007 switch (vie->op.op_byte) { 1008 case 0x2B: 1009 /* 1010 * SUB r/m from r and store the result in r 1011 * 1012 * 2B/r SUB r16, r/m16 1013 * 2B/r SUB r32, r/m32 1014 * REX.W + 2B/r SUB r64, r/m64 1015 */ 1016 1017 /* get the first operand */ 1018 reg = gpr_map[vie->reg]; 1019 error = vie_read_register(vm, vcpuid, reg, &val1); 1020 if (error) 1021 break; 1022 1023 /* get the second operand */ 1024 error = memread(vm, vcpuid, gpa, &val2, size, arg); 1025 if (error) 1026 break; 1027 1028 /* perform the operation and write the result */ 1029 nval = val1 - val2; 1030 error = vie_update_register(vm, vcpuid, reg, nval, size); 1031 break; 1032 default: 1033 break; 1034 } 1035 1036 if (!error) { 1037 rflags2 = getcc(size, val1, val2); 1038 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 1039 &rflags); 1040 if (error) 1041 return (error); 1042 1043 rflags &= ~RFLAGS_STATUS_BITS; 1044 rflags |= rflags2 & RFLAGS_STATUS_BITS; 1045 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 1046 rflags, 8); 1047 } 1048 1049 return (error); 1050 } 1051 1052 static int 1053 emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, 1054 struct vm_guest_paging *paging, mem_region_read_t memread, 1055 mem_region_write_t memwrite, void *arg) 1056 { 1057 #ifdef _KERNEL 1058 struct vm_copyinfo copyinfo[2]; 1059 #else 1060 struct iovec copyinfo[2]; 1061 #endif 1062 struct seg_desc ss_desc; 1063 uint64_t cr0, rflags, rsp, stack_gla, val; 1064 int error, size, stackaddrsize, pushop; 1065 1066 val = 0; 1067 size = vie->opsize; 1068 pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0; 1069 1070 /* 1071 * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 1072 */ 1073 if (paging->cpu_mode == CPU_MODE_REAL) { 1074 stackaddrsize = 2; 1075 } else if (paging->cpu_mode == CPU_MODE_64BIT) { 1076 /* 1077 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 1078 * - Stack pointer size is always 64-bits. 1079 * - PUSH/POP of 32-bit values is not possible in 64-bit mode. 1080 * - 16-bit PUSH/POP is supported by using the operand size 1081 * override prefix (66H). 1082 */ 1083 stackaddrsize = 8; 1084 size = vie->opsize_override ? 2 : 8; 1085 } else { 1086 /* 1087 * In protected or compability mode the 'B' flag in the 1088 * stack-segment descriptor determines the size of the 1089 * stack pointer. 1090 */ 1091 error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc); 1092 KASSERT(error == 0, ("%s: error %d getting SS descriptor", 1093 __func__, error)); 1094 if (SEG_DESC_DEF32(ss_desc.access)) 1095 stackaddrsize = 4; 1096 else 1097 stackaddrsize = 2; 1098 } 1099 1100 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); 1101 KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); 1102 1103 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); 1104 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 1105 1106 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); 1107 KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); 1108 if (pushop) { 1109 rsp -= size; 1110 } 1111 1112 if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, 1113 rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ, 1114 &stack_gla)) { 1115 vm_inject_ss(vm, vcpuid, 0); 1116 return (0); 1117 } 1118 1119 if (vie_canonical_check(paging->cpu_mode, stack_gla)) { 1120 vm_inject_ss(vm, vcpuid, 0); 1121 return (0); 1122 } 1123 1124 if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { 1125 vm_inject_ac(vm, vcpuid, 0); 1126 return (0); 1127 } 1128 1129 error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, 1130 pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo)); 1131 if (error == -1) { 1132 /* 1133 * XXX cannot return a negative error value here because it 1134 * ends up being the return value of the VM_RUN() ioctl and 1135 * is interpreted as a pseudo-error (for e.g. ERESTART). 1136 */ 1137 return (EFAULT); 1138 } else if (error == 1) { 1139 /* Resume guest execution to handle page fault */ 1140 return (0); 1141 } 1142 1143 if (pushop) { 1144 error = memread(vm, vcpuid, mmio_gpa, &val, size, arg); 1145 if (error == 0) 1146 vm_copyout(vm, vcpuid, &val, copyinfo, size); 1147 } else { 1148 vm_copyin(vm, vcpuid, copyinfo, &val, size); 1149 error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg); 1150 rsp += size; 1151 } 1152 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 1153 1154 if (error == 0) { 1155 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp, 1156 stackaddrsize); 1157 KASSERT(error == 0, ("error %d updating rsp", error)); 1158 } 1159 return (error); 1160 } 1161 1162 static int 1163 emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, 1164 struct vm_guest_paging *paging, mem_region_read_t memread, 1165 mem_region_write_t memwrite, void *arg) 1166 { 1167 int error; 1168 1169 /* 1170 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. 1171 * 1172 * PUSH is part of the group 5 extended opcodes and is identified 1173 * by ModRM:reg = b110. 1174 */ 1175 if ((vie->reg & 7) != 6) 1176 return (EINVAL); 1177 1178 error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, 1179 memwrite, arg); 1180 return (error); 1181 } 1182 1183 static int 1184 emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, 1185 struct vm_guest_paging *paging, mem_region_read_t memread, 1186 mem_region_write_t memwrite, void *arg) 1187 { 1188 int error; 1189 1190 /* 1191 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. 1192 * 1193 * POP is part of the group 1A extended opcodes and is identified 1194 * by ModRM:reg = b000. 1195 */ 1196 if ((vie->reg & 7) != 0) 1197 return (EINVAL); 1198 1199 error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, 1200 memwrite, arg); 1201 return (error); 1202 } 1203 1204 int 1205 vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 1206 struct vm_guest_paging *paging, mem_region_read_t memread, 1207 mem_region_write_t memwrite, void *memarg) 1208 { 1209 int error; 1210 1211 if (!vie->decoded) 1212 return (EINVAL); 1213 1214 switch (vie->op.op_type) { 1215 case VIE_OP_TYPE_POP: 1216 error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread, 1217 memwrite, memarg); 1218 break; 1219 case VIE_OP_TYPE_PUSH: 1220 error = emulate_push(vm, vcpuid, gpa, vie, paging, memread, 1221 memwrite, memarg); 1222 break; 1223 case VIE_OP_TYPE_CMP: 1224 error = emulate_cmp(vm, vcpuid, gpa, vie, 1225 memread, memwrite, memarg); 1226 break; 1227 case VIE_OP_TYPE_MOV: 1228 error = emulate_mov(vm, vcpuid, gpa, vie, 1229 memread, memwrite, memarg); 1230 break; 1231 case VIE_OP_TYPE_MOVSX: 1232 case VIE_OP_TYPE_MOVZX: 1233 error = emulate_movx(vm, vcpuid, gpa, vie, 1234 memread, memwrite, memarg); 1235 break; 1236 case VIE_OP_TYPE_MOVS: 1237 error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread, 1238 memwrite, memarg); 1239 break; 1240 case VIE_OP_TYPE_AND: 1241 error = emulate_and(vm, vcpuid, gpa, vie, 1242 memread, memwrite, memarg); 1243 break; 1244 case VIE_OP_TYPE_OR: 1245 error = emulate_or(vm, vcpuid, gpa, vie, 1246 memread, memwrite, memarg); 1247 break; 1248 case VIE_OP_TYPE_SUB: 1249 error = emulate_sub(vm, vcpuid, gpa, vie, 1250 memread, memwrite, memarg); 1251 break; 1252 default: 1253 error = EINVAL; 1254 break; 1255 } 1256 1257 return (error); 1258 } 1259 1260 int 1261 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) 1262 { 1263 KASSERT(size == 1 || size == 2 || size == 4 || size == 8, 1264 ("%s: invalid size %d", __func__, size)); 1265 KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); 1266 1267 if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) 1268 return (0); 1269 1270 return ((gla & (size - 1)) ? 1 : 0); 1271 } 1272 1273 int 1274 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) 1275 { 1276 uint64_t mask; 1277 1278 if (cpu_mode != CPU_MODE_64BIT) 1279 return (0); 1280 1281 /* 1282 * The value of the bit 47 in the 'gla' should be replicated in the 1283 * most significant 16 bits. 1284 */ 1285 mask = ~((1UL << 48) - 1); 1286 if (gla & (1UL << 47)) 1287 return ((gla & mask) != mask); 1288 else 1289 return ((gla & mask) != 0); 1290 } 1291 1292 uint64_t 1293 vie_size2mask(int size) 1294 { 1295 KASSERT(size == 1 || size == 2 || size == 4 || size == 8, 1296 ("vie_size2mask: invalid size %d", size)); 1297 return (size2mask[size]); 1298 } 1299 1300 int 1301 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, 1302 struct seg_desc *desc, uint64_t offset, int length, int addrsize, 1303 int prot, uint64_t *gla) 1304 { 1305 uint64_t firstoff, low_limit, high_limit, segbase; 1306 int glasize, type; 1307 1308 KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, 1309 ("%s: invalid segment %d", __func__, seg)); 1310 KASSERT(length == 1 || length == 2 || length == 4 || length == 8, 1311 ("%s: invalid operand size %d", __func__, length)); 1312 KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, 1313 ("%s: invalid prot %#x", __func__, prot)); 1314 1315 firstoff = offset; 1316 if (cpu_mode == CPU_MODE_64BIT) { 1317 KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " 1318 "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); 1319 glasize = 8; 1320 } else { 1321 KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " 1322 "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); 1323 glasize = 4; 1324 /* 1325 * If the segment selector is loaded with a NULL selector 1326 * then the descriptor is unusable and attempting to use 1327 * it results in a #GP(0). 1328 */ 1329 if (SEG_DESC_UNUSABLE(desc->access)) 1330 return (-1); 1331 1332 /* 1333 * The processor generates a #NP exception when a segment 1334 * register is loaded with a selector that points to a 1335 * descriptor that is not present. If this was the case then 1336 * it would have been checked before the VM-exit. 1337 */ 1338 KASSERT(SEG_DESC_PRESENT(desc->access), 1339 ("segment %d not present: %#x", seg, desc->access)); 1340 1341 /* 1342 * The descriptor type must indicate a code/data segment. 1343 */ 1344 type = SEG_DESC_TYPE(desc->access); 1345 KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " 1346 "descriptor type %#x", seg, type)); 1347 1348 if (prot & PROT_READ) { 1349 /* #GP on a read access to a exec-only code segment */ 1350 if ((type & 0xA) == 0x8) 1351 return (-1); 1352 } 1353 1354 if (prot & PROT_WRITE) { 1355 /* 1356 * #GP on a write access to a code segment or a 1357 * read-only data segment. 1358 */ 1359 if (type & 0x8) /* code segment */ 1360 return (-1); 1361 1362 if ((type & 0xA) == 0) /* read-only data seg */ 1363 return (-1); 1364 } 1365 1366 /* 1367 * 'desc->limit' is fully expanded taking granularity into 1368 * account. 1369 */ 1370 if ((type & 0xC) == 0x4) { 1371 /* expand-down data segment */ 1372 low_limit = desc->limit + 1; 1373 high_limit = SEG_DESC_DEF32(desc->access) ? 1374 0xffffffff : 0xffff; 1375 } else { 1376 /* code segment or expand-up data segment */ 1377 low_limit = 0; 1378 high_limit = desc->limit; 1379 } 1380 1381 while (length > 0) { 1382 offset &= vie_size2mask(addrsize); 1383 if (offset < low_limit || offset > high_limit) 1384 return (-1); 1385 offset++; 1386 length--; 1387 } 1388 } 1389 1390 /* 1391 * In 64-bit mode all segments except %fs and %gs have a segment 1392 * base address of 0. 1393 */ 1394 if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && 1395 seg != VM_REG_GUEST_GS) { 1396 segbase = 0; 1397 } else { 1398 segbase = desc->base; 1399 } 1400 1401 /* 1402 * Truncate 'firstoff' to the effective address size before adding 1403 * it to the segment base. 1404 */ 1405 firstoff &= vie_size2mask(addrsize); 1406 *gla = (segbase + firstoff) & vie_size2mask(glasize); 1407 return (0); 1408 } 1409 1410 #ifdef _KERNEL 1411 void 1412 vie_init(struct vie *vie, const char *inst_bytes, int inst_length) 1413 { 1414 KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE, 1415 ("%s: invalid instruction length (%d)", __func__, inst_length)); 1416 1417 bzero(vie, sizeof(struct vie)); 1418 1419 vie->base_register = VM_REG_LAST; 1420 vie->index_register = VM_REG_LAST; 1421 vie->segment_register = VM_REG_LAST; 1422 1423 if (inst_length) { 1424 bcopy(inst_bytes, vie->inst, inst_length); 1425 vie->num_valid = inst_length; 1426 } 1427 } 1428 1429 static int 1430 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) 1431 { 1432 int error_code = 0; 1433 1434 if (pte & PG_V) 1435 error_code |= PGEX_P; 1436 if (prot & VM_PROT_WRITE) 1437 error_code |= PGEX_W; 1438 if (usermode) 1439 error_code |= PGEX_U; 1440 if (rsvd) 1441 error_code |= PGEX_RSV; 1442 if (prot & VM_PROT_EXECUTE) 1443 error_code |= PGEX_I; 1444 1445 return (error_code); 1446 } 1447 1448 static void 1449 ptp_release(void **cookie) 1450 { 1451 if (*cookie != NULL) { 1452 vm_gpa_release(*cookie); 1453 *cookie = NULL; 1454 } 1455 } 1456 1457 static void * 1458 ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie) 1459 { 1460 void *ptr; 1461 1462 ptp_release(cookie); 1463 ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie); 1464 return (ptr); 1465 } 1466 1467 int 1468 vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 1469 uint64_t gla, int prot, uint64_t *gpa) 1470 { 1471 int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; 1472 u_int retries; 1473 uint64_t *ptpbase, ptpphys, pte, pgsize; 1474 uint32_t *ptpbase32, pte32; 1475 void *cookie; 1476 1477 usermode = (paging->cpl == 3 ? 1 : 0); 1478 writable = prot & VM_PROT_WRITE; 1479 cookie = NULL; 1480 retval = 0; 1481 retries = 0; 1482 restart: 1483 ptpphys = paging->cr3; /* root of the page tables */ 1484 ptp_release(&cookie); 1485 if (retries++ > 0) 1486 maybe_yield(); 1487 1488 if (vie_canonical_check(paging->cpu_mode, gla)) { 1489 /* 1490 * XXX assuming a non-stack reference otherwise a stack fault 1491 * should be generated. 1492 */ 1493 vm_inject_gp(vm, vcpuid); 1494 goto fault; 1495 } 1496 1497 if (paging->paging_mode == PAGING_MODE_FLAT) { 1498 *gpa = gla; 1499 goto done; 1500 } 1501 1502 if (paging->paging_mode == PAGING_MODE_32) { 1503 nlevels = 2; 1504 while (--nlevels >= 0) { 1505 /* Zero out the lower 12 bits. */ 1506 ptpphys &= ~0xfff; 1507 1508 ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); 1509 1510 if (ptpbase32 == NULL) 1511 goto error; 1512 1513 ptpshift = PAGE_SHIFT + nlevels * 10; 1514 ptpindex = (gla >> ptpshift) & 0x3FF; 1515 pgsize = 1UL << ptpshift; 1516 1517 pte32 = ptpbase32[ptpindex]; 1518 1519 if ((pte32 & PG_V) == 0 || 1520 (usermode && (pte32 & PG_U) == 0) || 1521 (writable && (pte32 & PG_RW) == 0)) { 1522 pfcode = pf_error_code(usermode, prot, 0, 1523 pte32); 1524 vm_inject_pf(vm, vcpuid, pfcode, gla); 1525 goto fault; 1526 } 1527 1528 /* 1529 * Emulate the x86 MMU's management of the accessed 1530 * and dirty flags. While the accessed flag is set 1531 * at every level of the page table, the dirty flag 1532 * is only set at the last level providing the guest 1533 * physical address. 1534 */ 1535 if ((pte32 & PG_A) == 0) { 1536 if (atomic_cmpset_32(&ptpbase32[ptpindex], 1537 pte32, pte32 | PG_A) == 0) { 1538 goto restart; 1539 } 1540 } 1541 1542 /* XXX must be ignored if CR4.PSE=0 */ 1543 if (nlevels > 0 && (pte32 & PG_PS) != 0) 1544 break; 1545 1546 ptpphys = pte32; 1547 } 1548 1549 /* Set the dirty bit in the page table entry if necessary */ 1550 if (writable && (pte32 & PG_M) == 0) { 1551 if (atomic_cmpset_32(&ptpbase32[ptpindex], 1552 pte32, pte32 | PG_M) == 0) { 1553 goto restart; 1554 } 1555 } 1556 1557 /* Zero out the lower 'ptpshift' bits */ 1558 pte32 >>= ptpshift; pte32 <<= ptpshift; 1559 *gpa = pte32 | (gla & (pgsize - 1)); 1560 goto done; 1561 } 1562 1563 if (paging->paging_mode == PAGING_MODE_PAE) { 1564 /* Zero out the lower 5 bits and the upper 32 bits */ 1565 ptpphys &= 0xffffffe0UL; 1566 1567 ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie); 1568 if (ptpbase == NULL) 1569 goto error; 1570 1571 ptpindex = (gla >> 30) & 0x3; 1572 1573 pte = ptpbase[ptpindex]; 1574 1575 if ((pte & PG_V) == 0) { 1576 pfcode = pf_error_code(usermode, prot, 0, pte); 1577 vm_inject_pf(vm, vcpuid, pfcode, gla); 1578 goto fault; 1579 } 1580 1581 ptpphys = pte; 1582 1583 nlevels = 2; 1584 } else 1585 nlevels = 4; 1586 while (--nlevels >= 0) { 1587 /* Zero out the lower 12 bits and the upper 12 bits */ 1588 ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; 1589 1590 ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); 1591 if (ptpbase == NULL) 1592 goto error; 1593 1594 ptpshift = PAGE_SHIFT + nlevels * 9; 1595 ptpindex = (gla >> ptpshift) & 0x1FF; 1596 pgsize = 1UL << ptpshift; 1597 1598 pte = ptpbase[ptpindex]; 1599 1600 if ((pte & PG_V) == 0 || 1601 (usermode && (pte & PG_U) == 0) || 1602 (writable && (pte & PG_RW) == 0)) { 1603 pfcode = pf_error_code(usermode, prot, 0, pte); 1604 vm_inject_pf(vm, vcpuid, pfcode, gla); 1605 goto fault; 1606 } 1607 1608 /* Set the accessed bit in the page table entry */ 1609 if ((pte & PG_A) == 0) { 1610 if (atomic_cmpset_64(&ptpbase[ptpindex], 1611 pte, pte | PG_A) == 0) { 1612 goto restart; 1613 } 1614 } 1615 1616 if (nlevels > 0 && (pte & PG_PS) != 0) { 1617 if (pgsize > 1 * GB) { 1618 pfcode = pf_error_code(usermode, prot, 1, pte); 1619 vm_inject_pf(vm, vcpuid, pfcode, gla); 1620 goto fault; 1621 } 1622 break; 1623 } 1624 1625 ptpphys = pte; 1626 } 1627 1628 /* Set the dirty bit in the page table entry if necessary */ 1629 if (writable && (pte & PG_M) == 0) { 1630 if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) 1631 goto restart; 1632 } 1633 1634 /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ 1635 pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; 1636 *gpa = pte | (gla & (pgsize - 1)); 1637 done: 1638 ptp_release(&cookie); 1639 return (retval); 1640 error: 1641 retval = -1; 1642 goto done; 1643 fault: 1644 retval = 1; 1645 goto done; 1646 } 1647 1648 int 1649 vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 1650 uint64_t rip, int inst_length, struct vie *vie) 1651 { 1652 struct vm_copyinfo copyinfo[2]; 1653 int error, prot; 1654 1655 if (inst_length > VIE_INST_SIZE) 1656 panic("vmm_fetch_instruction: invalid length %d", inst_length); 1657 1658 prot = PROT_READ | PROT_EXEC; 1659 error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot, 1660 copyinfo, nitems(copyinfo)); 1661 if (error == 0) { 1662 vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length); 1663 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); 1664 vie->num_valid = inst_length; 1665 } 1666 return (error); 1667 } 1668 1669 static int 1670 vie_peek(struct vie *vie, uint8_t *x) 1671 { 1672 1673 if (vie->num_processed < vie->num_valid) { 1674 *x = vie->inst[vie->num_processed]; 1675 return (0); 1676 } else 1677 return (-1); 1678 } 1679 1680 static void 1681 vie_advance(struct vie *vie) 1682 { 1683 1684 vie->num_processed++; 1685 } 1686 1687 static bool 1688 segment_override(uint8_t x, int *seg) 1689 { 1690 1691 switch (x) { 1692 case 0x2E: 1693 *seg = VM_REG_GUEST_CS; 1694 break; 1695 case 0x36: 1696 *seg = VM_REG_GUEST_SS; 1697 break; 1698 case 0x3E: 1699 *seg = VM_REG_GUEST_DS; 1700 break; 1701 case 0x26: 1702 *seg = VM_REG_GUEST_ES; 1703 break; 1704 case 0x64: 1705 *seg = VM_REG_GUEST_FS; 1706 break; 1707 case 0x65: 1708 *seg = VM_REG_GUEST_GS; 1709 break; 1710 default: 1711 return (false); 1712 } 1713 return (true); 1714 } 1715 1716 static int 1717 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) 1718 { 1719 uint8_t x; 1720 1721 while (1) { 1722 if (vie_peek(vie, &x)) 1723 return (-1); 1724 1725 if (x == 0x66) 1726 vie->opsize_override = 1; 1727 else if (x == 0x67) 1728 vie->addrsize_override = 1; 1729 else if (x == 0xF3) 1730 vie->repz_present = 1; 1731 else if (x == 0xF2) 1732 vie->repnz_present = 1; 1733 else if (segment_override(x, &vie->segment_register)) 1734 vie->segment_override = 1; 1735 else 1736 break; 1737 1738 vie_advance(vie); 1739 } 1740 1741 /* 1742 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: 1743 * - Only one REX prefix is allowed per instruction. 1744 * - The REX prefix must immediately precede the opcode byte or the 1745 * escape opcode byte. 1746 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) 1747 * the mandatory prefix must come before the REX prefix. 1748 */ 1749 if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { 1750 vie->rex_present = 1; 1751 vie->rex_w = x & 0x8 ? 1 : 0; 1752 vie->rex_r = x & 0x4 ? 1 : 0; 1753 vie->rex_x = x & 0x2 ? 1 : 0; 1754 vie->rex_b = x & 0x1 ? 1 : 0; 1755 vie_advance(vie); 1756 } 1757 1758 /* 1759 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 1760 */ 1761 if (cpu_mode == CPU_MODE_64BIT) { 1762 /* 1763 * Default address size is 64-bits and default operand size 1764 * is 32-bits. 1765 */ 1766 vie->addrsize = vie->addrsize_override ? 4 : 8; 1767 if (vie->rex_w) 1768 vie->opsize = 8; 1769 else if (vie->opsize_override) 1770 vie->opsize = 2; 1771 else 1772 vie->opsize = 4; 1773 } else if (cs_d) { 1774 /* Default address and operand sizes are 32-bits */ 1775 vie->addrsize = vie->addrsize_override ? 2 : 4; 1776 vie->opsize = vie->opsize_override ? 2 : 4; 1777 } else { 1778 /* Default address and operand sizes are 16-bits */ 1779 vie->addrsize = vie->addrsize_override ? 4 : 2; 1780 vie->opsize = vie->opsize_override ? 4 : 2; 1781 } 1782 return (0); 1783 } 1784 1785 static int 1786 decode_two_byte_opcode(struct vie *vie) 1787 { 1788 uint8_t x; 1789 1790 if (vie_peek(vie, &x)) 1791 return (-1); 1792 1793 vie->op = two_byte_opcodes[x]; 1794 1795 if (vie->op.op_type == VIE_OP_TYPE_NONE) 1796 return (-1); 1797 1798 vie_advance(vie); 1799 return (0); 1800 } 1801 1802 static int 1803 decode_opcode(struct vie *vie) 1804 { 1805 uint8_t x; 1806 1807 if (vie_peek(vie, &x)) 1808 return (-1); 1809 1810 vie->op = one_byte_opcodes[x]; 1811 1812 if (vie->op.op_type == VIE_OP_TYPE_NONE) 1813 return (-1); 1814 1815 vie_advance(vie); 1816 1817 if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) 1818 return (decode_two_byte_opcode(vie)); 1819 1820 return (0); 1821 } 1822 1823 static int 1824 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) 1825 { 1826 uint8_t x; 1827 1828 if (cpu_mode == CPU_MODE_REAL) 1829 return (-1); 1830 1831 if (vie->op.op_flags & VIE_OP_F_NO_MODRM) 1832 return (0); 1833 1834 if (vie_peek(vie, &x)) 1835 return (-1); 1836 1837 vie->mod = (x >> 6) & 0x3; 1838 vie->rm = (x >> 0) & 0x7; 1839 vie->reg = (x >> 3) & 0x7; 1840 1841 /* 1842 * A direct addressing mode makes no sense in the context of an EPT 1843 * fault. There has to be a memory access involved to cause the 1844 * EPT fault. 1845 */ 1846 if (vie->mod == VIE_MOD_DIRECT) 1847 return (-1); 1848 1849 if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || 1850 (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { 1851 /* 1852 * Table 2-5: Special Cases of REX Encodings 1853 * 1854 * mod=0, r/m=5 is used in the compatibility mode to 1855 * indicate a disp32 without a base register. 1856 * 1857 * mod!=3, r/m=4 is used in the compatibility mode to 1858 * indicate that the SIB byte is present. 1859 * 1860 * The 'b' bit in the REX prefix is don't care in 1861 * this case. 1862 */ 1863 } else { 1864 vie->rm |= (vie->rex_b << 3); 1865 } 1866 1867 vie->reg |= (vie->rex_r << 3); 1868 1869 /* SIB */ 1870 if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) 1871 goto done; 1872 1873 vie->base_register = gpr_map[vie->rm]; 1874 1875 switch (vie->mod) { 1876 case VIE_MOD_INDIRECT_DISP8: 1877 vie->disp_bytes = 1; 1878 break; 1879 case VIE_MOD_INDIRECT_DISP32: 1880 vie->disp_bytes = 4; 1881 break; 1882 case VIE_MOD_INDIRECT: 1883 if (vie->rm == VIE_RM_DISP32) { 1884 vie->disp_bytes = 4; 1885 /* 1886 * Table 2-7. RIP-Relative Addressing 1887 * 1888 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32 1889 * whereas in compatibility mode it just implies disp32. 1890 */ 1891 1892 if (cpu_mode == CPU_MODE_64BIT) 1893 vie->base_register = VM_REG_GUEST_RIP; 1894 else 1895 vie->base_register = VM_REG_LAST; 1896 } 1897 break; 1898 } 1899 1900 done: 1901 vie_advance(vie); 1902 1903 return (0); 1904 } 1905 1906 static int 1907 decode_sib(struct vie *vie) 1908 { 1909 uint8_t x; 1910 1911 /* Proceed only if SIB byte is present */ 1912 if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) 1913 return (0); 1914 1915 if (vie_peek(vie, &x)) 1916 return (-1); 1917 1918 /* De-construct the SIB byte */ 1919 vie->ss = (x >> 6) & 0x3; 1920 vie->index = (x >> 3) & 0x7; 1921 vie->base = (x >> 0) & 0x7; 1922 1923 /* Apply the REX prefix modifiers */ 1924 vie->index |= vie->rex_x << 3; 1925 vie->base |= vie->rex_b << 3; 1926 1927 switch (vie->mod) { 1928 case VIE_MOD_INDIRECT_DISP8: 1929 vie->disp_bytes = 1; 1930 break; 1931 case VIE_MOD_INDIRECT_DISP32: 1932 vie->disp_bytes = 4; 1933 break; 1934 } 1935 1936 if (vie->mod == VIE_MOD_INDIRECT && 1937 (vie->base == 5 || vie->base == 13)) { 1938 /* 1939 * Special case when base register is unused if mod = 0 1940 * and base = %rbp or %r13. 1941 * 1942 * Documented in: 1943 * Table 2-3: 32-bit Addressing Forms with the SIB Byte 1944 * Table 2-5: Special Cases of REX Encodings 1945 */ 1946 vie->disp_bytes = 4; 1947 } else { 1948 vie->base_register = gpr_map[vie->base]; 1949 } 1950 1951 /* 1952 * All encodings of 'index' are valid except for %rsp (4). 1953 * 1954 * Documented in: 1955 * Table 2-3: 32-bit Addressing Forms with the SIB Byte 1956 * Table 2-5: Special Cases of REX Encodings 1957 */ 1958 if (vie->index != 4) 1959 vie->index_register = gpr_map[vie->index]; 1960 1961 /* 'scale' makes sense only in the context of an index register */ 1962 if (vie->index_register < VM_REG_LAST) 1963 vie->scale = 1 << vie->ss; 1964 1965 vie_advance(vie); 1966 1967 return (0); 1968 } 1969 1970 static int 1971 decode_displacement(struct vie *vie) 1972 { 1973 int n, i; 1974 uint8_t x; 1975 1976 union { 1977 char buf[4]; 1978 int8_t signed8; 1979 int32_t signed32; 1980 } u; 1981 1982 if ((n = vie->disp_bytes) == 0) 1983 return (0); 1984 1985 if (n != 1 && n != 4) 1986 panic("decode_displacement: invalid disp_bytes %d", n); 1987 1988 for (i = 0; i < n; i++) { 1989 if (vie_peek(vie, &x)) 1990 return (-1); 1991 1992 u.buf[i] = x; 1993 vie_advance(vie); 1994 } 1995 1996 if (n == 1) 1997 vie->displacement = u.signed8; /* sign-extended */ 1998 else 1999 vie->displacement = u.signed32; /* sign-extended */ 2000 2001 return (0); 2002 } 2003 2004 static int 2005 decode_immediate(struct vie *vie) 2006 { 2007 int i, n; 2008 uint8_t x; 2009 union { 2010 char buf[4]; 2011 int8_t signed8; 2012 int16_t signed16; 2013 int32_t signed32; 2014 } u; 2015 2016 /* Figure out immediate operand size (if any) */ 2017 if (vie->op.op_flags & VIE_OP_F_IMM) { 2018 /* 2019 * Section 2.2.1.5 "Immediates", Intel SDM: 2020 * In 64-bit mode the typical size of immediate operands 2021 * remains 32-bits. When the operand size if 64-bits, the 2022 * processor sign-extends all immediates to 64-bits prior 2023 * to their use. 2024 */ 2025 if (vie->opsize == 4 || vie->opsize == 8) 2026 vie->imm_bytes = 4; 2027 else 2028 vie->imm_bytes = 2; 2029 } else if (vie->op.op_flags & VIE_OP_F_IMM8) { 2030 vie->imm_bytes = 1; 2031 } 2032 2033 if ((n = vie->imm_bytes) == 0) 2034 return (0); 2035 2036 KASSERT(n == 1 || n == 2 || n == 4, 2037 ("%s: invalid number of immediate bytes: %d", __func__, n)); 2038 2039 for (i = 0; i < n; i++) { 2040 if (vie_peek(vie, &x)) 2041 return (-1); 2042 2043 u.buf[i] = x; 2044 vie_advance(vie); 2045 } 2046 2047 /* sign-extend the immediate value before use */ 2048 if (n == 1) 2049 vie->immediate = u.signed8; 2050 else if (n == 2) 2051 vie->immediate = u.signed16; 2052 else 2053 vie->immediate = u.signed32; 2054 2055 return (0); 2056 } 2057 2058 static int 2059 decode_moffset(struct vie *vie) 2060 { 2061 int i, n; 2062 uint8_t x; 2063 union { 2064 char buf[8]; 2065 uint64_t u64; 2066 } u; 2067 2068 if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) 2069 return (0); 2070 2071 /* 2072 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: 2073 * The memory offset size follows the address-size of the instruction. 2074 */ 2075 n = vie->addrsize; 2076 KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); 2077 2078 u.u64 = 0; 2079 for (i = 0; i < n; i++) { 2080 if (vie_peek(vie, &x)) 2081 return (-1); 2082 2083 u.buf[i] = x; 2084 vie_advance(vie); 2085 } 2086 vie->displacement = u.u64; 2087 return (0); 2088 } 2089 2090 /* 2091 * Verify that all the bytes in the instruction buffer were consumed. 2092 */ 2093 static int 2094 verify_inst_length(struct vie *vie) 2095 { 2096 2097 if (vie->num_processed) 2098 return (0); 2099 else 2100 return (-1); 2101 } 2102 2103 /* 2104 * Verify that the 'guest linear address' provided as collateral of the nested 2105 * page table fault matches with our instruction decoding. 2106 */ 2107 static int 2108 verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) 2109 { 2110 int error; 2111 uint64_t base, idx, gla2; 2112 2113 /* Skip 'gla' verification */ 2114 if (gla == VIE_INVALID_GLA) 2115 return (0); 2116 2117 base = 0; 2118 if (vie->base_register != VM_REG_LAST) { 2119 error = vm_get_register(vm, cpuid, vie->base_register, &base); 2120 if (error) { 2121 printf("verify_gla: error %d getting base reg %d\n", 2122 error, vie->base_register); 2123 return (-1); 2124 } 2125 2126 /* 2127 * RIP-relative addressing starts from the following 2128 * instruction 2129 */ 2130 if (vie->base_register == VM_REG_GUEST_RIP) 2131 base += vie->num_valid; 2132 } 2133 2134 idx = 0; 2135 if (vie->index_register != VM_REG_LAST) { 2136 error = vm_get_register(vm, cpuid, vie->index_register, &idx); 2137 if (error) { 2138 printf("verify_gla: error %d getting index reg %d\n", 2139 error, vie->index_register); 2140 return (-1); 2141 } 2142 } 2143 2144 /* XXX assuming that the base address of the segment is 0 */ 2145 gla2 = base + vie->scale * idx + vie->displacement; 2146 gla2 &= size2mask[vie->addrsize]; 2147 if (gla != gla2) { 2148 printf("verify_gla mismatch: " 2149 "base(0x%0lx), scale(%d), index(0x%0lx), " 2150 "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", 2151 base, vie->scale, idx, vie->displacement, gla, gla2); 2152 return (-1); 2153 } 2154 2155 return (0); 2156 } 2157 2158 int 2159 vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, 2160 enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) 2161 { 2162 2163 if (decode_prefixes(vie, cpu_mode, cs_d)) 2164 return (-1); 2165 2166 if (decode_opcode(vie)) 2167 return (-1); 2168 2169 if (decode_modrm(vie, cpu_mode)) 2170 return (-1); 2171 2172 if (decode_sib(vie)) 2173 return (-1); 2174 2175 if (decode_displacement(vie)) 2176 return (-1); 2177 2178 if (decode_immediate(vie)) 2179 return (-1); 2180 2181 if (decode_moffset(vie)) 2182 return (-1); 2183 2184 if (verify_inst_length(vie)) 2185 return (-1); 2186 2187 if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) { 2188 if (verify_gla(vm, cpuid, gla, vie)) 2189 return (-1); 2190 } 2191 2192 vie->decoded = 1; /* success */ 2193 2194 return (0); 2195 } 2196 #endif /* _KERNEL */ 2197