/*-
 * Copyright (c) 2012 Sandvine, Inc.
 * Copyright (c) 2012 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL
#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#else	/* !_KERNEL */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/_iovec.h>

#include <machine/vmm.h>

#include <assert.h>
#include <vmmapi.h>
#define	KASSERT(exp,msg)	assert((exp))
#endif	/* _KERNEL */

#include <machine/vmm_instruction_emul.h>
#include <x86/psl.h>
#include <x86/specialreg.h>

/* struct vie_op.op_type */
enum {
	VIE_OP_TYPE_NONE = 0,
	VIE_OP_TYPE_MOV,
	VIE_OP_TYPE_MOVSX,
	VIE_OP_TYPE_MOVZX,
	VIE_OP_TYPE_AND,
	VIE_OP_TYPE_OR,
	VIE_OP_TYPE_TWO_BYTE,
	VIE_OP_TYPE_PUSH,
	VIE_OP_TYPE_CMP,
	VIE_OP_TYPE_LAST
};

/* struct vie_op.op_flags */
#define	VIE_OP_F_IMM		(1 << 0)	/* 16/32-bit immediate operand */
#define	VIE_OP_F_IMM8		(1 << 1)	/* 8-bit immediate operand */
#define	VIE_OP_F_MOFFSET	(1 << 2)	/* 16/32/64-bit immediate moffset */
#define	VIE_OP_F_NO_MODRM	(1 << 3)

static const struct vie_op two_byte_opcodes[256] = {
	[0xB6] = {
		.op_byte = 0xB6,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xBE] = {
		.op_byte = 0xBE,
		.op_type = VIE_OP_TYPE_MOVSX,
	},
};

static const struct vie_op one_byte_opcodes[256] = {
	[0x0F] = {
		.op_byte = 0x0F,
		.op_type = VIE_OP_TYPE_TWO_BYTE
	},
	[0x3B] = {
		.op_byte = 0x3B,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x88] = {
		.op_byte = 0x88,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x89] = {
		.op_byte = 0x89,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8A] = {
		.op_byte = 0x8A,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8B] = {
		.op_byte = 0x8B,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0xA1] = {
		.op_byte = 0xA1,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA3] = {
		.op_byte = 0xA3,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xC6] = {
		/* XXX Group 11 extended opcode - not just MOV */
		.op_byte = 0xC6,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xC7] = {
		.op_byte = 0xC7,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x23] = {
		.op_byte = 0x23,
		.op_type = VIE_OP_TYPE_AND,
	},
	[0x81] = {
		/* XXX Group 1 extended opcode - not just AND */
		.op_byte = 0x81,
		.op_type = VIE_OP_TYPE_AND,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x83] = {
		/* XXX Group 1 extended opcode - not just OR */
		.op_byte = 0x83,
		.op_type = VIE_OP_TYPE_OR,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xFF] = {
		/* XXX Group 5 extended opcode - not just PUSH */
		.op_byte = 0xFF,
		.op_type = VIE_OP_TYPE_PUSH,
	}
};

/* struct vie.mod */
#define	VIE_MOD_INDIRECT	0
#define	VIE_MOD_INDIRECT_DISP8	1
#define	VIE_MOD_INDIRECT_DISP32	2
#define	VIE_MOD_DIRECT		3

/* struct vie.rm */
#define	VIE_RM_SIB		4
#define	VIE_RM_DISP32		5

#define	GB			(1024 * 1024 * 1024)

static enum vm_reg_name gpr_map[16] = {
	VM_REG_GUEST_RAX,
	VM_REG_GUEST_RCX,
	VM_REG_GUEST_RDX,
	VM_REG_GUEST_RBX,
	VM_REG_GUEST_RSP,
	VM_REG_GUEST_RBP,
	VM_REG_GUEST_RSI,
	VM_REG_GUEST_RDI,
	VM_REG_GUEST_R8,
	VM_REG_GUEST_R9,
	VM_REG_GUEST_R10,
	VM_REG_GUEST_R11,
	VM_REG_GUEST_R12,
	VM_REG_GUEST_R13,
	VM_REG_GUEST_R14,
	VM_REG_GUEST_R15
};

static uint64_t size2mask[] = {
	[1] = 0xff,
	[2] = 0xffff,
	[4] = 0xffffffff,
	[8] = 0xffffffffffffffff,
};

static int
vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
{
	int error;

	error = vm_get_register(vm, vcpuid, reg, rval);

	return (error);
}

static void
vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
{
	*lhbr = 0;
	*reg = gpr_map[vie->reg];

	/*
	 * 64-bit mode imposes limitations on accessing legacy high byte
	 * registers (lhbr).
	 *
	 * The legacy high-byte registers cannot be addressed if the REX
	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
	 *
	 * If the REX prefix is not present then the values 4, 5, 6 and 7
	 * of the 'ModRM:reg' field address the legacy high-byte registers,
	 * %ah, %ch, %dh and %bh respectively.
	 */
	if (!vie->rex_present) {
		if (vie->reg & 0x4) {
			*lhbr = 1;
			*reg = gpr_map[vie->reg & 0x3];
		}
	}
}

static int
vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
{
	uint64_t val;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &val);

	/*
	 * To obtain the value of a legacy high byte register shift the
	 * base register right by 8 bits (%ah = %rax >> 8).
	 */
	if (lhbr)
		*rval = val >> 8;
	else
		*rval = val;
	return (error);
}

static int
vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
{
	uint64_t origval, val, mask;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &origval);
	if (error == 0) {
		val = byte;
		mask = 0xff;
		if (lhbr) {
			/*
			 * Shift left by 8 to store 'byte' in a legacy high
			 * byte register.
			 */
			val <<= 8;
			mask <<= 8;
		}
		val |= origval & ~mask;
		error = vm_set_register(vm, vcpuid, reg, val);
	}
	return (error);
}

int
vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
    uint64_t val, int size)
{
	int error;
	uint64_t origval;

	switch (size) {
	case 1:
	case 2:
		error = vie_read_register(vm, vcpuid, reg, &origval);
		if (error)
			return (error);
		val &= size2mask[size];
		val |= origval & ~size2mask[size];
		break;
	case 4:
		val &= 0xffffffffUL;
		break;
	case 8:
		break;
	default:
		return (EINVAL);
	}

	error = vm_set_register(vm, vcpuid, reg, val);
	return (error);
}

/*
 * Return the status flags that would result from doing (x - y).
 */
static u_long
getcc16(uint16_t x, uint16_t y)
{
	u_long rflags;

	__asm __volatile("sub %1,%2; pushfq; popq %0" :
	    "=r" (rflags) : "m" (y), "r" (x));
	return (rflags);
}

static u_long
getcc32(uint32_t x, uint32_t y)
{
	u_long rflags;

	__asm __volatile("sub %1,%2; pushfq; popq %0" :
	    "=r" (rflags) : "m" (y), "r" (x));
	return (rflags);
}

static u_long
getcc64(uint64_t x, uint64_t y)
{
	u_long rflags;

	__asm __volatile("sub %1,%2; pushfq; popq %0" :
	    "=r" (rflags) : "m" (y), "r" (x));
	return (rflags);
}

static u_long
getcc(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 2 || opsize == 4 || opsize == 8,
	    ("getcc: invalid operand size %d", opsize));

	if (opsize == 2)
		return (getcc16(x, y));
	else if (opsize == 4)
		return (getcc32(x, y));
	else
		return (getcc64(x, y));
}

static int
emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint8_t byte;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x88:
		/*
		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 88/r:	mov r/m8, r8
		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
		 */
		size = 1;	/* override for byte operation */
		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
		if (error == 0)
			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
		break;
	case 0x89:
		/*
		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 89/r:	mov r/m16, r16
		 * 89/r:	mov r/m32, r32
		 * REX.W + 89/r	mov r/m64, r64
		 */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vm, vcpuid, gpa, val, size, arg);
		}
		break;
	case 0x8A:
		/*
		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8A/r:	mov r8, r/m8
		 * REX + 8A/r:	mov r8, r/m8
		 */
		size = 1;	/* override for byte operation */
		error = memread(vm, vcpuid, gpa, &val, size, arg);
		if (error == 0)
			error = vie_write_bytereg(vm, vcpuid, vie, val);
		break;
	case 0x8B:
		/*
		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8B/r:	mov r16, r/m16
		 * 8B/r:	mov r32, r/m32
		 * REX.W 8B/r:	mov r64, r/m64
		 */
		error = memread(vm, vcpuid, gpa, &val, size, arg);
		if (error == 0) {
			reg = gpr_map[vie->reg];
			error = vie_update_register(vm, vcpuid, reg, val, size);
		}
		break;
	case 0xA1:
		/*
		 * MOV from seg:moffset to AX/EAX/RAX
		 * A1:		mov AX, moffs16
		 * A1:		mov EAX, moffs32
		 * REX.W + A1:	mov RAX, moffs64
		 */
		error = memread(vm, vcpuid, gpa, &val, size, arg);
		if (error == 0) {
			reg = VM_REG_GUEST_RAX;
			error = vie_update_register(vm, vcpuid, reg, val, size);
		}
		break;
	case 0xA3:
		/*
		 * MOV from AX/EAX/RAX to seg:moffset
		 * A3:		mov moffs16, AX
		 * A3:		mov moffs32, EAX
		 * REX.W + A3:	mov moffs64, RAX
		 */
		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vm, vcpuid, gpa, val, size, arg);
		}
		break;
	case 0xC6:
		/*
		 * MOV from imm8 to mem (ModRM:r/m)
		 * C6/0		mov r/m8, imm8
		 * REX + C6/0	mov r/m8, imm8
		 */
		size = 1;	/* override for byte operation */
		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
		break;
	case 0xC7:
		/*
		 * MOV from imm16/imm32 to mem (ModRM:r/m)
		 * C7/0		mov r/m16, imm16
		 * C7/0		mov r/m32, imm32
		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
		 */
		val = vie->immediate & size2mask[size];
		error = memwrite(vm, vcpuid, gpa, val, size, arg);
		break;
	default:
		break;
	}

	return (error);
}

static int
emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite,
    void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xB6:
		/*
		 * MOV and zero extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B6/r		movzx r16, r/m8
		 * 0F B6/r		movzx r32, r/m8
		 * REX.W + 0F B6/r	movzx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* zero-extend byte */
		val = (uint8_t)val;

		/* write the result */
		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	case 0xBE:
		/*
		 * MOV and sign extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F BE/r		movsx r16, r/m8
		 * 0F BE/r		movsx r32, r/m8
		 * REX.W + 0F BE/r	movsx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* sign extend byte */
		val = (int8_t)val;

		/* write the result */
		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	default:
		break;
	}
	return (error);
}

static int
emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t val1, val2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x23:
		/*
		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
		 * result in reg.
		 *
		 * 23/r		and r16, r/m16
		 * 23/r		and r32, r/m32
		 * REX.W + 23/r	and r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vm, vcpuid, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		val1 &= val2;
		error = vie_update_register(vm, vcpuid, reg, val1, size);
		break;
	case 0x81:
		/*
		 * AND mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 81 /4		and r/m16, imm16
		 * 81 /4		and r/m32, imm32
		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
		 *
		 * Currently, only the AND operation of the 0x81 opcode
		 * is implemented (ModRM:reg = b100).
		 */
		if ((vie->reg & 7) != 4)
			break;

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val1, size, arg);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		val1 &= vie->immediate;
		error = memwrite(vm, vcpuid, gpa, val1, size, arg);
		break;
	default:
		break;
	}
	return (error);
}

static int
emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	uint64_t val1;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x83:
		/*
		 * OR mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 83 /1		OR r/m16, imm8 sign-extended to 16
		 * 83 /1		OR r/m32, imm8 sign-extended to 32
		 * REX.W + 83/1		OR r/m64, imm8 sign-extended to 64
		 *
		 * Currently, only the OR operation of the 0x83 opcode
		 * is implemented (ModRM:reg = b001).
		 */
		if ((vie->reg & 7) != 1)
			break;

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val1, size, arg);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		val1 |= vie->immediate;
		error = memwrite(vm, vcpuid, gpa, val1, size, arg);
		break;
	default:
		break;
	}
	return (error);
}

#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)

static int
emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	uint64_t op1, op2, rflags, rflags2;
	enum vm_reg_name reg;

	size = vie->opsize;
	switch (vie->op.op_byte) {
	case 0x3B:
		/*
		 * 3B/r		CMP r16, r/m16
		 * 3B/r		CMP r32, r/m32
		 * REX.W + 3B/r	CMP r64, r/m64
		 *
		 * Compare first operand (reg) with second operand (r/m) and
		 * set status flags in EFLAGS register. The comparison is
		 * performed by subtracting the second operand from the first
		 * operand and then setting the status flags.
		 */

		/* Get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &op1);
		if (error)
			return (error);

		/* Get the second operand */
		error = memread(vm, vcpuid, gpa, &op2, size, arg);
		if (error)
			return (error);

		break;
	default:
		return (EINVAL);
	}
	rflags2 = getcc(size, op1, op2);
	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & RFLAGS_STATUS_BITS;

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
#ifdef _KERNEL
	struct vm_copyinfo copyinfo[2];
#else
	struct iovec copyinfo[2];
#endif
	struct seg_desc ss_desc;
	uint64_t cr0, rflags, rsp, stack_gla, val;
	int error, size, stackaddrsize;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * PUSH is part of the group 5 extended opcodes and is identified
	 * by ModRM:reg = b110.
	 */
	if ((vie->reg & 7) != 6)
		return (EINVAL);

	size = vie->opsize;
	/*
	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
	 */
	if (paging->cpu_mode == CPU_MODE_REAL) {
		stackaddrsize = 2;
	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
		/*
		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
		 * - Stack pointer size is always 64-bits.
		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
		 * - 16-bit PUSH/POP is supported by using the operand size
		 *   override prefix (66H).
		 */
		stackaddrsize = 8;
		size = vie->opsize_override ? 2 : 8;
	} else {
		/*
		 * In protected or compatibility mode the 'B' flag in the
		 * stack-segment descriptor determines the size of the
		 * stack pointer.
		 */
		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
		    __func__, error));
		if (SEG_DESC_DEF32(ss_desc.access))
			stackaddrsize = 4;
		else
			stackaddrsize = 2;
	}

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));

	rsp -= size;
	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
	    rsp, size, stackaddrsize, PROT_WRITE, &stack_gla)) {
		vm_inject_ss(vm, vcpuid, 0);
		return (0);
	}

	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
		vm_inject_ss(vm, vcpuid, 0);
		return (0);
	}

	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
		vm_inject_ac(vm, vcpuid, 0);
		return (0);
	}

	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, PROT_WRITE,
	    copyinfo, nitems(copyinfo));
	if (error == -1) {
		/*
		 * XXX cannot return a negative error value here because it
		 * ends up being the return value of the VM_RUN() ioctl and
		 * is interpreted as a pseudo-error (e.g. ERESTART).
		 */
		return (EFAULT);
	} else if (error == 1) {
		/* Resume guest execution to handle page fault */
		return (0);
	}

	error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
	if (error == 0) {
		vm_copyout(vm, vcpuid, &val, copyinfo, size);
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
		    stackaddrsize);
		KASSERT(error == 0, ("error %d updating rsp", error));
	}
#ifdef _KERNEL
	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
#endif
	return (error);
}

int
vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *memarg)
{
	int error;

	if (!vie->decoded)
		return (EINVAL);

	switch (vie->op.op_type) {
	case VIE_OP_TYPE_PUSH:
		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_CMP:
		error = emulate_cmp(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOV:
		error = emulate_mov(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOVSX:
	case VIE_OP_TYPE_MOVZX:
		error = emulate_movx(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_AND:
		error = emulate_and(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_OR:
		error = emulate_or(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("%s: invalid size %d", __func__, size));
	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));

	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
		return (0);

	return ((gla & (size - 1)) ? 1 : 0);
}
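
/*
 * Return 0 if 'gla' is a canonical linear address for the given CPU mode
 * (i.e. bits 63:48 are a copy of bit 47) and a non-zero value otherwise.
 * The check only applies in 64-bit mode.
 */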

int
vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
{
	uint64_t mask;

	if (cpu_mode != CPU_MODE_64BIT)
		return (0);

	/*
	 * The value of bit 47 in the 'gla' should be replicated in the
	 * most significant 16 bits.
	 */
	mask = ~((1UL << 48) - 1);
	if (gla & (1UL << 47))
		return ((gla & mask) != mask);
	else
		return ((gla & mask) != 0);
}

uint64_t
vie_size2mask(int size)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("vie_size2mask: invalid size %d", size));
	return (size2mask[size]);
}

int
vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
	    ("%s: invalid segment %d", __func__, seg));
	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
	    ("%s: invalid operand size %d", __func__, length));
	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
	    ("%s: invalid prot %#x", __func__, prot));

	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
		glasize = 8;
	} else {
		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
		glasize = 4;
		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		KASSERT(SEG_DESC_PRESENT(desc->access),
		    ("segment %d not present: %#x", seg, desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
		    "descriptor type %#x", seg, type));

		if (prot & PROT_READ) {
			/* #GP on a read access to an exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)			/* code segment */
				return (-1);

			if ((type & 0xA) == 0)		/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

		while (length > 0) {
			offset &= vie_size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}

	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= vie_size2mask(addrsize);
	*gla = (segbase + firstoff) & vie_size2mask(glasize);
	return (0);
}

#ifdef _KERNEL
void
vie_init(struct vie *vie)
{

	bzero(vie, sizeof(struct vie));

	vie->base_register = VM_REG_LAST;
	vie->index_register = VM_REG_LAST;
}

static int
pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
{
	int error_code = 0;

	if (pte & PG_V)
		error_code |= PGEX_P;
	if (prot & VM_PROT_WRITE)
		error_code |= PGEX_W;
	if (usermode)
		error_code |= PGEX_U;
	if (rsvd)
		error_code |= PGEX_RSV;
	if (prot & VM_PROT_EXECUTE)
		error_code |= PGEX_I;

	return (error_code);
}

static void
ptp_release(void **cookie)
{
	if (*cookie != NULL) {
		vm_gpa_release(*cookie);
		*cookie = NULL;
	}
}

static void *
ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie)
{
	void *ptr;

	ptp_release(cookie);
	ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie);
	return (ptr);
}

int
vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa)
{
	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
	u_int retries;
	uint64_t *ptpbase, ptpphys, pte, pgsize;
	uint32_t *ptpbase32, pte32;
	void *cookie;

	usermode = (paging->cpl == 3 ? 1 : 0);
	writable = prot & VM_PROT_WRITE;
	cookie = NULL;
	retval = 0;
	retries = 0;
restart:
	ptpphys = paging->cr3;		/* root of the page tables */
	ptp_release(&cookie);
	if (retries++ > 0)
		maybe_yield();

	if (vie_canonical_check(paging->cpu_mode, gla)) {
		/*
		 * XXX assuming a non-stack reference otherwise a stack fault
		 * should be generated.
		 */
		vm_inject_gp(vm, vcpuid);
		goto fault;
	}

	if (paging->paging_mode == PAGING_MODE_FLAT) {
		*gpa = gla;
		goto done;
	}

	if (paging->paging_mode == PAGING_MODE_32) {
		nlevels = 2;
		while (--nlevels >= 0) {
			/* Zero out the lower 12 bits. */
			ptpphys &= ~0xfff;

			ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);

			if (ptpbase32 == NULL)
				goto error;

			ptpshift = PAGE_SHIFT + nlevels * 10;
			ptpindex = (gla >> ptpshift) & 0x3FF;
			pgsize = 1UL << ptpshift;

			pte32 = ptpbase32[ptpindex];

			if ((pte32 & PG_V) == 0 ||
			    (usermode && (pte32 & PG_U) == 0) ||
			    (writable && (pte32 & PG_RW) == 0)) {
				pfcode = pf_error_code(usermode, prot, 0,
				    pte32);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
				goto fault;
			}

			/*
			 * Emulate the x86 MMU's management of the accessed
			 * and dirty flags. While the accessed flag is set
			 * at every level of the page table, the dirty flag
			 * is only set at the last level providing the guest
			 * physical address.
			 */
			if ((pte32 & PG_A) == 0) {
				if (atomic_cmpset_32(&ptpbase32[ptpindex],
				    pte32, pte32 | PG_A) == 0) {
					goto restart;
				}
			}

			/* XXX must be ignored if CR4.PSE=0 */
			if (nlevels > 0 && (pte32 & PG_PS) != 0)
				break;

			ptpphys = pte32;
		}

		/* Set the dirty bit in the page table entry if necessary */
		if (writable && (pte32 & PG_M) == 0) {
			if (atomic_cmpset_32(&ptpbase32[ptpindex],
			    pte32, pte32 | PG_M) == 0) {
				goto restart;
			}
		}

		/* Zero out the lower 'ptpshift' bits */
		pte32 >>= ptpshift; pte32 <<= ptpshift;
		*gpa = pte32 | (gla & (pgsize - 1));
		goto done;
	}

	if (paging->paging_mode == PAGING_MODE_PAE) {
		/* Zero out the lower 5 bits and the upper 32 bits */
		ptpphys &= 0xffffffe0UL;

		ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpindex = (gla >> 30) & 0x3;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0) {
			pfcode = pf_error_code(usermode, prot, 0, pte);
			vm_inject_pf(vm, vcpuid, pfcode, gla);
			goto fault;
		}

		ptpphys = pte;

		nlevels = 2;
	} else
		nlevels = 4;
	while (--nlevels >= 0) {
		/* Zero out the lower 12 bits and the upper 12 bits */
		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;

		ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gla >> ptpshift) & 0x1FF;
		pgsize = 1UL << ptpshift;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0 ||
		    (usermode && (pte & PG_U) == 0) ||
		    (writable && (pte & PG_RW) == 0)) {
			pfcode = pf_error_code(usermode, prot, 0, pte);
			vm_inject_pf(vm, vcpuid, pfcode, gla);
			goto fault;
		}

		/* Set the accessed bit in the page table entry */
		if ((pte & PG_A) == 0) {
			if (atomic_cmpset_64(&ptpbase[ptpindex],
			    pte, pte | PG_A) == 0) {
				goto restart;
			}
		}

		if (nlevels > 0 && (pte & PG_PS) != 0) {
			if (pgsize > 1 * GB) {
				pfcode = pf_error_code(usermode, prot, 1, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
				goto fault;
			}
			break;
		}

		ptpphys = pte;
	}

	/* Set the dirty bit in the page table entry if necessary */
	if (writable && (pte & PG_M) == 0) {
		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
			goto restart;
	}

	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
	*gpa = pte | (gla & (pgsize - 1));
done:
	ptp_release(&cookie);
	return (retval);
error:
	retval = -1;
	goto done;
fault:
	retval = 1;
	goto done;
}

int
vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t rip, int inst_length, struct vie *vie)
{
	struct vm_copyinfo copyinfo[2];
	int error, prot;

	if (inst_length > VIE_INST_SIZE)
		panic("vmm_fetch_instruction: invalid length %d", inst_length);

	prot = PROT_READ | PROT_EXEC;
	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
	    copyinfo, nitems(copyinfo));
	if (error == 0) {
		vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
		vie->num_valid = inst_length;
	}
	return (error);
}

static int
vie_peek(struct vie *vie, uint8_t *x)
{

	if (vie->num_processed < vie->num_valid) {
		*x = vie->inst[vie->num_processed];
		return (0);
	} else
		return (-1);
}

static void
vie_advance(struct vie *vie)
{

	vie->num_processed++;
}

static int
decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
{
	uint8_t x;

	while (1) {
		if (vie_peek(vie, &x))
			return (-1);

		if (x == 0x66)
			vie->opsize_override = 1;
		else if (x == 0x67)
			vie->addrsize_override = 1;
		else
			break;

		vie_advance(vie);
	}

	/*
	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
	 * - Only one REX prefix is allowed per instruction.
	 * - The REX prefix must immediately precede the opcode byte or the
	 *   escape opcode byte.
	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
	 *   the mandatory prefix must come before the REX prefix.
	 */
	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
		vie->rex_present = 1;
		vie->rex_w = x & 0x8 ? 1 : 0;
		vie->rex_r = x & 0x4 ? 1 : 0;
		vie->rex_x = x & 0x2 ? 1 : 0;
		vie->rex_b = x & 0x1 ? 1 : 0;
		vie_advance(vie);
	}

	/*
	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
	 */
	if (cpu_mode == CPU_MODE_64BIT) {
		/*
		 * Default address size is 64-bits and default operand size
		 * is 32-bits.
		 */
		vie->addrsize = vie->addrsize_override ? 4 : 8;
		if (vie->rex_w)
			vie->opsize = 8;
		else if (vie->opsize_override)
			vie->opsize = 2;
		else
			vie->opsize = 4;
	} else if (cs_d) {
		/* Default address and operand sizes are 32-bits */
		vie->addrsize = vie->addrsize_override ? 2 : 4;
		vie->opsize = vie->opsize_override ? 2 : 4;
	} else {
		/* Default address and operand sizes are 16-bits */
		vie->addrsize = vie->addrsize_override ? 4 : 2;
		vie->opsize = vie->opsize_override ? 4 : 2;
	}
	return (0);
}

static int
decode_two_byte_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	vie->op = two_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);
	return (0);
}

static int
decode_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	vie->op = one_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);

	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
		return (decode_two_byte_opcode(vie));

	return (0);
}

static int
decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
{
	uint8_t x;

	if (cpu_mode == CPU_MODE_REAL)
		return (-1);

	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
		return (0);

	if (vie_peek(vie, &x))
		return (-1);

	vie->mod = (x >> 6) & 0x3;
	vie->rm = (x >> 0) & 0x7;
	vie->reg = (x >> 3) & 0x7;

	/*
	 * A direct addressing mode makes no sense in the context of an EPT
	 * fault. There has to be a memory access involved to cause the
	 * EPT fault.
	 */
	if (vie->mod == VIE_MOD_DIRECT)
		return (-1);

	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
		/*
		 * Table 2-5: Special Cases of REX Encodings
		 *
		 * mod=0, r/m=5 is used in the compatibility mode to
		 * indicate a disp32 without a base register.
		 *
		 * mod!=3, r/m=4 is used in the compatibility mode to
		 * indicate that the SIB byte is present.
		 *
		 * The 'b' bit in the REX prefix is don't care in
		 * this case.
		 */
	} else {
		vie->rm |= (vie->rex_b << 3);
	}

	vie->reg |= (vie->rex_r << 3);

	/* SIB */
	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
		goto done;

	vie->base_register = gpr_map[vie->rm];

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	case VIE_MOD_INDIRECT:
		if (vie->rm == VIE_RM_DISP32) {
			vie->disp_bytes = 4;
			/*
			 * Table 2-7. RIP-Relative Addressing
			 *
			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
			 * whereas in compatibility mode it just implies disp32.
			 */

			if (cpu_mode == CPU_MODE_64BIT)
				vie->base_register = VM_REG_GUEST_RIP;
			else
				vie->base_register = VM_REG_LAST;
		}
		break;
	}

done:
	vie_advance(vie);

	return (0);
}

static int
decode_sib(struct vie *vie)
{
	uint8_t x;

	/* Proceed only if SIB byte is present */
	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
		return (0);

	if (vie_peek(vie, &x))
		return (-1);

	/* De-construct the SIB byte */
	vie->ss = (x >> 6) & 0x3;
	vie->index = (x >> 3) & 0x7;
	vie->base = (x >> 0) & 0x7;

	/* Apply the REX prefix modifiers */
	vie->index |= vie->rex_x << 3;
	vie->base |= vie->rex_b << 3;

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	}

	if (vie->mod == VIE_MOD_INDIRECT &&
	    (vie->base == 5 || vie->base == 13)) {
		/*
		 * Special case when base register is unused if mod = 0
		 * and base = %rbp or %r13.
		 *
		 * Documented in:
		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
		 * Table 2-5: Special Cases of REX Encodings
		 */
		vie->disp_bytes = 4;
	} else {
		vie->base_register = gpr_map[vie->base];
	}

	/*
	 * All encodings of 'index' are valid except for %rsp (4).
	 *
	 * Documented in:
	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
	 * Table 2-5: Special Cases of REX Encodings
	 */
	if (vie->index != 4)
		vie->index_register = gpr_map[vie->index];

	/* 'scale' makes sense only in the context of an index register */
	if (vie->index_register < VM_REG_LAST)
		vie->scale = 1 << vie->ss;

	vie_advance(vie);

	return (0);
}

static int
decode_displacement(struct vie *vie)
{
	int n, i;
	uint8_t x;

	union {
		char	buf[4];
		int8_t	signed8;
		int32_t	signed32;
	} u;

	if ((n = vie->disp_bytes) == 0)
		return (0);

	if (n != 1 && n != 4)
		panic("decode_displacement: invalid disp_bytes %d", n);

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	if (n == 1)
		vie->displacement = u.signed8;		/* sign-extended */
	else
		vie->displacement = u.signed32;		/* sign-extended */

	return (0);
}

static int
decode_immediate(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[4];
		int8_t	signed8;
		int16_t	signed16;
		int32_t	signed32;
	} u;

	/* Figure out immediate operand size (if any) */
	if (vie->op.op_flags & VIE_OP_F_IMM) {
		/*
		 * Section 2.2.1.5 "Immediates", Intel SDM:
		 * In 64-bit mode the typical size of immediate operands
		 * remains 32-bits. When the operand size is 64-bits, the
		 * processor sign-extends all immediates to 64-bits prior
		 * to their use.
		 */
		if (vie->opsize == 4 || vie->opsize == 8)
			vie->imm_bytes = 4;
		else
			vie->imm_bytes = 2;
	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
		vie->imm_bytes = 1;
	}

	if ((n = vie->imm_bytes) == 0)
		return (0);

	KASSERT(n == 1 || n == 2 || n == 4,
	    ("%s: invalid number of immediate bytes: %d", __func__, n));

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	/* sign-extend the immediate value before use */
	if (n == 1)
		vie->immediate = u.signed8;
	else if (n == 2)
		vie->immediate = u.signed16;
	else
		vie->immediate = u.signed32;

	return (0);
}

static int
decode_moffset(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[8];
		uint64_t u64;
	} u;

	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
		return (0);

	/*
	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
	 * The memory offset size follows the address-size of the instruction.
	 */
	n = vie->addrsize;
	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));

	u.u64 = 0;
	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}
	vie->displacement = u.u64;
	return (0);
}

/*
 * Verify that all the bytes in the instruction buffer were consumed.
 */
static int
verify_inst_length(struct vie *vie)
{

	if (vie->num_processed == vie->num_valid)
		return (0);
	else
		return (-1);
}

/*
 * Verify that the 'guest linear address' provided as collateral of the nested
 * page table fault matches with our instruction decoding.
 */
static int
verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
{
	int error;
	uint64_t base, idx, gla2;

	/* Skip 'gla' verification */
	if (gla == VIE_INVALID_GLA)
		return (0);

	base = 0;
	if (vie->base_register != VM_REG_LAST) {
		error = vm_get_register(vm, cpuid, vie->base_register, &base);
		if (error) {
			printf("verify_gla: error %d getting base reg %d\n",
			    error, vie->base_register);
			return (-1);
		}

		/*
		 * RIP-relative addressing starts from the following
		 * instruction
		 */
		if (vie->base_register == VM_REG_GUEST_RIP)
			base += vie->num_valid;
	}

	idx = 0;
	if (vie->index_register != VM_REG_LAST) {
		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
		if (error) {
			printf("verify_gla: error %d getting index reg %d\n",
			    error, vie->index_register);
			return (-1);
		}
	}

	/* XXX assuming that the base address of the segment is 0 */
	gla2 = base + vie->scale * idx + vie->displacement;
	gla2 &= size2mask[vie->addrsize];
	if (gla != gla2) {
		printf("verify_gla mismatch: "
		    "base(0x%0lx), scale(%d), index(0x%0lx), "
		    "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
		    base, vie->scale, idx, vie->displacement, gla, gla2);
		return (-1);
	}

	return (0);
}

int
vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
    enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
{

	if (decode_prefixes(vie, cpu_mode, cs_d))
		return (-1);

	if (decode_opcode(vie))
		return (-1);

	if (decode_modrm(vie, cpu_mode))
		return (-1);

	if (decode_sib(vie))
		return (-1);

	if (decode_displacement(vie))
		return (-1);

	if (decode_immediate(vie))
		return (-1);

	if (decode_moffset(vie))
		return (-1);

	if (verify_inst_length(vie))
		return (-1);

	if (verify_gla(vm, cpuid, gla, vie))
		return (-1);

	vie->decoded = 1;	/* success */

	return (0);
}
#endif	/* _KERNEL */