/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2012 Sandvine, Inc.
 * Copyright (c) 2012 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#ifdef _KERNEL
#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>

#include <dev/vmm/vmm_mem.h>
#else	/* !_KERNEL */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/_iovec.h>

#include <machine/vmm.h>

#include <err.h>
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <vmmapi.h>
#define	__diagused
#define	KASSERT(exp,msg)	assert((exp))
#define	panic(...)		errx(4, __VA_ARGS__)
#endif	/* _KERNEL */

#include <machine/vmm_instruction_emul.h>
#include <x86/psl.h>
#include <x86/specialreg.h>

/* struct vie_op.op_type */
enum {
	VIE_OP_TYPE_NONE = 0,
	VIE_OP_TYPE_MOV,
	VIE_OP_TYPE_MOVSX,
	VIE_OP_TYPE_MOVZX,
	VIE_OP_TYPE_AND,
	VIE_OP_TYPE_OR,
	VIE_OP_TYPE_SUB,
	VIE_OP_TYPE_TWO_BYTE,
	VIE_OP_TYPE_PUSH,
	VIE_OP_TYPE_CMP,
	VIE_OP_TYPE_POP,
	VIE_OP_TYPE_MOVS,
	VIE_OP_TYPE_GROUP1,
	VIE_OP_TYPE_STOS,
	VIE_OP_TYPE_BITTEST,
	VIE_OP_TYPE_TWOB_GRP15,
	VIE_OP_TYPE_ADD,
	VIE_OP_TYPE_TEST,
	VIE_OP_TYPE_BEXTR,
	VIE_OP_TYPE_LAST
};

/* struct vie_op.op_flags */
#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
#define	VIE_OP_F_NO_MODRM	(1 << 3)
#define	VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)

static const struct vie_op three_byte_opcodes_0f38[256] = {
	[0xF7] = {
		.op_byte = 0xF7,
		.op_type = VIE_OP_TYPE_BEXTR,
	},
};

static const struct vie_op two_byte_opcodes[256] = {
	[0xAE] = {
		.op_byte = 0xAE,
		.op_type = VIE_OP_TYPE_TWOB_GRP15,
	},
	[0xB6] = {
		.op_byte = 0xB6,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xB7] = {
		.op_byte = 0xB7,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xBA] = {
		.op_byte = 0xBA,
		.op_type = VIE_OP_TYPE_BITTEST,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xBE] = {
		.op_byte = 0xBE,
		.op_type = VIE_OP_TYPE_MOVSX,
	},
};
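/*
 * Illustrative note (added for clarity, not part of the original sources):
 * these tables are indexed directly by opcode byte, so a decoder that has
 * already consumed the prefix bytes can be sketched roughly as below.  The
 * helper name and the exact decode flow are assumptions for illustration
 * only; the real decode logic lives elsewhere in this file.
 */
#if 0
static const struct vie_op *
example_lookup_opcode(const uint8_t *p)
{
	/*
	 * "0F 38 xx" selects the three-byte table, "0F xx" the two-byte
	 * table, and anything else the one-byte table declared below.
	 */
	if (p[0] == 0x0F && p[1] == 0x38)
		return (&three_byte_opcodes_0f38[p[2]]);
	else if (p[0] == 0x0F)
		return (&two_byte_opcodes[p[1]]);
	else
		return (&one_byte_opcodes[p[0]]);
}
#endif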
static const struct vie_op one_byte_opcodes[256] = {
	[0x03] = {
		.op_byte = 0x03,
		.op_type = VIE_OP_TYPE_ADD,
	},
	[0x0F] = {
		.op_byte = 0x0F,
		.op_type = VIE_OP_TYPE_TWO_BYTE
	},
	[0x0B] = {
		.op_byte = 0x0B,
		.op_type = VIE_OP_TYPE_OR,
	},
	[0x2B] = {
		.op_byte = 0x2B,
		.op_type = VIE_OP_TYPE_SUB,
	},
	[0x39] = {
		.op_byte = 0x39,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x3B] = {
		.op_byte = 0x3B,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x88] = {
		.op_byte = 0x88,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x89] = {
		.op_byte = 0x89,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8A] = {
		.op_byte = 0x8A,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8B] = {
		.op_byte = 0x8B,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0xA1] = {
		.op_byte = 0xA1,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA3] = {
		.op_byte = 0xA3,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA4] = {
		.op_byte = 0xA4,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xA5] = {
		.op_byte = 0xA5,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAA] = {
		.op_byte = 0xAA,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAB] = {
		.op_byte = 0xAB,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xC6] = {
		/* XXX Group 11 extended opcode - not just MOV */
		.op_byte = 0xC6,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xC7] = {
		.op_byte = 0xC7,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x23] = {
		.op_byte = 0x23,
		.op_type = VIE_OP_TYPE_AND,
	},
	[0x80] = {
		/* Group 1 extended opcode */
		.op_byte = 0x80,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x81] = {
		/* Group 1 extended opcode */
		.op_byte = 0x81,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x83] = {
		/* Group 1 extended opcode */
		.op_byte = 0x83,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x8F] = {
		/* XXX Group 1A extended opcode - not just POP */
		.op_byte = 0x8F,
		.op_type = VIE_OP_TYPE_POP,
	},
	[0xF6] = {
		/* XXX Group 3 extended opcode - not just TEST */
		.op_byte = 0xF6,
		.op_type = VIE_OP_TYPE_TEST,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xF7] = {
		/* XXX Group 3 extended opcode - not just TEST */
		.op_byte = 0xF7,
		.op_type = VIE_OP_TYPE_TEST,
		.op_flags = VIE_OP_F_IMM,
	},
	[0xFF] = {
		/* XXX Group 5 extended opcode - not just PUSH */
		.op_byte = 0xFF,
		.op_type = VIE_OP_TYPE_PUSH,
	}
};

/* struct vie.mod */
#define	VIE_MOD_INDIRECT	0
#define	VIE_MOD_INDIRECT_DISP8	1
#define	VIE_MOD_INDIRECT_DISP32	2
#define	VIE_MOD_DIRECT		3

/* struct vie.rm */
#define	VIE_RM_SIB		4
#define	VIE_RM_DISP32		5

#define	GB			(1024 * 1024 * 1024)

static enum vm_reg_name gpr_map[16] = {
	VM_REG_GUEST_RAX,
	VM_REG_GUEST_RCX,
	VM_REG_GUEST_RDX,
	VM_REG_GUEST_RBX,
	VM_REG_GUEST_RSP,
	VM_REG_GUEST_RBP,
	VM_REG_GUEST_RSI,
	VM_REG_GUEST_RDI,
	VM_REG_GUEST_R8,
	VM_REG_GUEST_R9,
	VM_REG_GUEST_R10,
	VM_REG_GUEST_R11,
	VM_REG_GUEST_R12,
	VM_REG_GUEST_R13,
	VM_REG_GUEST_R14,
	VM_REG_GUEST_R15
};

static uint64_t size2mask[] = {
	[1] = 0xff,
	[2] = 0xffff,
	[4] = 0xffffffff,
	[8] = 0xffffffffffffffff,
};

static int
vie_read_register(struct vcpu *vcpu, enum vm_reg_name reg, uint64_t *rval)
{
	int error;

	error = vm_get_register(vcpu, reg, rval);

	return (error);
}

static void
vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
{
	*lhbr = 0;
	*reg = gpr_map[vie->reg];

	/*
	 * 64-bit mode imposes limitations on accessing legacy high byte
	 * registers (lhbr).
	 *
	 * The legacy high-byte registers cannot be addressed if the REX
	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
	 *
	 * If the REX prefix is not present then the values 4, 5, 6 and 7
	 * of the 'ModRM:reg' field address the legacy high-byte registers,
	 * %ah, %ch, %dh and %bh respectively.
	 */
	if (!vie->rex_present) {
		if (vie->reg & 0x4) {
			*lhbr = 1;
			*reg = gpr_map[vie->reg & 0x3];
		}
	}
}
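/*
 * Worked example (added for clarity, not part of the original code): with
 * ModRM:reg = 5 and no REX prefix, vie_calc_bytereg() reports lhbr = 1 and
 * gpr_map[5 & 0x3] = VM_REG_GUEST_RCX, i.e. the operand is %ch.  With a REX
 * prefix present the same encoding selects gpr_map[5] = VM_REG_GUEST_RBP,
 * i.e. %bpl, and lhbr stays 0.
 */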
static int
vie_read_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t *rval)
{
	uint64_t val;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vcpu, reg, &val);

	/*
	 * To obtain the value of a legacy high byte register shift the
	 * base register right by 8 bits (%ah = %rax >> 8).
	 */
	if (lhbr)
		*rval = val >> 8;
	else
		*rval = val;
	return (error);
}

static int
vie_write_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t byte)
{
	uint64_t origval, val, mask;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vcpu, reg, &origval);
	if (error == 0) {
		val = byte;
		mask = 0xff;
		if (lhbr) {
			/*
			 * Shift left by 8 to store 'byte' in a legacy high
			 * byte register.
			 */
			val <<= 8;
			mask <<= 8;
		}
		val |= origval & ~mask;
		error = vm_set_register(vcpu, reg, val);
	}
	return (error);
}

int
vie_update_register(struct vcpu *vcpu, enum vm_reg_name reg,
    uint64_t val, int size)
{
	int error;
	uint64_t origval;

	switch (size) {
	case 1:
	case 2:
		error = vie_read_register(vcpu, reg, &origval);
		if (error)
			return (error);
		val &= size2mask[size];
		val |= origval & ~size2mask[size];
		break;
	case 4:
		val &= 0xffffffffUL;
		break;
	case 8:
		break;
	default:
		return (EINVAL);
	}

	error = vm_set_register(vcpu, reg, val);
	return (error);
}

#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)

/*
 * Return the status flags that would result from doing (x - y).
 */
#define	GETCC(sz)							\
static u_long								\
getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETCC(8);
GETCC(16);
GETCC(32);
GETCC(64);

static u_long
getcc(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getcc: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getcc8(x, y));
	else if (opsize == 2)
		return (getcc16(x, y));
	else if (opsize == 4)
		return (getcc32(x, y));
	else
		return (getcc64(x, y));
}

/*
 * Macro creation of functions getaddflags{8,16,32,64}
 */
#define	GETADDFLAGS(sz)							\
static u_long								\
getaddflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("add %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETADDFLAGS(8);
GETADDFLAGS(16);
GETADDFLAGS(32);
GETADDFLAGS(64);

static u_long
getaddflags(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getaddflags: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getaddflags8(x, y));
	else if (opsize == 2)
		return (getaddflags16(x, y));
	else if (opsize == 4)
		return (getaddflags32(x, y));
	else
		return (getaddflags64(x, y));
}
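/*
 * Worked example (added for clarity, not part of the original code): the
 * GETCC/GETADDFLAGS helpers compute guest-visible status flags by running
 * the same arithmetic on the host and capturing %rflags with pushfq/popq.
 * For instance getcc(1, 0x10, 0x20) performs an 8-bit "sub"; the result
 * 0xf0 leaves PSL_C and PSL_N set and PSL_Z clear.  Callers mask the
 * returned value with RFLAGS_STATUS_BITS before merging it into the
 * guest's %rflags.
 */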
/*
 * Return the status flags that would result from doing (x & y).
 */
#define	GETANDFLAGS(sz)							\
static u_long								\
getandflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("and %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETANDFLAGS(8);
GETANDFLAGS(16);
GETANDFLAGS(32);
GETANDFLAGS(64);

static u_long
getandflags(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getandflags: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getandflags8(x, y));
	else if (opsize == 2)
		return (getandflags16(x, y));
	else if (opsize == 4)
		return (getandflags32(x, y));
	else
		return (getandflags64(x, y));
}
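/*
 * Illustrative sketch (added for clarity, not part of the original sources):
 * every emulate_*() routine below funnels its memory traffic through the
 * memread/memwrite callbacks, so a caller such as a device model only has to
 * supply a pair of handlers matching the call pattern used here.  The device
 * structure and register layout below are hypothetical and only show the
 * shape of such handlers.
 */
#if 0
static int
example_mmio_read(struct vcpu *vcpu, uint64_t gpa, uint64_t *rval, int size,
    void *arg)
{
	struct example_device *dev = arg;	/* hypothetical device state */

	/* Return the register value backing this guest-physical address. */
	*rval = dev->regs[(gpa - dev->base) / sizeof(uint64_t)];
	return (0);
}

static int
example_mmio_write(struct vcpu *vcpu, uint64_t gpa, uint64_t wval, int size,
    void *arg)
{
	struct example_device *dev = arg;	/* hypothetical device state */

	dev->regs[(gpa - dev->base) / sizeof(uint64_t)] = wval;
	return (0);
}
#endif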
static int
emulate_mov(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint8_t byte;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x88:
		/*
		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 88/r:	mov r/m8, r8
		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
		 */
		size = 1;	/* override for byte operation */
		error = vie_read_bytereg(vcpu, vie, &byte);
		if (error == 0)
			error = memwrite(vcpu, gpa, byte, size, arg);
		break;
	case 0x89:
		/*
		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 89/r:	mov r/m16, r16
		 * 89/r:	mov r/m32, r32
		 * REX.W + 89/r	mov r/m64, r64
		 */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vcpu, reg, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vcpu, gpa, val, size, arg);
		}
		break;
	case 0x8A:
		/*
		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8A/r:	mov r8, r/m8
		 * REX + 8A/r:	mov r8, r/m8
		 */
		size = 1;	/* override for byte operation */
		error = memread(vcpu, gpa, &val, size, arg);
		if (error == 0)
			error = vie_write_bytereg(vcpu, vie, val);
		break;
	case 0x8B:
		/*
		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8B/r:	mov r16, r/m16
		 * 8B/r:	mov r32, r/m32
		 * REX.W 8B/r:	mov r64, r/m64
		 */
		error = memread(vcpu, gpa, &val, size, arg);
		if (error == 0) {
			reg = gpr_map[vie->reg];
			error = vie_update_register(vcpu, reg, val, size);
		}
		break;
	case 0xA1:
		/*
		 * MOV from seg:moffset to AX/EAX/RAX
		 * A1:		mov AX, moffs16
		 * A1:		mov EAX, moffs32
		 * REX.W + A1:	mov RAX, moffs64
		 */
		error = memread(vcpu, gpa, &val, size, arg);
		if (error == 0) {
			reg = VM_REG_GUEST_RAX;
			error = vie_update_register(vcpu, reg, val, size);
		}
		break;
	case 0xA3:
		/*
		 * MOV from AX/EAX/RAX to seg:moffset
		 * A3:		mov moffs16, AX
		 * A3:		mov moffs32, EAX
		 * REX.W + A3:	mov moffs64, RAX
		 */
		error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vcpu, gpa, val, size, arg);
		}
		break;
	case 0xC6:
		/*
		 * MOV from imm8 to mem (ModRM:r/m)
		 * C6/0		mov r/m8, imm8
		 * REX + C6/0	mov r/m8, imm8
		 */
		size = 1;	/* override for byte operation */
		error = memwrite(vcpu, gpa, vie->immediate, size, arg);
		break;
	case 0xC7:
		/*
		 * MOV from imm16/imm32 to mem (ModRM:r/m)
		 * C7/0		mov r/m16, imm16
		 * C7/0		mov r/m32, imm32
		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
		 */
		val = vie->immediate & size2mask[size];
		error = memwrite(vcpu, gpa, val, size, arg);
		break;
	default:
		break;
	}

	return (error);
}

static int
emulate_movx(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xB6:
		/*
		 * MOV and zero extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B6/r		movzx r16, r/m8
		 * 0F B6/r		movzx r32, r/m8
		 * REX.W + 0F B6/r	movzx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vcpu, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* zero-extend byte */
		val = (uint8_t)val;

		/* write the result */
		error = vie_update_register(vcpu, reg, val, size);
		break;
	case 0xB7:
		/*
		 * MOV and zero extend word from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B7/r		movzx r32, r/m16
		 * REX.W + 0F B7/r	movzx r64, r/m16
		 */
		error = memread(vcpu, gpa, &val, 2, arg);
		if (error)
			return (error);

		reg = gpr_map[vie->reg];

		/* zero-extend word */
		val = (uint16_t)val;

		error = vie_update_register(vcpu, reg, val, size);
		break;
	case 0xBE:
		/*
		 * MOV and sign extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F BE/r		movsx r16, r/m8
		 * 0F BE/r		movsx r32, r/m8
		 * REX.W + 0F BE/r	movsx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vcpu, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* sign extend byte */
		val = (int8_t)val;

		/* write the result */
		error = vie_update_register(vcpu, reg, val, size);
		break;
	default:
		break;
	}
	return (error);
}
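/*
 * Worked example (added for clarity, not part of the original code): for
 * "movzx r32, r/m8" (0F B6) where the MMIO read returns 0x80, the code above
 * zero-extends the byte to 0x80 before vie_update_register() stores it.  The
 * sign-extending form "movsx r32, r/m8" (0F BE) turns the same byte into
 * 0xffffffffffffff80, which vie_update_register() then truncates to the
 * 32-bit operand size (0xffffff80).
 */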
/*
 * Helper function to calculate and validate a linear address.
 */
static int
get_gla(struct vcpu *vcpu, struct vie *vie __unused,
    struct vm_guest_paging *paging, int opsize, int addrsize, int prot,
    enum vm_reg_name seg, enum vm_reg_name gpr, uint64_t *gla, int *fault)
{
	struct seg_desc desc;
	uint64_t cr0, val, rflags;
	int error __diagused;

	error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vm_get_seg_desc(vcpu, seg, &desc);
	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
	    __func__, error, seg));

	error = vie_read_register(vcpu, gpr, &val);
	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
	    error, gpr));

	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
	    addrsize, prot, gla)) {
		if (seg == VM_REG_GUEST_SS)
			vm_inject_ss(vcpu, 0);
		else
			vm_inject_gp(vcpu);
		goto guest_fault;
	}

	if (vie_canonical_check(paging->cpu_mode, *gla)) {
		if (seg == VM_REG_GUEST_SS)
			vm_inject_ss(vcpu, 0);
		else
			vm_inject_gp(vcpu);
		goto guest_fault;
	}

	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
		vm_inject_ac(vcpu, 0);
		goto guest_fault;
	}

	*fault = 0;
	return (0);

guest_fault:
	*fault = 1;
	return (0);
}

static int
emulate_movs(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
#ifdef _KERNEL
	struct vm_copyinfo copyinfo[2];
#else
	struct iovec copyinfo[2];
#endif
	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
	uint64_t rcx, rdi, rsi, rflags;
	int error, fault, opsize, seg, repeat;

	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
	val = 0;
	error = 0;

	/*
	 * XXX although the MOVS instruction is only supposed to be used with
	 * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
	 *
	 * Empirically the "repnz" prefix has identical behavior to "rep"
	 * and the zero flag does not make a difference.
	 */
	repeat = vie->repz_present | vie->repnz_present;

	if (repeat) {
		error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx);
		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));

		/*
		 * The count register is %rcx, %ecx or %cx depending on the
		 * address size of the instruction.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
			error = 0;
			goto done;
		}
	}

	/*
	 *	Source		Destination	Comments
	 *	--------------------------------------------
	 * (1)  memory		memory		n/a
	 * (2)  memory		mmio		emulated
	 * (3)  mmio		memory		emulated
	 * (4)  mmio		mmio		emulated
	 *
	 * At this point we don't have sufficient information to distinguish
	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
	 * out because it will succeed only when operating on regular memory.
	 *
	 * XXX the emulation doesn't properly handle the case where 'gpa'
	 * is straddling the boundary between the normal memory and MMIO.
	 */

	seg = vie->segment_override ?
vie->segment_register : VM_REG_GUEST_DS; 828 error = get_gla(vcpu, vie, paging, opsize, vie->addrsize, 829 PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault); 830 if (error || fault) 831 goto done; 832 833 error = vm_copy_setup(vcpu, paging, srcaddr, opsize, PROT_READ, 834 copyinfo, nitems(copyinfo), &fault); 835 if (error == 0) { 836 if (fault) 837 goto done; /* Resume guest to handle fault */ 838 839 /* 840 * case (2): read from system memory and write to mmio. 841 */ 842 vm_copyin(copyinfo, &val, opsize); 843 vm_copy_teardown(copyinfo, nitems(copyinfo)); 844 error = memwrite(vcpu, gpa, val, opsize, arg); 845 if (error) 846 goto done; 847 } else { 848 /* 849 * 'vm_copy_setup()' is expected to fail for cases (3) and (4) 850 * if 'srcaddr' is in the mmio space. 851 */ 852 853 error = get_gla(vcpu, vie, paging, opsize, vie->addrsize, 854 PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr, 855 &fault); 856 if (error || fault) 857 goto done; 858 859 error = vm_copy_setup(vcpu, paging, dstaddr, opsize, 860 PROT_WRITE, copyinfo, nitems(copyinfo), &fault); 861 if (error == 0) { 862 if (fault) 863 goto done; /* Resume guest to handle fault */ 864 865 /* 866 * case (3): read from MMIO and write to system memory. 867 * 868 * A MMIO read can have side-effects so we 869 * commit to it only after vm_copy_setup() is 870 * successful. If a page-fault needs to be 871 * injected into the guest then it will happen 872 * before the MMIO read is attempted. 873 */ 874 error = memread(vcpu, gpa, &val, opsize, arg); 875 if (error) 876 goto done; 877 878 vm_copyout(&val, copyinfo, opsize); 879 vm_copy_teardown(copyinfo, nitems(copyinfo)); 880 } else { 881 /* 882 * Case (4): read from and write to mmio. 883 * 884 * Commit to the MMIO read/write (with potential 885 * side-effects) only after we are sure that the 886 * instruction is not going to be restarted due 887 * to address translation faults. 888 */ 889 error = vm_gla2gpa(vcpu, paging, srcaddr, 890 PROT_READ, &srcgpa, &fault); 891 if (error || fault) 892 goto done; 893 894 error = vm_gla2gpa(vcpu, paging, dstaddr, 895 PROT_WRITE, &dstgpa, &fault); 896 if (error || fault) 897 goto done; 898 899 error = memread(vcpu, srcgpa, &val, opsize, arg); 900 if (error) 901 goto done; 902 903 error = memwrite(vcpu, dstgpa, val, opsize, arg); 904 if (error) 905 goto done; 906 } 907 } 908 909 error = vie_read_register(vcpu, VM_REG_GUEST_RSI, &rsi); 910 KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); 911 912 error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi); 913 KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); 914 915 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); 916 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 917 918 if (rflags & PSL_D) { 919 rsi -= opsize; 920 rdi -= opsize; 921 } else { 922 rsi += opsize; 923 rdi += opsize; 924 } 925 926 error = vie_update_register(vcpu, VM_REG_GUEST_RSI, rsi, 927 vie->addrsize); 928 KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error)); 929 930 error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi, 931 vie->addrsize); 932 KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); 933 934 if (repeat) { 935 rcx = rcx - 1; 936 error = vie_update_register(vcpu, VM_REG_GUEST_RCX, 937 rcx, vie->addrsize); 938 KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); 939 940 /* 941 * Repeat the instruction if the count register is not zero. 
942 */ 943 if ((rcx & vie_size2mask(vie->addrsize)) != 0) 944 vm_restart_instruction(vcpu); 945 } 946 done: 947 KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d", 948 __func__, error)); 949 return (error); 950 } 951 952 static int 953 emulate_stos(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 954 struct vm_guest_paging *paging __unused, mem_region_read_t memread __unused, 955 mem_region_write_t memwrite, void *arg) 956 { 957 int error, opsize, repeat; 958 uint64_t val; 959 uint64_t rcx, rdi, rflags; 960 961 opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize; 962 repeat = vie->repz_present | vie->repnz_present; 963 964 if (repeat) { 965 error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx); 966 KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); 967 968 /* 969 * The count register is %rcx, %ecx or %cx depending on the 970 * address size of the instruction. 971 */ 972 if ((rcx & vie_size2mask(vie->addrsize)) == 0) 973 return (0); 974 } 975 976 error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val); 977 KASSERT(!error, ("%s: error %d getting rax", __func__, error)); 978 979 error = memwrite(vcpu, gpa, val, opsize, arg); 980 if (error) 981 return (error); 982 983 error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi); 984 KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); 985 986 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); 987 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 988 989 if (rflags & PSL_D) 990 rdi -= opsize; 991 else 992 rdi += opsize; 993 994 error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi, 995 vie->addrsize); 996 KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); 997 998 if (repeat) { 999 rcx = rcx - 1; 1000 error = vie_update_register(vcpu, VM_REG_GUEST_RCX, 1001 rcx, vie->addrsize); 1002 KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); 1003 1004 /* 1005 * Repeat the instruction if the count register is not zero. 1006 */ 1007 if ((rcx & vie_size2mask(vie->addrsize)) != 0) 1008 vm_restart_instruction(vcpu); 1009 } 1010 1011 return (0); 1012 } 1013 1014 static int 1015 emulate_and(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1016 mem_region_read_t memread, mem_region_write_t memwrite, void *arg) 1017 { 1018 int error, size; 1019 enum vm_reg_name reg; 1020 uint64_t result, rflags, rflags2, val1, val2; 1021 1022 size = vie->opsize; 1023 error = EINVAL; 1024 1025 switch (vie->op.op_byte) { 1026 case 0x23: 1027 /* 1028 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the 1029 * result in reg. 1030 * 1031 * 23/r and r16, r/m16 1032 * 23/r and r32, r/m32 1033 * REX.W + 23/r and r64, r/m64 1034 */ 1035 1036 /* get the first operand */ 1037 reg = gpr_map[vie->reg]; 1038 error = vie_read_register(vcpu, reg, &val1); 1039 if (error) 1040 break; 1041 1042 /* get the second operand */ 1043 error = memread(vcpu, gpa, &val2, size, arg); 1044 if (error) 1045 break; 1046 1047 /* perform the operation and write the result */ 1048 result = val1 & val2; 1049 error = vie_update_register(vcpu, reg, result, size); 1050 break; 1051 case 0x81: 1052 case 0x83: 1053 /* 1054 * AND mem (ModRM:r/m) with immediate and store the 1055 * result in mem. 
1056 * 1057 * 81 /4 and r/m16, imm16 1058 * 81 /4 and r/m32, imm32 1059 * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 1060 * 1061 * 83 /4 and r/m16, imm8 sign-extended to 16 1062 * 83 /4 and r/m32, imm8 sign-extended to 32 1063 * REX.W + 83/4 and r/m64, imm8 sign-extended to 64 1064 */ 1065 1066 /* get the first operand */ 1067 error = memread(vcpu, gpa, &val1, size, arg); 1068 if (error) 1069 break; 1070 1071 /* 1072 * perform the operation with the pre-fetched immediate 1073 * operand and write the result 1074 */ 1075 result = val1 & vie->immediate; 1076 error = memwrite(vcpu, gpa, result, size, arg); 1077 break; 1078 default: 1079 break; 1080 } 1081 if (error) 1082 return (error); 1083 1084 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); 1085 if (error) 1086 return (error); 1087 1088 /* 1089 * OF and CF are cleared; the SF, ZF and PF flags are set according 1090 * to the result; AF is undefined. 1091 * 1092 * The updated status flags are obtained by subtracting 0 from 'result'. 1093 */ 1094 rflags2 = getcc(size, result, 0); 1095 rflags &= ~RFLAGS_STATUS_BITS; 1096 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); 1097 1098 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); 1099 return (error); 1100 } 1101 1102 static int 1103 emulate_or(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1104 mem_region_read_t memread, mem_region_write_t memwrite, void *arg) 1105 { 1106 int error, size; 1107 enum vm_reg_name reg; 1108 uint64_t result, rflags, rflags2, val1, val2; 1109 1110 size = vie->opsize; 1111 error = EINVAL; 1112 1113 switch (vie->op.op_byte) { 1114 case 0x0B: 1115 /* 1116 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the 1117 * result in reg. 1118 * 1119 * 0b/r or r16, r/m16 1120 * 0b/r or r32, r/m32 1121 * REX.W + 0b/r or r64, r/m64 1122 */ 1123 1124 /* get the first operand */ 1125 reg = gpr_map[vie->reg]; 1126 error = vie_read_register(vcpu, reg, &val1); 1127 if (error) 1128 break; 1129 1130 /* get the second operand */ 1131 error = memread(vcpu, gpa, &val2, size, arg); 1132 if (error) 1133 break; 1134 1135 /* perform the operation and write the result */ 1136 result = val1 | val2; 1137 error = vie_update_register(vcpu, reg, result, size); 1138 break; 1139 case 0x81: 1140 case 0x83: 1141 /* 1142 * OR mem (ModRM:r/m) with immediate and store the 1143 * result in mem. 1144 * 1145 * 81 /1 or r/m16, imm16 1146 * 81 /1 or r/m32, imm32 1147 * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64 1148 * 1149 * 83 /1 or r/m16, imm8 sign-extended to 16 1150 * 83 /1 or r/m32, imm8 sign-extended to 32 1151 * REX.W + 83/1 or r/m64, imm8 sign-extended to 64 1152 */ 1153 1154 /* get the first operand */ 1155 error = memread(vcpu, gpa, &val1, size, arg); 1156 if (error) 1157 break; 1158 1159 /* 1160 * perform the operation with the pre-fetched immediate 1161 * operand and write the result 1162 */ 1163 result = val1 | vie->immediate; 1164 error = memwrite(vcpu, gpa, result, size, arg); 1165 break; 1166 default: 1167 break; 1168 } 1169 if (error) 1170 return (error); 1171 1172 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); 1173 if (error) 1174 return (error); 1175 1176 /* 1177 * OF and CF are cleared; the SF, ZF and PF flags are set according 1178 * to the result; AF is undefined. 1179 * 1180 * The updated status flags are obtained by subtracting 0 from 'result'. 
1181 */ 1182 rflags2 = getcc(size, result, 0); 1183 rflags &= ~RFLAGS_STATUS_BITS; 1184 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); 1185 1186 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); 1187 return (error); 1188 } 1189 1190 static int 1191 emulate_cmp(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1192 mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) 1193 { 1194 int error, size; 1195 uint64_t regop, memop, op1, op2, rflags, rflags2; 1196 enum vm_reg_name reg; 1197 1198 size = vie->opsize; 1199 switch (vie->op.op_byte) { 1200 case 0x39: 1201 case 0x3B: 1202 /* 1203 * 39/r CMP r/m16, r16 1204 * 39/r CMP r/m32, r32 1205 * REX.W 39/r CMP r/m64, r64 1206 * 1207 * 3B/r CMP r16, r/m16 1208 * 3B/r CMP r32, r/m32 1209 * REX.W + 3B/r CMP r64, r/m64 1210 * 1211 * Compare the first operand with the second operand and 1212 * set status flags in EFLAGS register. The comparison is 1213 * performed by subtracting the second operand from the first 1214 * operand and then setting the status flags. 1215 */ 1216 1217 /* Get the register operand */ 1218 reg = gpr_map[vie->reg]; 1219 error = vie_read_register(vcpu, reg, ®op); 1220 if (error) 1221 return (error); 1222 1223 /* Get the memory operand */ 1224 error = memread(vcpu, gpa, &memop, size, arg); 1225 if (error) 1226 return (error); 1227 1228 if (vie->op.op_byte == 0x3B) { 1229 op1 = regop; 1230 op2 = memop; 1231 } else { 1232 op1 = memop; 1233 op2 = regop; 1234 } 1235 rflags2 = getcc(size, op1, op2); 1236 break; 1237 case 0x80: 1238 case 0x81: 1239 case 0x83: 1240 /* 1241 * 80 /7 cmp r/m8, imm8 1242 * REX + 80 /7 cmp r/m8, imm8 1243 * 1244 * 81 /7 cmp r/m16, imm16 1245 * 81 /7 cmp r/m32, imm32 1246 * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64 1247 * 1248 * 83 /7 cmp r/m16, imm8 sign-extended to 16 1249 * 83 /7 cmp r/m32, imm8 sign-extended to 32 1250 * REX.W + 83 /7 cmp r/m64, imm8 sign-extended to 64 1251 * 1252 * Compare mem (ModRM:r/m) with immediate and set 1253 * status flags according to the results. The 1254 * comparison is performed by subtracting the 1255 * immediate from the first operand and then setting 1256 * the status flags. 1257 * 1258 */ 1259 if (vie->op.op_byte == 0x80) 1260 size = 1; 1261 1262 /* get the first operand */ 1263 error = memread(vcpu, gpa, &op1, size, arg); 1264 if (error) 1265 return (error); 1266 1267 rflags2 = getcc(size, op1, vie->immediate); 1268 break; 1269 default: 1270 return (EINVAL); 1271 } 1272 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); 1273 if (error) 1274 return (error); 1275 rflags &= ~RFLAGS_STATUS_BITS; 1276 rflags |= rflags2 & RFLAGS_STATUS_BITS; 1277 1278 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); 1279 return (error); 1280 } 1281 1282 static int 1283 emulate_test(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1284 mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) 1285 { 1286 int error, size; 1287 uint64_t op1, rflags, rflags2; 1288 1289 size = vie->opsize; 1290 error = EINVAL; 1291 1292 switch (vie->op.op_byte) { 1293 case 0xF6: 1294 /* 1295 * F6 /0 test r/m8, imm8 1296 */ 1297 size = 1; /* override for byte operation */ 1298 /* FALLTHROUGH */ 1299 case 0xF7: 1300 /* 1301 * F7 /0 test r/m16, imm16 1302 * F7 /0 test r/m32, imm32 1303 * REX.W + F7 /0 test r/m64, imm32 sign-extended to 64 1304 * 1305 * Test mem (ModRM:r/m) with immediate and set status 1306 * flags according to the results. 
The comparison is 1307 * performed by anding the immediate from the first 1308 * operand and then setting the status flags. 1309 */ 1310 if ((vie->reg & 7) != 0) 1311 return (EINVAL); 1312 1313 error = memread(vcpu, gpa, &op1, size, arg); 1314 if (error) 1315 return (error); 1316 1317 rflags2 = getandflags(size, op1, vie->immediate); 1318 break; 1319 default: 1320 return (EINVAL); 1321 } 1322 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); 1323 if (error) 1324 return (error); 1325 1326 /* 1327 * OF and CF are cleared; the SF, ZF and PF flags are set according 1328 * to the result; AF is undefined. 1329 */ 1330 rflags &= ~RFLAGS_STATUS_BITS; 1331 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); 1332 1333 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); 1334 return (error); 1335 } 1336 1337 static int 1338 emulate_bextr(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1339 struct vm_guest_paging *paging, mem_region_read_t memread, 1340 mem_region_write_t memwrite __unused, void *arg) 1341 { 1342 uint64_t src1, src2, dst, rflags; 1343 unsigned start, len, size; 1344 int error; 1345 1346 size = vie->opsize; 1347 error = EINVAL; 1348 1349 /* 1350 * VEX.LZ.0F38.W0 F7 /r BEXTR r32a, r/m32, r32b 1351 * VEX.LZ.0F38.W1 F7 /r BEXTR r64a, r/m64, r64b 1352 * 1353 * Destination operand is ModRM:reg. Source operands are ModRM:r/m and 1354 * Vex.vvvv. 1355 * 1356 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored). 1357 */ 1358 if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT) 1359 size = 4; 1360 1361 /* 1362 * Extracts contiguous bits from the first /source/ operand (second 1363 * operand) using an index and length specified in the second /source/ 1364 * operand (third operand). 1365 */ 1366 error = memread(vcpu, gpa, &src1, size, arg); 1367 if (error) 1368 return (error); 1369 error = vie_read_register(vcpu, gpr_map[vie->vex_reg], &src2); 1370 if (error) 1371 return (error); 1372 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); 1373 if (error) 1374 return (error); 1375 1376 start = (src2 & 0xff); 1377 len = (src2 & 0xff00) >> 8; 1378 1379 /* If no bits are extracted, the destination register is cleared. */ 1380 dst = 0; 1381 1382 /* If START exceeds the operand size, no bits are extracted. */ 1383 if (start > size * 8) 1384 goto done; 1385 /* Length is bounded by both the destination size and start offset. */ 1386 if (start + len > size * 8) 1387 len = (size * 8) - start; 1388 if (len == 0) 1389 goto done; 1390 1391 if (start > 0) 1392 src1 = (src1 >> start); 1393 if (len < 64) 1394 src1 = src1 & ((1ull << len) - 1); 1395 dst = src1; 1396 1397 done: 1398 error = vie_update_register(vcpu, gpr_map[vie->reg], dst, size); 1399 if (error) 1400 return (error); 1401 1402 /* 1403 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result. 1404 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared. 
1405 */ 1406 rflags &= ~RFLAGS_STATUS_BITS; 1407 if (dst == 0) 1408 rflags |= PSL_Z; 1409 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 1410 8); 1411 return (error); 1412 } 1413 1414 static int 1415 emulate_add(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1416 mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) 1417 { 1418 int error, size; 1419 uint64_t nval, rflags, rflags2, val1, val2; 1420 enum vm_reg_name reg; 1421 1422 size = vie->opsize; 1423 error = EINVAL; 1424 1425 switch (vie->op.op_byte) { 1426 case 0x03: 1427 /* 1428 * ADD r/m to r and store the result in r 1429 * 1430 * 03/r ADD r16, r/m16 1431 * 03/r ADD r32, r/m32 1432 * REX.W + 03/r ADD r64, r/m64 1433 */ 1434 1435 /* get the first operand */ 1436 reg = gpr_map[vie->reg]; 1437 error = vie_read_register(vcpu, reg, &val1); 1438 if (error) 1439 break; 1440 1441 /* get the second operand */ 1442 error = memread(vcpu, gpa, &val2, size, arg); 1443 if (error) 1444 break; 1445 1446 /* perform the operation and write the result */ 1447 nval = val1 + val2; 1448 error = vie_update_register(vcpu, reg, nval, size); 1449 break; 1450 default: 1451 break; 1452 } 1453 1454 if (!error) { 1455 rflags2 = getaddflags(size, val1, val2); 1456 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, 1457 &rflags); 1458 if (error) 1459 return (error); 1460 1461 rflags &= ~RFLAGS_STATUS_BITS; 1462 rflags |= rflags2 & RFLAGS_STATUS_BITS; 1463 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, 1464 rflags, 8); 1465 } 1466 1467 return (error); 1468 } 1469 1470 static int 1471 emulate_sub(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1472 mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) 1473 { 1474 int error, size; 1475 uint64_t nval, rflags, rflags2, val1, val2; 1476 enum vm_reg_name reg; 1477 1478 size = vie->opsize; 1479 error = EINVAL; 1480 1481 switch (vie->op.op_byte) { 1482 case 0x2B: 1483 /* 1484 * SUB r/m from r and store the result in r 1485 * 1486 * 2B/r SUB r16, r/m16 1487 * 2B/r SUB r32, r/m32 1488 * REX.W + 2B/r SUB r64, r/m64 1489 */ 1490 1491 /* get the first operand */ 1492 reg = gpr_map[vie->reg]; 1493 error = vie_read_register(vcpu, reg, &val1); 1494 if (error) 1495 break; 1496 1497 /* get the second operand */ 1498 error = memread(vcpu, gpa, &val2, size, arg); 1499 if (error) 1500 break; 1501 1502 /* perform the operation and write the result */ 1503 nval = val1 - val2; 1504 error = vie_update_register(vcpu, reg, nval, size); 1505 break; 1506 default: 1507 break; 1508 } 1509 1510 if (!error) { 1511 rflags2 = getcc(size, val1, val2); 1512 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, 1513 &rflags); 1514 if (error) 1515 return (error); 1516 1517 rflags &= ~RFLAGS_STATUS_BITS; 1518 rflags |= rflags2 & RFLAGS_STATUS_BITS; 1519 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, 1520 rflags, 8); 1521 } 1522 1523 return (error); 1524 } 1525 1526 static int 1527 emulate_stack_op(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie, 1528 struct vm_guest_paging *paging, mem_region_read_t memread, 1529 mem_region_write_t memwrite, void *arg) 1530 { 1531 #ifdef _KERNEL 1532 struct vm_copyinfo copyinfo[2]; 1533 #else 1534 struct iovec copyinfo[2]; 1535 #endif 1536 struct seg_desc ss_desc; 1537 uint64_t cr0, rflags, rsp, stack_gla, val; 1538 int error, fault, size, stackaddrsize, pushop; 1539 1540 val = 0; 1541 size = vie->opsize; 1542 pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 
1 : 0; 1543 1544 /* 1545 * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 1546 */ 1547 if (paging->cpu_mode == CPU_MODE_REAL) { 1548 stackaddrsize = 2; 1549 } else if (paging->cpu_mode == CPU_MODE_64BIT) { 1550 /* 1551 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 1552 * - Stack pointer size is always 64-bits. 1553 * - PUSH/POP of 32-bit values is not possible in 64-bit mode. 1554 * - 16-bit PUSH/POP is supported by using the operand size 1555 * override prefix (66H). 1556 */ 1557 stackaddrsize = 8; 1558 size = vie->opsize_override ? 2 : 8; 1559 } else { 1560 /* 1561 * In protected or compatibility mode the 'B' flag in the 1562 * stack-segment descriptor determines the size of the 1563 * stack pointer. 1564 */ 1565 error = vm_get_seg_desc(vcpu, VM_REG_GUEST_SS, &ss_desc); 1566 KASSERT(error == 0, ("%s: error %d getting SS descriptor", 1567 __func__, error)); 1568 if (SEG_DESC_DEF32(ss_desc.access)) 1569 stackaddrsize = 4; 1570 else 1571 stackaddrsize = 2; 1572 } 1573 1574 error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0); 1575 KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); 1576 1577 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); 1578 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 1579 1580 error = vie_read_register(vcpu, VM_REG_GUEST_RSP, &rsp); 1581 KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); 1582 if (pushop) { 1583 rsp -= size; 1584 } 1585 1586 if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, 1587 rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ, 1588 &stack_gla)) { 1589 vm_inject_ss(vcpu, 0); 1590 return (0); 1591 } 1592 1593 if (vie_canonical_check(paging->cpu_mode, stack_gla)) { 1594 vm_inject_ss(vcpu, 0); 1595 return (0); 1596 } 1597 1598 if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { 1599 vm_inject_ac(vcpu, 0); 1600 return (0); 1601 } 1602 1603 error = vm_copy_setup(vcpu, paging, stack_gla, size, 1604 pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo), 1605 &fault); 1606 if (error || fault) 1607 return (error); 1608 1609 if (pushop) { 1610 error = memread(vcpu, mmio_gpa, &val, size, arg); 1611 if (error == 0) 1612 vm_copyout(&val, copyinfo, size); 1613 } else { 1614 vm_copyin(copyinfo, &val, size); 1615 error = memwrite(vcpu, mmio_gpa, val, size, arg); 1616 rsp += size; 1617 } 1618 vm_copy_teardown(copyinfo, nitems(copyinfo)); 1619 1620 if (error == 0) { 1621 error = vie_update_register(vcpu, VM_REG_GUEST_RSP, rsp, 1622 stackaddrsize); 1623 KASSERT(error == 0, ("error %d updating rsp", error)); 1624 } 1625 return (error); 1626 } 1627 1628 static int 1629 emulate_push(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie, 1630 struct vm_guest_paging *paging, mem_region_read_t memread, 1631 mem_region_write_t memwrite, void *arg) 1632 { 1633 int error; 1634 1635 /* 1636 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. 1637 * 1638 * PUSH is part of the group 5 extended opcodes and is identified 1639 * by ModRM:reg = b110. 1640 */ 1641 if ((vie->reg & 7) != 6) 1642 return (EINVAL); 1643 1644 error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread, 1645 memwrite, arg); 1646 return (error); 1647 } 1648 1649 static int 1650 emulate_pop(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie, 1651 struct vm_guest_paging *paging, mem_region_read_t memread, 1652 mem_region_write_t memwrite, void *arg) 1653 { 1654 int error; 1655 1656 /* 1657 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. 
1658 * 1659 * POP is part of the group 1A extended opcodes and is identified 1660 * by ModRM:reg = b000. 1661 */ 1662 if ((vie->reg & 7) != 0) 1663 return (EINVAL); 1664 1665 error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread, 1666 memwrite, arg); 1667 return (error); 1668 } 1669 1670 static int 1671 emulate_group1(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1672 struct vm_guest_paging *paging __unused, mem_region_read_t memread, 1673 mem_region_write_t memwrite, void *memarg) 1674 { 1675 int error; 1676 1677 switch (vie->reg & 7) { 1678 case 0x1: /* OR */ 1679 error = emulate_or(vcpu, gpa, vie, 1680 memread, memwrite, memarg); 1681 break; 1682 case 0x4: /* AND */ 1683 error = emulate_and(vcpu, gpa, vie, 1684 memread, memwrite, memarg); 1685 break; 1686 case 0x7: /* CMP */ 1687 error = emulate_cmp(vcpu, gpa, vie, 1688 memread, memwrite, memarg); 1689 break; 1690 default: 1691 error = EINVAL; 1692 break; 1693 } 1694 1695 return (error); 1696 } 1697 1698 static int 1699 emulate_bittest(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1700 mem_region_read_t memread, mem_region_write_t memwrite __unused, 1701 void *memarg) 1702 { 1703 uint64_t val, rflags; 1704 int error, bitmask, bitoff; 1705 1706 /* 1707 * 0F BA is a Group 8 extended opcode. 1708 * 1709 * Currently we only emulate the 'Bit Test' instruction which is 1710 * identified by a ModR/M:reg encoding of 100b. 1711 */ 1712 if ((vie->reg & 7) != 4) 1713 return (EINVAL); 1714 1715 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); 1716 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 1717 1718 error = memread(vcpu, gpa, &val, vie->opsize, memarg); 1719 if (error) 1720 return (error); 1721 1722 /* 1723 * Intel SDM, Vol 2, Table 3-2: 1724 * "Range of Bit Positions Specified by Bit Offset Operands" 1725 */ 1726 bitmask = vie->opsize * 8 - 1; 1727 bitoff = vie->immediate & bitmask; 1728 1729 /* Copy the bit into the Carry flag in %rflags */ 1730 if (val & (1UL << bitoff)) 1731 rflags |= PSL_C; 1732 else 1733 rflags &= ~PSL_C; 1734 1735 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); 1736 KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error)); 1737 1738 return (0); 1739 } 1740 1741 static int 1742 emulate_twob_group15(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1743 mem_region_read_t memread, mem_region_write_t memwrite __unused, 1744 void *memarg) 1745 { 1746 int error; 1747 uint64_t buf; 1748 1749 switch (vie->reg & 7) { 1750 case 0x7: /* CLFLUSH, CLFLUSHOPT, and SFENCE */ 1751 if (vie->mod == 0x3) { 1752 /* 1753 * SFENCE. Ignore it, VM exit provides enough 1754 * barriers on its own. 1755 */ 1756 error = 0; 1757 } else { 1758 /* 1759 * CLFLUSH, CLFLUSHOPT. Only check for access 1760 * rights. 
1761 */ 1762 error = memread(vcpu, gpa, &buf, 1, memarg); 1763 } 1764 break; 1765 default: 1766 error = EINVAL; 1767 break; 1768 } 1769 1770 return (error); 1771 } 1772 1773 int 1774 vmm_emulate_instruction(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1775 struct vm_guest_paging *paging, mem_region_read_t memread, 1776 mem_region_write_t memwrite, void *memarg) 1777 { 1778 int error; 1779 1780 if (!vie->decoded) 1781 return (EINVAL); 1782 1783 switch (vie->op.op_type) { 1784 case VIE_OP_TYPE_GROUP1: 1785 error = emulate_group1(vcpu, gpa, vie, paging, memread, 1786 memwrite, memarg); 1787 break; 1788 case VIE_OP_TYPE_POP: 1789 error = emulate_pop(vcpu, gpa, vie, paging, memread, 1790 memwrite, memarg); 1791 break; 1792 case VIE_OP_TYPE_PUSH: 1793 error = emulate_push(vcpu, gpa, vie, paging, memread, 1794 memwrite, memarg); 1795 break; 1796 case VIE_OP_TYPE_CMP: 1797 error = emulate_cmp(vcpu, gpa, vie, 1798 memread, memwrite, memarg); 1799 break; 1800 case VIE_OP_TYPE_MOV: 1801 error = emulate_mov(vcpu, gpa, vie, 1802 memread, memwrite, memarg); 1803 break; 1804 case VIE_OP_TYPE_MOVSX: 1805 case VIE_OP_TYPE_MOVZX: 1806 error = emulate_movx(vcpu, gpa, vie, 1807 memread, memwrite, memarg); 1808 break; 1809 case VIE_OP_TYPE_MOVS: 1810 error = emulate_movs(vcpu, gpa, vie, paging, memread, 1811 memwrite, memarg); 1812 break; 1813 case VIE_OP_TYPE_STOS: 1814 error = emulate_stos(vcpu, gpa, vie, paging, memread, 1815 memwrite, memarg); 1816 break; 1817 case VIE_OP_TYPE_AND: 1818 error = emulate_and(vcpu, gpa, vie, 1819 memread, memwrite, memarg); 1820 break; 1821 case VIE_OP_TYPE_OR: 1822 error = emulate_or(vcpu, gpa, vie, 1823 memread, memwrite, memarg); 1824 break; 1825 case VIE_OP_TYPE_SUB: 1826 error = emulate_sub(vcpu, gpa, vie, 1827 memread, memwrite, memarg); 1828 break; 1829 case VIE_OP_TYPE_BITTEST: 1830 error = emulate_bittest(vcpu, gpa, vie, 1831 memread, memwrite, memarg); 1832 break; 1833 case VIE_OP_TYPE_TWOB_GRP15: 1834 error = emulate_twob_group15(vcpu, gpa, vie, 1835 memread, memwrite, memarg); 1836 break; 1837 case VIE_OP_TYPE_ADD: 1838 error = emulate_add(vcpu, gpa, vie, memread, 1839 memwrite, memarg); 1840 break; 1841 case VIE_OP_TYPE_TEST: 1842 error = emulate_test(vcpu, gpa, vie, 1843 memread, memwrite, memarg); 1844 break; 1845 case VIE_OP_TYPE_BEXTR: 1846 error = emulate_bextr(vcpu, gpa, vie, paging, 1847 memread, memwrite, memarg); 1848 break; 1849 default: 1850 error = EINVAL; 1851 break; 1852 } 1853 1854 return (error); 1855 } 1856 1857 int 1858 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) 1859 { 1860 KASSERT(size == 1 || size == 2 || size == 4 || size == 8, 1861 ("%s: invalid size %d", __func__, size)); 1862 KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); 1863 1864 if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) 1865 return (0); 1866 1867 return ((gla & (size - 1)) ? 1 : 0); 1868 } 1869 1870 int 1871 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) 1872 { 1873 uint64_t mask; 1874 1875 if (cpu_mode != CPU_MODE_64BIT) 1876 return (0); 1877 1878 /* 1879 * The value of the bit 47 in the 'gla' should be replicated in the 1880 * most significant 16 bits. 
1881 */ 1882 mask = ~((1UL << 48) - 1); 1883 if (gla & (1UL << 47)) 1884 return ((gla & mask) != mask); 1885 else 1886 return ((gla & mask) != 0); 1887 } 1888 1889 uint64_t 1890 vie_size2mask(int size) 1891 { 1892 KASSERT(size == 1 || size == 2 || size == 4 || size == 8, 1893 ("vie_size2mask: invalid size %d", size)); 1894 return (size2mask[size]); 1895 } 1896 1897 int 1898 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, 1899 struct seg_desc *desc, uint64_t offset, int length, int addrsize, 1900 int prot, uint64_t *gla) 1901 { 1902 uint64_t firstoff, low_limit, high_limit, segbase; 1903 int glasize, type; 1904 1905 KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, 1906 ("%s: invalid segment %d", __func__, seg)); 1907 KASSERT(length == 1 || length == 2 || length == 4 || length == 8, 1908 ("%s: invalid operand size %d", __func__, length)); 1909 KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, 1910 ("%s: invalid prot %#x", __func__, prot)); 1911 1912 firstoff = offset; 1913 if (cpu_mode == CPU_MODE_64BIT) { 1914 KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " 1915 "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); 1916 glasize = 8; 1917 } else { 1918 KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " 1919 "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); 1920 glasize = 4; 1921 /* 1922 * If the segment selector is loaded with a NULL selector 1923 * then the descriptor is unusable and attempting to use 1924 * it results in a #GP(0). 1925 */ 1926 if (SEG_DESC_UNUSABLE(desc->access)) 1927 return (-1); 1928 1929 /* 1930 * The processor generates a #NP exception when a segment 1931 * register is loaded with a selector that points to a 1932 * descriptor that is not present. If this was the case then 1933 * it would have been checked before the VM-exit. 1934 */ 1935 KASSERT(SEG_DESC_PRESENT(desc->access), 1936 ("segment %d not present: %#x", seg, desc->access)); 1937 1938 /* 1939 * The descriptor type must indicate a code/data segment. 1940 */ 1941 type = SEG_DESC_TYPE(desc->access); 1942 KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " 1943 "descriptor type %#x", seg, type)); 1944 1945 if (prot & PROT_READ) { 1946 /* #GP on a read access to a exec-only code segment */ 1947 if ((type & 0xA) == 0x8) 1948 return (-1); 1949 } 1950 1951 if (prot & PROT_WRITE) { 1952 /* 1953 * #GP on a write access to a code segment or a 1954 * read-only data segment. 1955 */ 1956 if (type & 0x8) /* code segment */ 1957 return (-1); 1958 1959 if ((type & 0xA) == 0) /* read-only data seg */ 1960 return (-1); 1961 } 1962 1963 /* 1964 * 'desc->limit' is fully expanded taking granularity into 1965 * account. 1966 */ 1967 if ((type & 0xC) == 0x4) { 1968 /* expand-down data segment */ 1969 low_limit = desc->limit + 1; 1970 high_limit = SEG_DESC_DEF32(desc->access) ? 1971 0xffffffff : 0xffff; 1972 } else { 1973 /* code segment or expand-up data segment */ 1974 low_limit = 0; 1975 high_limit = desc->limit; 1976 } 1977 1978 while (length > 0) { 1979 offset &= vie_size2mask(addrsize); 1980 if (offset < low_limit || offset > high_limit) 1981 return (-1); 1982 offset++; 1983 length--; 1984 } 1985 } 1986 1987 /* 1988 * In 64-bit mode all segments except %fs and %gs have a segment 1989 * base address of 0. 
1990 */ 1991 if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && 1992 seg != VM_REG_GUEST_GS) { 1993 segbase = 0; 1994 } else { 1995 segbase = desc->base; 1996 } 1997 1998 /* 1999 * Truncate 'firstoff' to the effective address size before adding 2000 * it to the segment base. 2001 */ 2002 firstoff &= vie_size2mask(addrsize); 2003 *gla = (segbase + firstoff) & vie_size2mask(glasize); 2004 return (0); 2005 } 2006 2007 /* 2008 * Prepare a partially decoded vie for a 2nd attempt. 2009 */ 2010 void 2011 vie_restart(struct vie *vie) 2012 { 2013 _Static_assert( 2014 offsetof(struct vie, inst) < offsetof(struct vie, vie_startzero) && 2015 offsetof(struct vie, num_valid) < offsetof(struct vie, vie_startzero), 2016 "restart should not erase instruction length or contents"); 2017 2018 memset((char *)vie + offsetof(struct vie, vie_startzero), 0, 2019 sizeof(*vie) - offsetof(struct vie, vie_startzero)); 2020 2021 vie->base_register = VM_REG_LAST; 2022 vie->index_register = VM_REG_LAST; 2023 vie->segment_register = VM_REG_LAST; 2024 } 2025 2026 void 2027 vie_init(struct vie *vie, const char *inst_bytes, int inst_length) 2028 { 2029 KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE, 2030 ("%s: invalid instruction length (%d)", __func__, inst_length)); 2031 2032 vie_restart(vie); 2033 memset(vie->inst, 0, sizeof(vie->inst)); 2034 if (inst_length != 0) 2035 memcpy(vie->inst, inst_bytes, inst_length); 2036 vie->num_valid = inst_length; 2037 } 2038 2039 #ifdef _KERNEL 2040 static int 2041 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) 2042 { 2043 int error_code = 0; 2044 2045 if (pte & PG_V) 2046 error_code |= PGEX_P; 2047 if (prot & VM_PROT_WRITE) 2048 error_code |= PGEX_W; 2049 if (usermode) 2050 error_code |= PGEX_U; 2051 if (rsvd) 2052 error_code |= PGEX_RSV; 2053 if (prot & VM_PROT_EXECUTE) 2054 error_code |= PGEX_I; 2055 2056 return (error_code); 2057 } 2058 2059 static void 2060 ptp_release(void **cookie) 2061 { 2062 if (*cookie != NULL) { 2063 vm_gpa_release(*cookie); 2064 *cookie = NULL; 2065 } 2066 } 2067 2068 static void * 2069 ptp_hold(struct vcpu *vcpu, vm_paddr_t ptpphys, size_t len, void **cookie) 2070 { 2071 void *ptr; 2072 2073 ptp_release(cookie); 2074 ptr = vm_gpa_hold(vcpu, ptpphys, len, VM_PROT_RW, cookie); 2075 return (ptr); 2076 } 2077 2078 static int 2079 _vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging, 2080 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only) 2081 { 2082 int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; 2083 u_int retries; 2084 uint64_t *ptpbase, ptpphys, pte, pgsize; 2085 uint32_t *ptpbase32, pte32; 2086 void *cookie; 2087 2088 *guest_fault = 0; 2089 2090 usermode = (paging->cpl == 3 ? 1 : 0); 2091 writable = prot & VM_PROT_WRITE; 2092 cookie = NULL; 2093 retval = 0; 2094 retries = 0; 2095 restart: 2096 ptpphys = paging->cr3; /* root of the page tables */ 2097 ptp_release(&cookie); 2098 if (retries++ > 0) 2099 maybe_yield(); 2100 2101 if (vie_canonical_check(paging->cpu_mode, gla)) { 2102 /* 2103 * XXX assuming a non-stack reference otherwise a stack fault 2104 * should be generated. 2105 */ 2106 if (!check_only) 2107 vm_inject_gp(vcpu); 2108 goto fault; 2109 } 2110 2111 if (paging->paging_mode == PAGING_MODE_FLAT) { 2112 *gpa = gla; 2113 goto done; 2114 } 2115 2116 if (paging->paging_mode == PAGING_MODE_32) { 2117 nlevels = 2; 2118 while (--nlevels >= 0) { 2119 /* Zero out the lower 12 bits. 

#ifdef _KERNEL
static int
pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
{
	int error_code = 0;

	if (pte & PG_V)
		error_code |= PGEX_P;
	if (prot & VM_PROT_WRITE)
		error_code |= PGEX_W;
	if (usermode)
		error_code |= PGEX_U;
	if (rsvd)
		error_code |= PGEX_RSV;
	if (prot & VM_PROT_EXECUTE)
		error_code |= PGEX_I;

	return (error_code);
}

static void
ptp_release(void **cookie)
{
	if (*cookie != NULL) {
		vm_gpa_release(*cookie);
		*cookie = NULL;
	}
}

static void *
ptp_hold(struct vcpu *vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
{
	void *ptr;

	ptp_release(cookie);
	ptr = vm_gpa_hold(vcpu, ptpphys, len, VM_PROT_RW, cookie);
	return (ptr);
}

static int
_vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
{
	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
	u_int retries;
	uint64_t *ptpbase, ptpphys, pte, pgsize;
	uint32_t *ptpbase32, pte32;
	void *cookie;

	*guest_fault = 0;

	usermode = (paging->cpl == 3 ? 1 : 0);
	writable = prot & VM_PROT_WRITE;
	cookie = NULL;
	retval = 0;
	retries = 0;
restart:
	ptpphys = paging->cr3;		/* root of the page tables */
	ptp_release(&cookie);
	if (retries++ > 0)
		maybe_yield();

	if (vie_canonical_check(paging->cpu_mode, gla)) {
		/*
		 * XXX assuming a non-stack reference otherwise a stack fault
		 * should be generated.
		 */
		if (!check_only)
			vm_inject_gp(vcpu);
		goto fault;
	}

	if (paging->paging_mode == PAGING_MODE_FLAT) {
		*gpa = gla;
		goto done;
	}

	if (paging->paging_mode == PAGING_MODE_32) {
		nlevels = 2;
		while (--nlevels >= 0) {
			/* Zero out the lower 12 bits. */
			ptpphys &= ~0xfff;

			ptpbase32 = ptp_hold(vcpu, ptpphys, PAGE_SIZE,
			    &cookie);

			if (ptpbase32 == NULL)
				goto error;

			ptpshift = PAGE_SHIFT + nlevels * 10;
			ptpindex = (gla >> ptpshift) & 0x3FF;
			pgsize = 1UL << ptpshift;

			pte32 = ptpbase32[ptpindex];

			if ((pte32 & PG_V) == 0 ||
			    (usermode && (pte32 & PG_U) == 0) ||
			    (writable && (pte32 & PG_RW) == 0)) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot, 0,
					    pte32);
					vm_inject_pf(vcpu, pfcode, gla);
				}
				goto fault;
			}

			/*
			 * Emulate the x86 MMU's management of the accessed
			 * and dirty flags. While the accessed flag is set
			 * at every level of the page table, the dirty flag
			 * is only set at the last level providing the guest
			 * physical address.
			 */
			if (!check_only && (pte32 & PG_A) == 0) {
				if (atomic_cmpset_32(&ptpbase32[ptpindex],
				    pte32, pte32 | PG_A) == 0) {
					goto restart;
				}
			}

			/* XXX must be ignored if CR4.PSE=0 */
			if (nlevels > 0 && (pte32 & PG_PS) != 0)
				break;

			ptpphys = pte32;
		}

		/* Set the dirty bit in the page table entry if necessary */
		if (!check_only && writable && (pte32 & PG_M) == 0) {
			if (atomic_cmpset_32(&ptpbase32[ptpindex],
			    pte32, pte32 | PG_M) == 0) {
				goto restart;
			}
		}

		/* Zero out the lower 'ptpshift' bits */
		pte32 >>= ptpshift; pte32 <<= ptpshift;
		*gpa = pte32 | (gla & (pgsize - 1));
		goto done;
	}

	if (paging->paging_mode == PAGING_MODE_PAE) {
		/* Zero out the lower 5 bits and the upper 32 bits */
		ptpphys &= 0xffffffe0UL;

		ptpbase = ptp_hold(vcpu, ptpphys, sizeof(*ptpbase) * 4,
		    &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpindex = (gla >> 30) & 0x3;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vcpu, pfcode, gla);
			}
			goto fault;
		}

		ptpphys = pte;

		nlevels = 2;
	} else if (paging->paging_mode == PAGING_MODE_64_LA57) {
		nlevels = 5;
	} else {
		nlevels = 4;
	}

	while (--nlevels >= 0) {
		/* Zero out the lower 12 bits and the upper 12 bits */
		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;

		ptpbase = ptp_hold(vcpu, ptpphys, PAGE_SIZE, &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gla >> ptpshift) & 0x1FF;
		pgsize = 1UL << ptpshift;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0 ||
		    (usermode && (pte & PG_U) == 0) ||
		    (writable && (pte & PG_RW) == 0)) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vcpu, pfcode, gla);
			}
			goto fault;
		}

		/* Set the accessed bit in the page table entry */
		if (!check_only && (pte & PG_A) == 0) {
			if (atomic_cmpset_64(&ptpbase[ptpindex],
			    pte, pte | PG_A) == 0) {
				goto restart;
			}
		}

		if (nlevels > 0 && (pte & PG_PS) != 0) {
			if (pgsize > 1 * GB) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot, 1,
					    pte);
					vm_inject_pf(vcpu, pfcode, gla);
				}
				goto fault;
			}
			break;
		}

		ptpphys = pte;
	}

	/* Set the dirty bit in the page table entry if necessary */
	if (!check_only && writable && (pte & PG_M) == 0) {
		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
			goto restart;
	}

	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
	*gpa = pte | (gla & (pgsize - 1));
done:
	ptp_release(&cookie);
	KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
	    __func__, retval));
	return (retval);
error:
	retval = EFAULT;
	goto done;
fault:
	*guest_fault = 1;
	goto done;
}

int
vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{

	return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault,
	    false));
}

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{

	return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault,
	    true));
}

int
vmm_fetch_instruction(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
{
	struct vm_copyinfo copyinfo[2];
	int error, prot;

	if (inst_length > VIE_INST_SIZE)
		panic("vmm_fetch_instruction: invalid length %d", inst_length);

	prot = PROT_READ | PROT_EXEC;
	error = vm_copy_setup(vcpu, paging, rip, inst_length, prot,
	    copyinfo, nitems(copyinfo), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyin(copyinfo, vie->inst, inst_length);
	vm_copy_teardown(copyinfo, nitems(copyinfo));
	vie->num_valid = inst_length;
	return (0);
}
#endif /* _KERNEL */
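
/*
 * Example (illustration only, kernel side): a sketch of how the fetch and
 * decode routines in this file fit together when handling a nested page
 * fault.  'vcpu', 'paging', 'rip', 'inst_length', 'gla', 'cpu_mode' and
 * 'cs_d' stand in for values taken from the VM-exit state; the actual
 * caller lives elsewhere in vmm.
 */
#if 0
	struct vie vie;
	int error, fault;

	vie_init(&vie, NULL, 0);	/* no pre-fetched bytes */
	error = vmm_fetch_instruction(vcpu, paging, rip, inst_length, &vie,
	    &fault);
	if (error == 0 && !fault)
		error = vmm_decode_instruction(vcpu, gla, cpu_mode, cs_d,
		    &vie);
#endif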

static int
vie_peek(struct vie *vie, uint8_t *x)
{

	if (vie->num_processed < vie->num_valid) {
		*x = vie->inst[vie->num_processed];
		return (0);
	} else
		return (-1);
}

static void
vie_advance(struct vie *vie)
{

	vie->num_processed++;
}

static bool
segment_override(uint8_t x, int *seg)
{

	switch (x) {
	case 0x2E:
		*seg = VM_REG_GUEST_CS;
		break;
	case 0x36:
		*seg = VM_REG_GUEST_SS;
		break;
	case 0x3E:
		*seg = VM_REG_GUEST_DS;
		break;
	case 0x26:
		*seg = VM_REG_GUEST_ES;
		break;
	case 0x64:
		*seg = VM_REG_GUEST_FS;
		break;
	case 0x65:
		*seg = VM_REG_GUEST_GS;
		break;
	default:
		return (false);
	}
	return (true);
}

static int
decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
{
	uint8_t x;

	while (1) {
		if (vie_peek(vie, &x))
			return (-1);

		if (x == 0x66)
			vie->opsize_override = 1;
		else if (x == 0x67)
			vie->addrsize_override = 1;
		else if (x == 0xF3)
			vie->repz_present = 1;
		else if (x == 0xF2)
			vie->repnz_present = 1;
		else if (segment_override(x, &vie->segment_register))
			vie->segment_override = 1;
		else
			break;

		vie_advance(vie);
	}

	/*
	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
	 * - Only one REX prefix is allowed per instruction.
	 * - The REX prefix must immediately precede the opcode byte or the
	 *   escape opcode byte.
	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
	 *   the mandatory prefix must come before the REX prefix.
	 */
	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
		vie->rex_present = 1;
		vie->rex_w = x & 0x8 ? 1 : 0;
		vie->rex_r = x & 0x4 ? 1 : 0;
		vie->rex_x = x & 0x2 ? 1 : 0;
		vie->rex_b = x & 0x1 ? 1 : 0;
		vie_advance(vie);
	}

	/*
	 * § 2.3.5, "The VEX Prefix", SDM Vol 2.
	 */
	if ((cpu_mode == CPU_MODE_64BIT || cpu_mode == CPU_MODE_COMPATIBILITY)
	    && x == 0xC4) {
		const struct vie_op *optab;

		/* 3-byte VEX prefix. */
		vie->vex_present = 1;

		vie_advance(vie);
		if (vie_peek(vie, &x))
			return (-1);

		/*
		 * 2nd byte: [R', X', B', mmmmm[4:0]].  Bits are inverted
		 * relative to REX encoding.
		 */
		vie->rex_r = x & 0x80 ? 0 : 1;
		vie->rex_x = x & 0x40 ? 0 : 1;
		vie->rex_b = x & 0x20 ? 0 : 1;

		switch (x & 0x1F) {
		case 0x2:
			/* 0F 38. */
			optab = three_byte_opcodes_0f38;
			break;
		case 0x1:
			/* 0F class - nothing handled here yet. */
			/* FALLTHROUGH */
		case 0x3:
			/* 0F 3A class - nothing handled here yet. */
			/* FALLTHROUGH */
		default:
			/* Reserved (#UD). */
			return (-1);
		}

		vie_advance(vie);
		if (vie_peek(vie, &x))
			return (-1);

		/* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */
		vie->rex_w = x & 0x80 ? 1 : 0;

		vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3);
		vie->vex_l = !!(x & 0x4);
		vie->vex_pp = (x & 0x3);

		/* PP: 1=66 2=F3 3=F2 prefixes. */
		switch (vie->vex_pp) {
		case 0x1:
			vie->opsize_override = 1;
			break;
		case 0x2:
			vie->repz_present = 1;
			break;
		case 0x3:
			vie->repnz_present = 1;
			break;
		}

		vie_advance(vie);

		/* Opcode, sans literal escape prefix. */
		if (vie_peek(vie, &x))
			return (-1);

		vie->op = optab[x];
		if (vie->op.op_type == VIE_OP_TYPE_NONE)
			return (-1);

		vie_advance(vie);
	}

	/*
	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
	 */
	if (cpu_mode == CPU_MODE_64BIT) {
		/*
		 * Default address size is 64-bits and default operand size
		 * is 32-bits.
		 */
		vie->addrsize = vie->addrsize_override ? 4 : 8;
		if (vie->rex_w)
			vie->opsize = 8;
		else if (vie->opsize_override)
			vie->opsize = 2;
		else
			vie->opsize = 4;
	} else if (cs_d) {
		/* Default address and operand sizes are 32-bits */
		vie->addrsize = vie->addrsize_override ? 2 : 4;
		vie->opsize = vie->opsize_override ? 2 : 4;
	} else {
		/* Default address and operand sizes are 16-bits */
		vie->addrsize = vie->addrsize_override ? 4 : 2;
		vie->opsize = vie->opsize_override ? 4 : 2;
	}
	return (0);
}
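
/*
 * Worked example (illustration only): in 64-bit mode the bytes
 * "66 48 89 08" begin with an operand-size override (0x66) followed by a
 * REX.W prefix (0x48).  Because REX.W takes precedence over 0x66, the
 * decoder above ends up with addrsize = 8 and opsize = 8; dropping the
 * REX.W byte would instead give opsize = 2.
 */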

static int
decode_two_byte_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	vie->op = two_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);
	return (0);
}

static int
decode_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	/* Already did this via VEX prefix. */
	if (vie->op.op_type != VIE_OP_TYPE_NONE)
		return (0);

	vie->op = one_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);

	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
		return (decode_two_byte_opcode(vie));

	return (0);
}

static int
decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
{
	uint8_t x;

	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
		return (0);

	if (cpu_mode == CPU_MODE_REAL)
		return (-1);

	if (vie_peek(vie, &x))
		return (-1);

	vie->mod = (x >> 6) & 0x3;
	vie->rm = (x >> 0) & 0x7;
	vie->reg = (x >> 3) & 0x7;

	/*
	 * A direct addressing mode makes no sense in the context of an EPT
	 * fault. There has to be a memory access involved to cause the
	 * EPT fault.
	 */
	if (vie->mod == VIE_MOD_DIRECT)
		return (-1);

	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
		/*
		 * Table 2-5: Special Cases of REX Encodings
		 *
		 * mod=0, r/m=5 is used in the compatibility mode to
		 * indicate a disp32 without a base register.
		 *
		 * mod!=3, r/m=4 is used in the compatibility mode to
		 * indicate that the SIB byte is present.
		 *
		 * The 'b' bit in the REX prefix is don't care in
		 * this case.
		 */
	} else {
		vie->rm |= (vie->rex_b << 3);
	}

	vie->reg |= (vie->rex_r << 3);

	/* SIB */
	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
		goto done;

	vie->base_register = gpr_map[vie->rm];

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	case VIE_MOD_INDIRECT:
		if (vie->rm == VIE_RM_DISP32) {
			vie->disp_bytes = 4;
			/*
			 * Table 2-7. RIP-Relative Addressing
			 *
			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
			 * whereas in compatibility mode it just implies disp32.
			 */

			if (cpu_mode == CPU_MODE_64BIT)
				vie->base_register = VM_REG_GUEST_RIP;
			else
				vie->base_register = VM_REG_LAST;
		}
		break;
	}

done:
	vie_advance(vie);

	return (0);
}

static int
decode_sib(struct vie *vie)
{
	uint8_t x;

	/* Proceed only if SIB byte is present */
	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
		return (0);

	if (vie_peek(vie, &x))
		return (-1);

	/* De-construct the SIB byte */
	vie->ss = (x >> 6) & 0x3;
	vie->index = (x >> 3) & 0x7;
	vie->base = (x >> 0) & 0x7;

	/* Apply the REX prefix modifiers */
	vie->index |= vie->rex_x << 3;
	vie->base |= vie->rex_b << 3;

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	}

	if (vie->mod == VIE_MOD_INDIRECT &&
	    (vie->base == 5 || vie->base == 13)) {
		/*
		 * Special case when base register is unused if mod = 0
		 * and base = %rbp or %r13.
		 *
		 * Documented in:
		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
		 * Table 2-5: Special Cases of REX Encodings
		 */
		vie->disp_bytes = 4;
	} else {
		vie->base_register = gpr_map[vie->base];
	}

	/*
	 * All encodings of 'index' are valid except for %rsp (4).
	 *
	 * Documented in:
	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
	 * Table 2-5: Special Cases of REX Encodings
	 */
	if (vie->index != 4)
		vie->index_register = gpr_map[vie->index];

	/* 'scale' makes sense only in the context of an index register */
	if (vie->index_register < VM_REG_LAST)
		vie->scale = 1 << vie->ss;

	vie_advance(vie);

	return (0);
}
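
/*
 * Worked example (illustration only): for "mov %ecx,0x8(%rsp)"
 * (89 4c 24 08) the ModR/M byte 0x4c gives mod=1, reg=1 (%ecx) and rm=4,
 * so a SIB byte follows.  The SIB byte 0x24 gives ss=0, index=4 (none)
 * and base=4 (%rsp), and mod=1 selects a 1-byte displacement (0x08).
 * The decoders above therefore end up with base_register = %rsp, no
 * index register and disp_bytes = 1.
 */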

static int
decode_displacement(struct vie *vie)
{
	int n, i;
	uint8_t x;

	union {
		char	buf[4];
		int8_t	signed8;
		int32_t	signed32;
	} u;

	if ((n = vie->disp_bytes) == 0)
		return (0);

	if (n != 1 && n != 4)
		panic("decode_displacement: invalid disp_bytes %d", n);

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	if (n == 1)
		vie->displacement = u.signed8;		/* sign-extended */
	else
		vie->displacement = u.signed32;		/* sign-extended */

	return (0);
}

static int
decode_immediate(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[4];
		int8_t	signed8;
		int16_t	signed16;
		int32_t	signed32;
	} u;

	/* Figure out immediate operand size (if any) */
	if (vie->op.op_flags & VIE_OP_F_IMM) {
		/*
		 * Section 2.2.1.5 "Immediates", Intel SDM:
		 * In 64-bit mode the typical size of immediate operands
		 * remains 32-bits. When the operand size is 64-bits, the
		 * processor sign-extends all immediates to 64-bits prior
		 * to their use.
		 */
		if (vie->opsize == 4 || vie->opsize == 8)
			vie->imm_bytes = 4;
		else
			vie->imm_bytes = 2;
	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
		vie->imm_bytes = 1;
	}

	if ((n = vie->imm_bytes) == 0)
		return (0);

	KASSERT(n == 1 || n == 2 || n == 4,
	    ("%s: invalid number of immediate bytes: %d", __func__, n));

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	/* sign-extend the immediate value before use */
	if (n == 1)
		vie->immediate = u.signed8;
	else if (n == 2)
		vie->immediate = u.signed16;
	else
		vie->immediate = u.signed32;

	return (0);
}
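
/*
 * Worked example (illustration only): a disp8 byte of 0xF8 is read into
 * u.signed8 as -8, so the assignment above stores -8
 * (0xfffffffffffffff8) in the 64-bit 'displacement' field.  The same
 * sign-extension is applied to 16- and 32-bit immediates, matching what
 * the processor does before using them with a larger operand size.
 */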

static int
decode_moffset(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[8];
		uint64_t u64;
	} u;

	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
		return (0);

	/*
	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
	 * The memory offset size follows the address-size of the instruction.
	 */
	n = vie->addrsize;
	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));

	u.u64 = 0;
	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}
	vie->displacement = u.u64;
	return (0);
}
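
/*
 * Worked example (illustration only): in 64-bit mode "movl %eax,0x1000"
 * can be encoded as A3 followed by an 8-byte memory offset
 * (a3 00 10 00 00 00 00 00 00).  With the default 64-bit address size,
 * n = 8 above, so the eight offset bytes are consumed here and
 * 'displacement' ends up holding 0x1000.
 */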

#ifdef _KERNEL
/*
 * Verify that the 'guest linear address' provided as collateral of the nested
 * page table fault matches with our instruction decoding.
 */
static int
verify_gla(struct vcpu *vcpu, uint64_t gla, struct vie *vie,
    enum vm_cpu_mode cpu_mode)
{
	int error;
	uint64_t base, segbase, idx, gla2;
	enum vm_reg_name seg;
	struct seg_desc desc;

	/* Skip 'gla' verification */
	if (gla == VIE_INVALID_GLA)
		return (0);

	base = 0;
	if (vie->base_register != VM_REG_LAST) {
		error = vm_get_register(vcpu, vie->base_register, &base);
		if (error) {
			printf("verify_gla: error %d getting base reg %d\n",
			    error, vie->base_register);
			return (-1);
		}

		/*
		 * RIP-relative addressing starts from the following
		 * instruction
		 */
		if (vie->base_register == VM_REG_GUEST_RIP)
			base += vie->num_processed;
	}

	idx = 0;
	if (vie->index_register != VM_REG_LAST) {
		error = vm_get_register(vcpu, vie->index_register, &idx);
		if (error) {
			printf("verify_gla: error %d getting index reg %d\n",
			    error, vie->index_register);
			return (-1);
		}
	}

	/*
	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
	 *
	 * In 64-bit mode, segmentation is generally (but not
	 * completely) disabled.  The exceptions are the FS and GS
	 * segments.
	 *
	 * In legacy IA-32 mode, when the ESP or EBP register is used
	 * as the base, the SS segment is the default segment.  For
	 * other data references, except those relative to the stack or
	 * a string destination, the DS segment is the default.  These
	 * can be overridden to allow other segments to be accessed.
	 */
	if (vie->segment_override)
		seg = vie->segment_register;
	else if (vie->base_register == VM_REG_GUEST_RSP ||
	    vie->base_register == VM_REG_GUEST_RBP)
		seg = VM_REG_GUEST_SS;
	else
		seg = VM_REG_GUEST_DS;
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		error = vm_get_seg_desc(vcpu, seg, &desc);
		if (error) {
			printf("verify_gla: error %d getting segment"
			    " descriptor %d", error,
			    vie->segment_register);
			return (-1);
		}
		segbase = desc.base;
	}

	gla2 = segbase + base + vie->scale * idx + vie->displacement;
	gla2 &= size2mask[vie->addrsize];
	if (gla != gla2) {
		printf("verify_gla mismatch: segbase(0x%0lx), "
		    "base(0x%0lx), scale(%d), index(0x%0lx), "
		    "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
		    segbase, base, vie->scale, idx, vie->displacement,
		    gla, gla2);
		return (-1);
	}

	return (0);
}
#endif /* _KERNEL */

int
#ifdef _KERNEL
vmm_decode_instruction(struct vcpu *vcpu, uint64_t gla,
    enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
#else
vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
#endif
{

	if (decode_prefixes(vie, cpu_mode, cs_d))
		return (-1);

	if (decode_opcode(vie))
		return (-1);

	if (decode_modrm(vie, cpu_mode))
		return (-1);

	if (decode_sib(vie))
		return (-1);

	if (decode_displacement(vie))
		return (-1);

	if (decode_immediate(vie))
		return (-1);

	if (decode_moffset(vie))
		return (-1);

#ifdef _KERNEL
	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
		if (verify_gla(vcpu, gla, vie, cpu_mode))
			return (-1);
	}
#endif

	vie->decoded = 1;	/* success */

	return (0);
}
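
/*
 * Example (illustration only, userland build): decoding a captured
 * instruction with the userland variant above.  The bytes encode
 * "mov %ecx,0x8(%rsp)" (89 4c 24 08); after a successful decode the vie
 * describes a 4-byte MOV with base register %rsp and an 8-bit
 * displacement.
 */
#if 0
	struct vie vie;
	const char bytes[] = { 0x89, 0x4c, 0x24, 0x08 };

	vie_init(&vie, bytes, sizeof(bytes));
	if (vmm_decode_instruction(CPU_MODE_64BIT, 0 /* cs_d */, &vie) == 0)
		assert(vie.opsize == 4 && vie.disp_bytes == 1);
#endif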