1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2012 Sandvine, Inc. 5 * Copyright (c) 2012 NetApp, Inc. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #ifdef _KERNEL 34 #include <sys/param.h> 35 #include <sys/pcpu.h> 36 #include <sys/systm.h> 37 #include <sys/proc.h> 38 39 #include <vm/vm.h> 40 #include <vm/pmap.h> 41 42 #include <machine/vmparam.h> 43 #include <machine/vmm.h> 44 #else /* !_KERNEL */ 45 #include <sys/types.h> 46 #include <sys/errno.h> 47 #include <sys/_iovec.h> 48 49 #include <machine/vmm.h> 50 51 #include <err.h> 52 #include <assert.h> 53 #include <stdbool.h> 54 #include <stddef.h> 55 #include <stdio.h> 56 #include <string.h> 57 #include <strings.h> 58 #include <vmmapi.h> 59 #define __diagused 60 #define KASSERT(exp,msg) assert((exp)) 61 #define panic(...) errx(4, __VA_ARGS__) 62 #endif /* _KERNEL */ 63 64 #include <machine/vmm_instruction_emul.h> 65 #include <x86/psl.h> 66 #include <x86/specialreg.h> 67 68 /* struct vie_op.op_type */ 69 enum { 70 VIE_OP_TYPE_NONE = 0, 71 VIE_OP_TYPE_MOV, 72 VIE_OP_TYPE_MOVSX, 73 VIE_OP_TYPE_MOVZX, 74 VIE_OP_TYPE_AND, 75 VIE_OP_TYPE_OR, 76 VIE_OP_TYPE_SUB, 77 VIE_OP_TYPE_TWO_BYTE, 78 VIE_OP_TYPE_PUSH, 79 VIE_OP_TYPE_CMP, 80 VIE_OP_TYPE_POP, 81 VIE_OP_TYPE_MOVS, 82 VIE_OP_TYPE_GROUP1, 83 VIE_OP_TYPE_STOS, 84 VIE_OP_TYPE_BITTEST, 85 VIE_OP_TYPE_TWOB_GRP15, 86 VIE_OP_TYPE_ADD, 87 VIE_OP_TYPE_TEST, 88 VIE_OP_TYPE_BEXTR, 89 VIE_OP_TYPE_LAST 90 }; 91 92 /* struct vie_op.op_flags */ 93 #define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */ 94 #define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ 95 #define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */ 96 #define VIE_OP_F_NO_MODRM (1 << 3) 97 #define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) 98 99 static const struct vie_op three_byte_opcodes_0f38[256] = { 100 [0xF7] = { 101 .op_byte = 0xF7, 102 .op_type = VIE_OP_TYPE_BEXTR, 103 }, 104 }; 105 106 static const struct vie_op two_byte_opcodes[256] = { 107 [0xAE] = { 108 .op_byte = 0xAE, 109 .op_type = VIE_OP_TYPE_TWOB_GRP15, 110 }, 111 [0xB6] = { 112 .op_byte = 0xB6, 113 .op_type = VIE_OP_TYPE_MOVZX, 114 }, 115 [0xB7] = { 116 .op_byte = 0xB7, 117 .op_type = VIE_OP_TYPE_MOVZX, 118 }, 119 [0xBA] = { 120 .op_byte = 0xBA, 121 .op_type = VIE_OP_TYPE_BITTEST, 122 .op_flags = VIE_OP_F_IMM8, 123 }, 124 [0xBE] = { 125 .op_byte = 0xBE, 126 .op_type = VIE_OP_TYPE_MOVSX, 127 }, 128 }; 129 130 static const struct vie_op one_byte_opcodes[256] = { 131 [0x03] = { 132 .op_byte = 0x03, 133 .op_type = VIE_OP_TYPE_ADD, 134 }, 135 [0x0F] = { 136 .op_byte = 0x0F, 137 .op_type = VIE_OP_TYPE_TWO_BYTE 138 }, 139 [0x0B] = { 140 .op_byte = 0x0B, 141 .op_type = VIE_OP_TYPE_OR, 142 }, 143 [0x2B] = { 144 .op_byte = 0x2B, 145 .op_type = VIE_OP_TYPE_SUB, 146 }, 147 [0x39] = { 148 .op_byte = 0x39, 149 .op_type = VIE_OP_TYPE_CMP, 150 }, 151 [0x3B] = { 152 .op_byte = 0x3B, 153 .op_type = VIE_OP_TYPE_CMP, 154 }, 155 [0x88] = { 156 .op_byte = 0x88, 157 .op_type = VIE_OP_TYPE_MOV, 158 }, 159 [0x89] = { 160 .op_byte = 0x89, 161 .op_type = VIE_OP_TYPE_MOV, 162 }, 163 [0x8A] = { 164 .op_byte = 0x8A, 165 .op_type = VIE_OP_TYPE_MOV, 166 }, 167 [0x8B] = { 168 .op_byte = 0x8B, 169 .op_type = VIE_OP_TYPE_MOV, 170 }, 171 [0xA1] = { 172 .op_byte = 0xA1, 173 .op_type = VIE_OP_TYPE_MOV, 174 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, 175 }, 176 [0xA3] = { 177 .op_byte = 0xA3, 178 .op_type = VIE_OP_TYPE_MOV, 179 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, 180 }, 181 [0xA4] = { 182 .op_byte = 0xA4, 183 .op_type = VIE_OP_TYPE_MOVS, 184 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 185 }, 186 [0xA5] = { 187 .op_byte = 0xA5, 188 .op_type = VIE_OP_TYPE_MOVS, 189 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 190 }, 191 [0xAA] = { 192 .op_byte = 0xAA, 193 .op_type = VIE_OP_TYPE_STOS, 194 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 195 }, 196 [0xAB] = { 197 .op_byte = 0xAB, 198 .op_type = VIE_OP_TYPE_STOS, 199 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION 200 }, 201 [0xC6] = { 202 /* XXX Group 11 extended opcode - not just MOV */ 203 .op_byte = 0xC6, 204 .op_type = VIE_OP_TYPE_MOV, 205 .op_flags = VIE_OP_F_IMM8, 206 }, 207 [0xC7] = { 208 .op_byte = 0xC7, 209 .op_type = VIE_OP_TYPE_MOV, 210 .op_flags = VIE_OP_F_IMM, 211 }, 212 [0x23] = { 213 .op_byte = 0x23, 214 .op_type = VIE_OP_TYPE_AND, 215 }, 216 [0x80] = { 217 /* Group 1 extended opcode */ 218 .op_byte = 0x80, 219 .op_type = VIE_OP_TYPE_GROUP1, 220 .op_flags = VIE_OP_F_IMM8, 221 }, 222 [0x81] = { 223 /* Group 1 extended opcode */ 224 .op_byte = 0x81, 225 .op_type = VIE_OP_TYPE_GROUP1, 226 .op_flags = VIE_OP_F_IMM, 227 }, 228 [0x83] = { 229 /* Group 1 extended opcode */ 230 .op_byte = 0x83, 231 .op_type = VIE_OP_TYPE_GROUP1, 232 .op_flags = VIE_OP_F_IMM8, 233 }, 234 [0x8F] = { 235 /* XXX Group 1A extended opcode - not just POP */ 236 .op_byte = 0x8F, 237 .op_type = VIE_OP_TYPE_POP, 238 }, 239 [0xF7] = { 240 /* XXX Group 3 extended opcode - not just TEST */ 241 .op_byte = 0xF7, 242 .op_type = VIE_OP_TYPE_TEST, 243 .op_flags = VIE_OP_F_IMM, 244 }, 245 [0xFF] = { 246 /* XXX Group 5 extended opcode - not just PUSH */ 247 .op_byte = 0xFF, 248 .op_type = VIE_OP_TYPE_PUSH, 249 } 250 }; 251 252 /* struct vie.mod */ 253 #define VIE_MOD_INDIRECT 0 254 #define VIE_MOD_INDIRECT_DISP8 1 255 #define VIE_MOD_INDIRECT_DISP32 2 256 #define VIE_MOD_DIRECT 3 257 258 /* struct vie.rm */ 259 #define VIE_RM_SIB 4 260 #define VIE_RM_DISP32 5 261 262 #define GB (1024 * 1024 * 1024) 263 264 static enum vm_reg_name gpr_map[16] = { 265 VM_REG_GUEST_RAX, 266 VM_REG_GUEST_RCX, 267 VM_REG_GUEST_RDX, 268 VM_REG_GUEST_RBX, 269 VM_REG_GUEST_RSP, 270 VM_REG_GUEST_RBP, 271 VM_REG_GUEST_RSI, 272 VM_REG_GUEST_RDI, 273 VM_REG_GUEST_R8, 274 VM_REG_GUEST_R9, 275 VM_REG_GUEST_R10, 276 VM_REG_GUEST_R11, 277 VM_REG_GUEST_R12, 278 VM_REG_GUEST_R13, 279 VM_REG_GUEST_R14, 280 VM_REG_GUEST_R15 281 }; 282 283 static uint64_t size2mask[] = { 284 [1] = 0xff, 285 [2] = 0xffff, 286 [4] = 0xffffffff, 287 [8] = 0xffffffffffffffff, 288 }; 289 290 static int 291 vie_read_register(struct vcpu *vcpu, enum vm_reg_name reg, uint64_t *rval) 292 { 293 int error; 294 295 error = vm_get_register(vcpu, reg, rval); 296 297 return (error); 298 } 299 300 static void 301 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) 302 { 303 *lhbr = 0; 304 *reg = gpr_map[vie->reg]; 305 306 /* 307 * 64-bit mode imposes limitations on accessing legacy high byte 308 * registers (lhbr). 309 * 310 * The legacy high-byte registers cannot be addressed if the REX 311 * prefix is present. In this case the values 4, 5, 6 and 7 of the 312 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. 313 * 314 * If the REX prefix is not present then the values 4, 5, 6 and 7 315 * of the 'ModRM:reg' field address the legacy high-byte registers, 316 * %ah, %ch, %dh and %bh respectively. 317 */ 318 if (!vie->rex_present) { 319 if (vie->reg & 0x4) { 320 *lhbr = 1; 321 *reg = gpr_map[vie->reg & 0x3]; 322 } 323 } 324 } 325 326 static int 327 vie_read_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t *rval) 328 { 329 uint64_t val; 330 int error, lhbr; 331 enum vm_reg_name reg; 332 333 vie_calc_bytereg(vie, ®, &lhbr); 334 error = vm_get_register(vcpu, reg, &val); 335 336 /* 337 * To obtain the value of a legacy high byte register shift the 338 * base register right by 8 bits (%ah = %rax >> 8). 339 */ 340 if (lhbr) 341 *rval = val >> 8; 342 else 343 *rval = val; 344 return (error); 345 } 346 347 static int 348 vie_write_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t byte) 349 { 350 uint64_t origval, val, mask; 351 int error, lhbr; 352 enum vm_reg_name reg; 353 354 vie_calc_bytereg(vie, ®, &lhbr); 355 error = vm_get_register(vcpu, reg, &origval); 356 if (error == 0) { 357 val = byte; 358 mask = 0xff; 359 if (lhbr) { 360 /* 361 * Shift left by 8 to store 'byte' in a legacy high 362 * byte register. 363 */ 364 val <<= 8; 365 mask <<= 8; 366 } 367 val |= origval & ~mask; 368 error = vm_set_register(vcpu, reg, val); 369 } 370 return (error); 371 } 372 373 int 374 vie_update_register(struct vcpu *vcpu, enum vm_reg_name reg, 375 uint64_t val, int size) 376 { 377 int error; 378 uint64_t origval; 379 380 switch (size) { 381 case 1: 382 case 2: 383 error = vie_read_register(vcpu, reg, &origval); 384 if (error) 385 return (error); 386 val &= size2mask[size]; 387 val |= origval & ~size2mask[size]; 388 break; 389 case 4: 390 val &= 0xffffffffUL; 391 break; 392 case 8: 393 break; 394 default: 395 return (EINVAL); 396 } 397 398 error = vm_set_register(vcpu, reg, val); 399 return (error); 400 } 401 402 #define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V) 403 404 /* 405 * Return the status flags that would result from doing (x - y). 406 */ 407 #define GETCC(sz) \ 408 static u_long \ 409 getcc##sz(uint##sz##_t x, uint##sz##_t y) \ 410 { \ 411 u_long rflags; \ 412 \ 413 __asm __volatile("sub %2,%1; pushfq; popq %0" : \ 414 "=r" (rflags), "+r" (x) : "m" (y)); \ 415 return (rflags); \ 416 } struct __hack 417 418 GETCC(8); 419 GETCC(16); 420 GETCC(32); 421 GETCC(64); 422 423 static u_long 424 getcc(int opsize, uint64_t x, uint64_t y) 425 { 426 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, 427 ("getcc: invalid operand size %d", opsize)); 428 429 if (opsize == 1) 430 return (getcc8(x, y)); 431 else if (opsize == 2) 432 return (getcc16(x, y)); 433 else if (opsize == 4) 434 return (getcc32(x, y)); 435 else 436 return (getcc64(x, y)); 437 } 438 439 /* 440 * Macro creation of functions getaddflags{8,16,32,64} 441 */ 442 #define GETADDFLAGS(sz) \ 443 static u_long \ 444 getaddflags##sz(uint##sz##_t x, uint##sz##_t y) \ 445 { \ 446 u_long rflags; \ 447 \ 448 __asm __volatile("add %2,%1; pushfq; popq %0" : \ 449 "=r" (rflags), "+r" (x) : "m" (y)); \ 450 return (rflags); \ 451 } struct __hack 452 453 GETADDFLAGS(8); 454 GETADDFLAGS(16); 455 GETADDFLAGS(32); 456 GETADDFLAGS(64); 457 458 static u_long 459 getaddflags(int opsize, uint64_t x, uint64_t y) 460 { 461 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, 462 ("getaddflags: invalid operand size %d", opsize)); 463 464 if (opsize == 1) 465 return (getaddflags8(x, y)); 466 else if (opsize == 2) 467 return (getaddflags16(x, y)); 468 else if (opsize == 4) 469 return (getaddflags32(x, y)); 470 else 471 return (getaddflags64(x, y)); 472 } 473 474 /* 475 * Return the status flags that would result from doing (x & y). 476 */ 477 #define GETANDFLAGS(sz) \ 478 static u_long \ 479 getandflags##sz(uint##sz##_t x, uint##sz##_t y) \ 480 { \ 481 u_long rflags; \ 482 \ 483 __asm __volatile("and %2,%1; pushfq; popq %0" : \ 484 "=r" (rflags), "+r" (x) : "m" (y)); \ 485 return (rflags); \ 486 } struct __hack 487 488 GETANDFLAGS(8); 489 GETANDFLAGS(16); 490 GETANDFLAGS(32); 491 GETANDFLAGS(64); 492 493 static u_long 494 getandflags(int opsize, uint64_t x, uint64_t y) 495 { 496 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, 497 ("getandflags: invalid operand size %d", opsize)); 498 499 if (opsize == 1) 500 return (getandflags8(x, y)); 501 else if (opsize == 2) 502 return (getandflags16(x, y)); 503 else if (opsize == 4) 504 return (getandflags32(x, y)); 505 else 506 return (getandflags64(x, y)); 507 } 508 509 static int 510 emulate_mov(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 511 mem_region_read_t memread, mem_region_write_t memwrite, void *arg) 512 { 513 int error, size; 514 enum vm_reg_name reg; 515 uint8_t byte; 516 uint64_t val; 517 518 size = vie->opsize; 519 error = EINVAL; 520 521 switch (vie->op.op_byte) { 522 case 0x88: 523 /* 524 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) 525 * 88/r: mov r/m8, r8 526 * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) 527 */ 528 size = 1; /* override for byte operation */ 529 error = vie_read_bytereg(vcpu, vie, &byte); 530 if (error == 0) 531 error = memwrite(vcpu, gpa, byte, size, arg); 532 break; 533 case 0x89: 534 /* 535 * MOV from reg (ModRM:reg) to mem (ModRM:r/m) 536 * 89/r: mov r/m16, r16 537 * 89/r: mov r/m32, r32 538 * REX.W + 89/r mov r/m64, r64 539 */ 540 reg = gpr_map[vie->reg]; 541 error = vie_read_register(vcpu, reg, &val); 542 if (error == 0) { 543 val &= size2mask[size]; 544 error = memwrite(vcpu, gpa, val, size, arg); 545 } 546 break; 547 case 0x8A: 548 /* 549 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) 550 * 8A/r: mov r8, r/m8 551 * REX + 8A/r: mov r8, r/m8 552 */ 553 size = 1; /* override for byte operation */ 554 error = memread(vcpu, gpa, &val, size, arg); 555 if (error == 0) 556 error = vie_write_bytereg(vcpu, vie, val); 557 break; 558 case 0x8B: 559 /* 560 * MOV from mem (ModRM:r/m) to reg (ModRM:reg) 561 * 8B/r: mov r16, r/m16 562 * 8B/r: mov r32, r/m32 563 * REX.W 8B/r: mov r64, r/m64 564 */ 565 error = memread(vcpu, gpa, &val, size, arg); 566 if (error == 0) { 567 reg = gpr_map[vie->reg]; 568 error = vie_update_register(vcpu, reg, val, size); 569 } 570 break; 571 case 0xA1: 572 /* 573 * MOV from seg:moffset to AX/EAX/RAX 574 * A1: mov AX, moffs16 575 * A1: mov EAX, moffs32 576 * REX.W + A1: mov RAX, moffs64 577 */ 578 error = memread(vcpu, gpa, &val, size, arg); 579 if (error == 0) { 580 reg = VM_REG_GUEST_RAX; 581 error = vie_update_register(vcpu, reg, val, size); 582 } 583 break; 584 case 0xA3: 585 /* 586 * MOV from AX/EAX/RAX to seg:moffset 587 * A3: mov moffs16, AX 588 * A3: mov moffs32, EAX 589 * REX.W + A3: mov moffs64, RAX 590 */ 591 error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val); 592 if (error == 0) { 593 val &= size2mask[size]; 594 error = memwrite(vcpu, gpa, val, size, arg); 595 } 596 break; 597 case 0xC6: 598 /* 599 * MOV from imm8 to mem (ModRM:r/m) 600 * C6/0 mov r/m8, imm8 601 * REX + C6/0 mov r/m8, imm8 602 */ 603 size = 1; /* override for byte operation */ 604 error = memwrite(vcpu, gpa, vie->immediate, size, arg); 605 break; 606 case 0xC7: 607 /* 608 * MOV from imm16/imm32 to mem (ModRM:r/m) 609 * C7/0 mov r/m16, imm16 610 * C7/0 mov r/m32, imm32 611 * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) 612 */ 613 val = vie->immediate & size2mask[size]; 614 error = memwrite(vcpu, gpa, val, size, arg); 615 break; 616 default: 617 break; 618 } 619 620 return (error); 621 } 622 623 static int 624 emulate_movx(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 625 mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) 626 { 627 int error, size; 628 enum vm_reg_name reg; 629 uint64_t val; 630 631 size = vie->opsize; 632 error = EINVAL; 633 634 switch (vie->op.op_byte) { 635 case 0xB6: 636 /* 637 * MOV and zero extend byte from mem (ModRM:r/m) to 638 * reg (ModRM:reg). 639 * 640 * 0F B6/r movzx r16, r/m8 641 * 0F B6/r movzx r32, r/m8 642 * REX.W + 0F B6/r movzx r64, r/m8 643 */ 644 645 /* get the first operand */ 646 error = memread(vcpu, gpa, &val, 1, arg); 647 if (error) 648 break; 649 650 /* get the second operand */ 651 reg = gpr_map[vie->reg]; 652 653 /* zero-extend byte */ 654 val = (uint8_t)val; 655 656 /* write the result */ 657 error = vie_update_register(vcpu, reg, val, size); 658 break; 659 case 0xB7: 660 /* 661 * MOV and zero extend word from mem (ModRM:r/m) to 662 * reg (ModRM:reg). 663 * 664 * 0F B7/r movzx r32, r/m16 665 * REX.W + 0F B7/r movzx r64, r/m16 666 */ 667 error = memread(vcpu, gpa, &val, 2, arg); 668 if (error) 669 return (error); 670 671 reg = gpr_map[vie->reg]; 672 673 /* zero-extend word */ 674 val = (uint16_t)val; 675 676 error = vie_update_register(vcpu, reg, val, size); 677 break; 678 case 0xBE: 679 /* 680 * MOV and sign extend byte from mem (ModRM:r/m) to 681 * reg (ModRM:reg). 682 * 683 * 0F BE/r movsx r16, r/m8 684 * 0F BE/r movsx r32, r/m8 685 * REX.W + 0F BE/r movsx r64, r/m8 686 */ 687 688 /* get the first operand */ 689 error = memread(vcpu, gpa, &val, 1, arg); 690 if (error) 691 break; 692 693 /* get the second operand */ 694 reg = gpr_map[vie->reg]; 695 696 /* sign extend byte */ 697 val = (int8_t)val; 698 699 /* write the result */ 700 error = vie_update_register(vcpu, reg, val, size); 701 break; 702 default: 703 break; 704 } 705 return (error); 706 } 707 708 /* 709 * Helper function to calculate and validate a linear address. 710 */ 711 static int 712 get_gla(struct vcpu *vcpu, struct vie *vie __unused, 713 struct vm_guest_paging *paging, int opsize, int addrsize, int prot, 714 enum vm_reg_name seg, enum vm_reg_name gpr, uint64_t *gla, int *fault) 715 { 716 struct seg_desc desc; 717 uint64_t cr0, val, rflags; 718 int error __diagused; 719 720 error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0); 721 KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); 722 723 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); 724 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 725 726 error = vm_get_seg_desc(vcpu, seg, &desc); 727 KASSERT(error == 0, ("%s: error %d getting segment descriptor %d", 728 __func__, error, seg)); 729 730 error = vie_read_register(vcpu, gpr, &val); 731 KASSERT(error == 0, ("%s: error %d getting register %d", __func__, 732 error, gpr)); 733 734 if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize, 735 addrsize, prot, gla)) { 736 if (seg == VM_REG_GUEST_SS) 737 vm_inject_ss(vcpu, 0); 738 else 739 vm_inject_gp(vcpu); 740 goto guest_fault; 741 } 742 743 if (vie_canonical_check(paging->cpu_mode, *gla)) { 744 if (seg == VM_REG_GUEST_SS) 745 vm_inject_ss(vcpu, 0); 746 else 747 vm_inject_gp(vcpu); 748 goto guest_fault; 749 } 750 751 if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { 752 vm_inject_ac(vcpu, 0); 753 goto guest_fault; 754 } 755 756 *fault = 0; 757 return (0); 758 759 guest_fault: 760 *fault = 1; 761 return (0); 762 } 763 764 static int 765 emulate_movs(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 766 struct vm_guest_paging *paging, mem_region_read_t memread, 767 mem_region_write_t memwrite, void *arg) 768 { 769 #ifdef _KERNEL 770 struct vm_copyinfo copyinfo[2]; 771 #else 772 struct iovec copyinfo[2]; 773 #endif 774 uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; 775 uint64_t rcx, rdi, rsi, rflags; 776 int error, fault, opsize, seg, repeat; 777 778 opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize; 779 val = 0; 780 error = 0; 781 782 /* 783 * XXX although the MOVS instruction is only supposed to be used with 784 * the "rep" prefix some guests like FreeBSD will use "repnz" instead. 785 * 786 * Empirically the "repnz" prefix has identical behavior to "rep" 787 * and the zero flag does not make a difference. 788 */ 789 repeat = vie->repz_present | vie->repnz_present; 790 791 if (repeat) { 792 error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx); 793 KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); 794 795 /* 796 * The count register is %rcx, %ecx or %cx depending on the 797 * address size of the instruction. 798 */ 799 if ((rcx & vie_size2mask(vie->addrsize)) == 0) { 800 error = 0; 801 goto done; 802 } 803 } 804 805 /* 806 * Source Destination Comments 807 * -------------------------------------------- 808 * (1) memory memory n/a 809 * (2) memory mmio emulated 810 * (3) mmio memory emulated 811 * (4) mmio mmio emulated 812 * 813 * At this point we don't have sufficient information to distinguish 814 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this 815 * out because it will succeed only when operating on regular memory. 816 * 817 * XXX the emulation doesn't properly handle the case where 'gpa' 818 * is straddling the boundary between the normal memory and MMIO. 819 */ 820 821 seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS; 822 error = get_gla(vcpu, vie, paging, opsize, vie->addrsize, 823 PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault); 824 if (error || fault) 825 goto done; 826 827 error = vm_copy_setup(vcpu, paging, srcaddr, opsize, PROT_READ, 828 copyinfo, nitems(copyinfo), &fault); 829 if (error == 0) { 830 if (fault) 831 goto done; /* Resume guest to handle fault */ 832 833 /* 834 * case (2): read from system memory and write to mmio. 835 */ 836 vm_copyin(copyinfo, &val, opsize); 837 vm_copy_teardown(copyinfo, nitems(copyinfo)); 838 error = memwrite(vcpu, gpa, val, opsize, arg); 839 if (error) 840 goto done; 841 } else { 842 /* 843 * 'vm_copy_setup()' is expected to fail for cases (3) and (4) 844 * if 'srcaddr' is in the mmio space. 845 */ 846 847 error = get_gla(vcpu, vie, paging, opsize, vie->addrsize, 848 PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr, 849 &fault); 850 if (error || fault) 851 goto done; 852 853 error = vm_copy_setup(vcpu, paging, dstaddr, opsize, 854 PROT_WRITE, copyinfo, nitems(copyinfo), &fault); 855 if (error == 0) { 856 if (fault) 857 goto done; /* Resume guest to handle fault */ 858 859 /* 860 * case (3): read from MMIO and write to system memory. 861 * 862 * A MMIO read can have side-effects so we 863 * commit to it only after vm_copy_setup() is 864 * successful. If a page-fault needs to be 865 * injected into the guest then it will happen 866 * before the MMIO read is attempted. 867 */ 868 error = memread(vcpu, gpa, &val, opsize, arg); 869 if (error) 870 goto done; 871 872 vm_copyout(&val, copyinfo, opsize); 873 vm_copy_teardown(copyinfo, nitems(copyinfo)); 874 } else { 875 /* 876 * Case (4): read from and write to mmio. 877 * 878 * Commit to the MMIO read/write (with potential 879 * side-effects) only after we are sure that the 880 * instruction is not going to be restarted due 881 * to address translation faults. 882 */ 883 error = vm_gla2gpa(vcpu, paging, srcaddr, 884 PROT_READ, &srcgpa, &fault); 885 if (error || fault) 886 goto done; 887 888 error = vm_gla2gpa(vcpu, paging, dstaddr, 889 PROT_WRITE, &dstgpa, &fault); 890 if (error || fault) 891 goto done; 892 893 error = memread(vcpu, srcgpa, &val, opsize, arg); 894 if (error) 895 goto done; 896 897 error = memwrite(vcpu, dstgpa, val, opsize, arg); 898 if (error) 899 goto done; 900 } 901 } 902 903 error = vie_read_register(vcpu, VM_REG_GUEST_RSI, &rsi); 904 KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); 905 906 error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi); 907 KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); 908 909 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); 910 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 911 912 if (rflags & PSL_D) { 913 rsi -= opsize; 914 rdi -= opsize; 915 } else { 916 rsi += opsize; 917 rdi += opsize; 918 } 919 920 error = vie_update_register(vcpu, VM_REG_GUEST_RSI, rsi, 921 vie->addrsize); 922 KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error)); 923 924 error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi, 925 vie->addrsize); 926 KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); 927 928 if (repeat) { 929 rcx = rcx - 1; 930 error = vie_update_register(vcpu, VM_REG_GUEST_RCX, 931 rcx, vie->addrsize); 932 KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); 933 934 /* 935 * Repeat the instruction if the count register is not zero. 936 */ 937 if ((rcx & vie_size2mask(vie->addrsize)) != 0) 938 vm_restart_instruction(vcpu); 939 } 940 done: 941 KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d", 942 __func__, error)); 943 return (error); 944 } 945 946 static int 947 emulate_stos(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 948 struct vm_guest_paging *paging __unused, mem_region_read_t memread __unused, 949 mem_region_write_t memwrite, void *arg) 950 { 951 int error, opsize, repeat; 952 uint64_t val; 953 uint64_t rcx, rdi, rflags; 954 955 opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize; 956 repeat = vie->repz_present | vie->repnz_present; 957 958 if (repeat) { 959 error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx); 960 KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); 961 962 /* 963 * The count register is %rcx, %ecx or %cx depending on the 964 * address size of the instruction. 965 */ 966 if ((rcx & vie_size2mask(vie->addrsize)) == 0) 967 return (0); 968 } 969 970 error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val); 971 KASSERT(!error, ("%s: error %d getting rax", __func__, error)); 972 973 error = memwrite(vcpu, gpa, val, opsize, arg); 974 if (error) 975 return (error); 976 977 error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi); 978 KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); 979 980 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); 981 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 982 983 if (rflags & PSL_D) 984 rdi -= opsize; 985 else 986 rdi += opsize; 987 988 error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi, 989 vie->addrsize); 990 KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); 991 992 if (repeat) { 993 rcx = rcx - 1; 994 error = vie_update_register(vcpu, VM_REG_GUEST_RCX, 995 rcx, vie->addrsize); 996 KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); 997 998 /* 999 * Repeat the instruction if the count register is not zero. 1000 */ 1001 if ((rcx & vie_size2mask(vie->addrsize)) != 0) 1002 vm_restart_instruction(vcpu); 1003 } 1004 1005 return (0); 1006 } 1007 1008 static int 1009 emulate_and(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1010 mem_region_read_t memread, mem_region_write_t memwrite, void *arg) 1011 { 1012 int error, size; 1013 enum vm_reg_name reg; 1014 uint64_t result, rflags, rflags2, val1, val2; 1015 1016 size = vie->opsize; 1017 error = EINVAL; 1018 1019 switch (vie->op.op_byte) { 1020 case 0x23: 1021 /* 1022 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the 1023 * result in reg. 1024 * 1025 * 23/r and r16, r/m16 1026 * 23/r and r32, r/m32 1027 * REX.W + 23/r and r64, r/m64 1028 */ 1029 1030 /* get the first operand */ 1031 reg = gpr_map[vie->reg]; 1032 error = vie_read_register(vcpu, reg, &val1); 1033 if (error) 1034 break; 1035 1036 /* get the second operand */ 1037 error = memread(vcpu, gpa, &val2, size, arg); 1038 if (error) 1039 break; 1040 1041 /* perform the operation and write the result */ 1042 result = val1 & val2; 1043 error = vie_update_register(vcpu, reg, result, size); 1044 break; 1045 case 0x81: 1046 case 0x83: 1047 /* 1048 * AND mem (ModRM:r/m) with immediate and store the 1049 * result in mem. 1050 * 1051 * 81 /4 and r/m16, imm16 1052 * 81 /4 and r/m32, imm32 1053 * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 1054 * 1055 * 83 /4 and r/m16, imm8 sign-extended to 16 1056 * 83 /4 and r/m32, imm8 sign-extended to 32 1057 * REX.W + 83/4 and r/m64, imm8 sign-extended to 64 1058 */ 1059 1060 /* get the first operand */ 1061 error = memread(vcpu, gpa, &val1, size, arg); 1062 if (error) 1063 break; 1064 1065 /* 1066 * perform the operation with the pre-fetched immediate 1067 * operand and write the result 1068 */ 1069 result = val1 & vie->immediate; 1070 error = memwrite(vcpu, gpa, result, size, arg); 1071 break; 1072 default: 1073 break; 1074 } 1075 if (error) 1076 return (error); 1077 1078 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); 1079 if (error) 1080 return (error); 1081 1082 /* 1083 * OF and CF are cleared; the SF, ZF and PF flags are set according 1084 * to the result; AF is undefined. 1085 * 1086 * The updated status flags are obtained by subtracting 0 from 'result'. 1087 */ 1088 rflags2 = getcc(size, result, 0); 1089 rflags &= ~RFLAGS_STATUS_BITS; 1090 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); 1091 1092 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); 1093 return (error); 1094 } 1095 1096 static int 1097 emulate_or(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1098 mem_region_read_t memread, mem_region_write_t memwrite, void *arg) 1099 { 1100 int error, size; 1101 enum vm_reg_name reg; 1102 uint64_t result, rflags, rflags2, val1, val2; 1103 1104 size = vie->opsize; 1105 error = EINVAL; 1106 1107 switch (vie->op.op_byte) { 1108 case 0x0B: 1109 /* 1110 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the 1111 * result in reg. 1112 * 1113 * 0b/r or r16, r/m16 1114 * 0b/r or r32, r/m32 1115 * REX.W + 0b/r or r64, r/m64 1116 */ 1117 1118 /* get the first operand */ 1119 reg = gpr_map[vie->reg]; 1120 error = vie_read_register(vcpu, reg, &val1); 1121 if (error) 1122 break; 1123 1124 /* get the second operand */ 1125 error = memread(vcpu, gpa, &val2, size, arg); 1126 if (error) 1127 break; 1128 1129 /* perform the operation and write the result */ 1130 result = val1 | val2; 1131 error = vie_update_register(vcpu, reg, result, size); 1132 break; 1133 case 0x81: 1134 case 0x83: 1135 /* 1136 * OR mem (ModRM:r/m) with immediate and store the 1137 * result in mem. 1138 * 1139 * 81 /1 or r/m16, imm16 1140 * 81 /1 or r/m32, imm32 1141 * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64 1142 * 1143 * 83 /1 or r/m16, imm8 sign-extended to 16 1144 * 83 /1 or r/m32, imm8 sign-extended to 32 1145 * REX.W + 83/1 or r/m64, imm8 sign-extended to 64 1146 */ 1147 1148 /* get the first operand */ 1149 error = memread(vcpu, gpa, &val1, size, arg); 1150 if (error) 1151 break; 1152 1153 /* 1154 * perform the operation with the pre-fetched immediate 1155 * operand and write the result 1156 */ 1157 result = val1 | vie->immediate; 1158 error = memwrite(vcpu, gpa, result, size, arg); 1159 break; 1160 default: 1161 break; 1162 } 1163 if (error) 1164 return (error); 1165 1166 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); 1167 if (error) 1168 return (error); 1169 1170 /* 1171 * OF and CF are cleared; the SF, ZF and PF flags are set according 1172 * to the result; AF is undefined. 1173 * 1174 * The updated status flags are obtained by subtracting 0 from 'result'. 1175 */ 1176 rflags2 = getcc(size, result, 0); 1177 rflags &= ~RFLAGS_STATUS_BITS; 1178 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); 1179 1180 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); 1181 return (error); 1182 } 1183 1184 static int 1185 emulate_cmp(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1186 mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) 1187 { 1188 int error, size; 1189 uint64_t regop, memop, op1, op2, rflags, rflags2; 1190 enum vm_reg_name reg; 1191 1192 size = vie->opsize; 1193 switch (vie->op.op_byte) { 1194 case 0x39: 1195 case 0x3B: 1196 /* 1197 * 39/r CMP r/m16, r16 1198 * 39/r CMP r/m32, r32 1199 * REX.W 39/r CMP r/m64, r64 1200 * 1201 * 3B/r CMP r16, r/m16 1202 * 3B/r CMP r32, r/m32 1203 * REX.W + 3B/r CMP r64, r/m64 1204 * 1205 * Compare the first operand with the second operand and 1206 * set status flags in EFLAGS register. The comparison is 1207 * performed by subtracting the second operand from the first 1208 * operand and then setting the status flags. 1209 */ 1210 1211 /* Get the register operand */ 1212 reg = gpr_map[vie->reg]; 1213 error = vie_read_register(vcpu, reg, ®op); 1214 if (error) 1215 return (error); 1216 1217 /* Get the memory operand */ 1218 error = memread(vcpu, gpa, &memop, size, arg); 1219 if (error) 1220 return (error); 1221 1222 if (vie->op.op_byte == 0x3B) { 1223 op1 = regop; 1224 op2 = memop; 1225 } else { 1226 op1 = memop; 1227 op2 = regop; 1228 } 1229 rflags2 = getcc(size, op1, op2); 1230 break; 1231 case 0x80: 1232 case 0x81: 1233 case 0x83: 1234 /* 1235 * 80 /7 cmp r/m8, imm8 1236 * REX + 80 /7 cmp r/m8, imm8 1237 * 1238 * 81 /7 cmp r/m16, imm16 1239 * 81 /7 cmp r/m32, imm32 1240 * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64 1241 * 1242 * 83 /7 cmp r/m16, imm8 sign-extended to 16 1243 * 83 /7 cmp r/m32, imm8 sign-extended to 32 1244 * REX.W + 83 /7 cmp r/m64, imm8 sign-extended to 64 1245 * 1246 * Compare mem (ModRM:r/m) with immediate and set 1247 * status flags according to the results. The 1248 * comparison is performed by subtracting the 1249 * immediate from the first operand and then setting 1250 * the status flags. 1251 * 1252 */ 1253 if (vie->op.op_byte == 0x80) 1254 size = 1; 1255 1256 /* get the first operand */ 1257 error = memread(vcpu, gpa, &op1, size, arg); 1258 if (error) 1259 return (error); 1260 1261 rflags2 = getcc(size, op1, vie->immediate); 1262 break; 1263 default: 1264 return (EINVAL); 1265 } 1266 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); 1267 if (error) 1268 return (error); 1269 rflags &= ~RFLAGS_STATUS_BITS; 1270 rflags |= rflags2 & RFLAGS_STATUS_BITS; 1271 1272 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); 1273 return (error); 1274 } 1275 1276 static int 1277 emulate_test(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1278 mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) 1279 { 1280 int error, size; 1281 uint64_t op1, rflags, rflags2; 1282 1283 size = vie->opsize; 1284 error = EINVAL; 1285 1286 switch (vie->op.op_byte) { 1287 case 0xF7: 1288 /* 1289 * F7 /0 test r/m16, imm16 1290 * F7 /0 test r/m32, imm32 1291 * REX.W + F7 /0 test r/m64, imm32 sign-extended to 64 1292 * 1293 * Test mem (ModRM:r/m) with immediate and set status 1294 * flags according to the results. The comparison is 1295 * performed by anding the immediate from the first 1296 * operand and then setting the status flags. 1297 */ 1298 if ((vie->reg & 7) != 0) 1299 return (EINVAL); 1300 1301 error = memread(vcpu, gpa, &op1, size, arg); 1302 if (error) 1303 return (error); 1304 1305 rflags2 = getandflags(size, op1, vie->immediate); 1306 break; 1307 default: 1308 return (EINVAL); 1309 } 1310 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); 1311 if (error) 1312 return (error); 1313 1314 /* 1315 * OF and CF are cleared; the SF, ZF and PF flags are set according 1316 * to the result; AF is undefined. 1317 */ 1318 rflags &= ~RFLAGS_STATUS_BITS; 1319 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); 1320 1321 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); 1322 return (error); 1323 } 1324 1325 static int 1326 emulate_bextr(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1327 struct vm_guest_paging *paging, mem_region_read_t memread, 1328 mem_region_write_t memwrite __unused, void *arg) 1329 { 1330 uint64_t src1, src2, dst, rflags; 1331 unsigned start, len, size; 1332 int error; 1333 1334 size = vie->opsize; 1335 error = EINVAL; 1336 1337 /* 1338 * VEX.LZ.0F38.W0 F7 /r BEXTR r32a, r/m32, r32b 1339 * VEX.LZ.0F38.W1 F7 /r BEXTR r64a, r/m64, r64b 1340 * 1341 * Destination operand is ModRM:reg. Source operands are ModRM:r/m and 1342 * Vex.vvvv. 1343 * 1344 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored). 1345 */ 1346 if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT) 1347 size = 4; 1348 1349 /* 1350 * Extracts contiguous bits from the first /source/ operand (second 1351 * operand) using an index and length specified in the second /source/ 1352 * operand (third operand). 1353 */ 1354 error = memread(vcpu, gpa, &src1, size, arg); 1355 if (error) 1356 return (error); 1357 error = vie_read_register(vcpu, gpr_map[vie->vex_reg], &src2); 1358 if (error) 1359 return (error); 1360 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); 1361 if (error) 1362 return (error); 1363 1364 start = (src2 & 0xff); 1365 len = (src2 & 0xff00) >> 8; 1366 1367 /* If no bits are extracted, the destination register is cleared. */ 1368 dst = 0; 1369 1370 /* If START exceeds the operand size, no bits are extracted. */ 1371 if (start > size * 8) 1372 goto done; 1373 /* Length is bounded by both the destination size and start offset. */ 1374 if (start + len > size * 8) 1375 len = (size * 8) - start; 1376 if (len == 0) 1377 goto done; 1378 1379 if (start > 0) 1380 src1 = (src1 >> start); 1381 if (len < 64) 1382 src1 = src1 & ((1ull << len) - 1); 1383 dst = src1; 1384 1385 done: 1386 error = vie_update_register(vcpu, gpr_map[vie->reg], dst, size); 1387 if (error) 1388 return (error); 1389 1390 /* 1391 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result. 1392 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared. 1393 */ 1394 rflags &= ~RFLAGS_STATUS_BITS; 1395 if (dst == 0) 1396 rflags |= PSL_Z; 1397 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 1398 8); 1399 return (error); 1400 } 1401 1402 static int 1403 emulate_add(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1404 mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) 1405 { 1406 int error, size; 1407 uint64_t nval, rflags, rflags2, val1, val2; 1408 enum vm_reg_name reg; 1409 1410 size = vie->opsize; 1411 error = EINVAL; 1412 1413 switch (vie->op.op_byte) { 1414 case 0x03: 1415 /* 1416 * ADD r/m to r and store the result in r 1417 * 1418 * 03/r ADD r16, r/m16 1419 * 03/r ADD r32, r/m32 1420 * REX.W + 03/r ADD r64, r/m64 1421 */ 1422 1423 /* get the first operand */ 1424 reg = gpr_map[vie->reg]; 1425 error = vie_read_register(vcpu, reg, &val1); 1426 if (error) 1427 break; 1428 1429 /* get the second operand */ 1430 error = memread(vcpu, gpa, &val2, size, arg); 1431 if (error) 1432 break; 1433 1434 /* perform the operation and write the result */ 1435 nval = val1 + val2; 1436 error = vie_update_register(vcpu, reg, nval, size); 1437 break; 1438 default: 1439 break; 1440 } 1441 1442 if (!error) { 1443 rflags2 = getaddflags(size, val1, val2); 1444 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, 1445 &rflags); 1446 if (error) 1447 return (error); 1448 1449 rflags &= ~RFLAGS_STATUS_BITS; 1450 rflags |= rflags2 & RFLAGS_STATUS_BITS; 1451 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, 1452 rflags, 8); 1453 } 1454 1455 return (error); 1456 } 1457 1458 static int 1459 emulate_sub(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1460 mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) 1461 { 1462 int error, size; 1463 uint64_t nval, rflags, rflags2, val1, val2; 1464 enum vm_reg_name reg; 1465 1466 size = vie->opsize; 1467 error = EINVAL; 1468 1469 switch (vie->op.op_byte) { 1470 case 0x2B: 1471 /* 1472 * SUB r/m from r and store the result in r 1473 * 1474 * 2B/r SUB r16, r/m16 1475 * 2B/r SUB r32, r/m32 1476 * REX.W + 2B/r SUB r64, r/m64 1477 */ 1478 1479 /* get the first operand */ 1480 reg = gpr_map[vie->reg]; 1481 error = vie_read_register(vcpu, reg, &val1); 1482 if (error) 1483 break; 1484 1485 /* get the second operand */ 1486 error = memread(vcpu, gpa, &val2, size, arg); 1487 if (error) 1488 break; 1489 1490 /* perform the operation and write the result */ 1491 nval = val1 - val2; 1492 error = vie_update_register(vcpu, reg, nval, size); 1493 break; 1494 default: 1495 break; 1496 } 1497 1498 if (!error) { 1499 rflags2 = getcc(size, val1, val2); 1500 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, 1501 &rflags); 1502 if (error) 1503 return (error); 1504 1505 rflags &= ~RFLAGS_STATUS_BITS; 1506 rflags |= rflags2 & RFLAGS_STATUS_BITS; 1507 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, 1508 rflags, 8); 1509 } 1510 1511 return (error); 1512 } 1513 1514 static int 1515 emulate_stack_op(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie, 1516 struct vm_guest_paging *paging, mem_region_read_t memread, 1517 mem_region_write_t memwrite, void *arg) 1518 { 1519 #ifdef _KERNEL 1520 struct vm_copyinfo copyinfo[2]; 1521 #else 1522 struct iovec copyinfo[2]; 1523 #endif 1524 struct seg_desc ss_desc; 1525 uint64_t cr0, rflags, rsp, stack_gla, val; 1526 int error, fault, size, stackaddrsize, pushop; 1527 1528 val = 0; 1529 size = vie->opsize; 1530 pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0; 1531 1532 /* 1533 * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 1534 */ 1535 if (paging->cpu_mode == CPU_MODE_REAL) { 1536 stackaddrsize = 2; 1537 } else if (paging->cpu_mode == CPU_MODE_64BIT) { 1538 /* 1539 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 1540 * - Stack pointer size is always 64-bits. 1541 * - PUSH/POP of 32-bit values is not possible in 64-bit mode. 1542 * - 16-bit PUSH/POP is supported by using the operand size 1543 * override prefix (66H). 1544 */ 1545 stackaddrsize = 8; 1546 size = vie->opsize_override ? 2 : 8; 1547 } else { 1548 /* 1549 * In protected or compatibility mode the 'B' flag in the 1550 * stack-segment descriptor determines the size of the 1551 * stack pointer. 1552 */ 1553 error = vm_get_seg_desc(vcpu, VM_REG_GUEST_SS, &ss_desc); 1554 KASSERT(error == 0, ("%s: error %d getting SS descriptor", 1555 __func__, error)); 1556 if (SEG_DESC_DEF32(ss_desc.access)) 1557 stackaddrsize = 4; 1558 else 1559 stackaddrsize = 2; 1560 } 1561 1562 error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0); 1563 KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); 1564 1565 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); 1566 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 1567 1568 error = vie_read_register(vcpu, VM_REG_GUEST_RSP, &rsp); 1569 KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); 1570 if (pushop) { 1571 rsp -= size; 1572 } 1573 1574 if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, 1575 rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ, 1576 &stack_gla)) { 1577 vm_inject_ss(vcpu, 0); 1578 return (0); 1579 } 1580 1581 if (vie_canonical_check(paging->cpu_mode, stack_gla)) { 1582 vm_inject_ss(vcpu, 0); 1583 return (0); 1584 } 1585 1586 if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { 1587 vm_inject_ac(vcpu, 0); 1588 return (0); 1589 } 1590 1591 error = vm_copy_setup(vcpu, paging, stack_gla, size, 1592 pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo), 1593 &fault); 1594 if (error || fault) 1595 return (error); 1596 1597 if (pushop) { 1598 error = memread(vcpu, mmio_gpa, &val, size, arg); 1599 if (error == 0) 1600 vm_copyout(&val, copyinfo, size); 1601 } else { 1602 vm_copyin(copyinfo, &val, size); 1603 error = memwrite(vcpu, mmio_gpa, val, size, arg); 1604 rsp += size; 1605 } 1606 vm_copy_teardown(copyinfo, nitems(copyinfo)); 1607 1608 if (error == 0) { 1609 error = vie_update_register(vcpu, VM_REG_GUEST_RSP, rsp, 1610 stackaddrsize); 1611 KASSERT(error == 0, ("error %d updating rsp", error)); 1612 } 1613 return (error); 1614 } 1615 1616 static int 1617 emulate_push(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie, 1618 struct vm_guest_paging *paging, mem_region_read_t memread, 1619 mem_region_write_t memwrite, void *arg) 1620 { 1621 int error; 1622 1623 /* 1624 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. 1625 * 1626 * PUSH is part of the group 5 extended opcodes and is identified 1627 * by ModRM:reg = b110. 1628 */ 1629 if ((vie->reg & 7) != 6) 1630 return (EINVAL); 1631 1632 error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread, 1633 memwrite, arg); 1634 return (error); 1635 } 1636 1637 static int 1638 emulate_pop(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie, 1639 struct vm_guest_paging *paging, mem_region_read_t memread, 1640 mem_region_write_t memwrite, void *arg) 1641 { 1642 int error; 1643 1644 /* 1645 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. 1646 * 1647 * POP is part of the group 1A extended opcodes and is identified 1648 * by ModRM:reg = b000. 1649 */ 1650 if ((vie->reg & 7) != 0) 1651 return (EINVAL); 1652 1653 error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread, 1654 memwrite, arg); 1655 return (error); 1656 } 1657 1658 static int 1659 emulate_group1(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1660 struct vm_guest_paging *paging __unused, mem_region_read_t memread, 1661 mem_region_write_t memwrite, void *memarg) 1662 { 1663 int error; 1664 1665 switch (vie->reg & 7) { 1666 case 0x1: /* OR */ 1667 error = emulate_or(vcpu, gpa, vie, 1668 memread, memwrite, memarg); 1669 break; 1670 case 0x4: /* AND */ 1671 error = emulate_and(vcpu, gpa, vie, 1672 memread, memwrite, memarg); 1673 break; 1674 case 0x7: /* CMP */ 1675 error = emulate_cmp(vcpu, gpa, vie, 1676 memread, memwrite, memarg); 1677 break; 1678 default: 1679 error = EINVAL; 1680 break; 1681 } 1682 1683 return (error); 1684 } 1685 1686 static int 1687 emulate_bittest(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1688 mem_region_read_t memread, mem_region_write_t memwrite __unused, 1689 void *memarg) 1690 { 1691 uint64_t val, rflags; 1692 int error, bitmask, bitoff; 1693 1694 /* 1695 * 0F BA is a Group 8 extended opcode. 1696 * 1697 * Currently we only emulate the 'Bit Test' instruction which is 1698 * identified by a ModR/M:reg encoding of 100b. 1699 */ 1700 if ((vie->reg & 7) != 4) 1701 return (EINVAL); 1702 1703 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); 1704 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); 1705 1706 error = memread(vcpu, gpa, &val, vie->opsize, memarg); 1707 if (error) 1708 return (error); 1709 1710 /* 1711 * Intel SDM, Vol 2, Table 3-2: 1712 * "Range of Bit Positions Specified by Bit Offset Operands" 1713 */ 1714 bitmask = vie->opsize * 8 - 1; 1715 bitoff = vie->immediate & bitmask; 1716 1717 /* Copy the bit into the Carry flag in %rflags */ 1718 if (val & (1UL << bitoff)) 1719 rflags |= PSL_C; 1720 else 1721 rflags &= ~PSL_C; 1722 1723 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); 1724 KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error)); 1725 1726 return (0); 1727 } 1728 1729 static int 1730 emulate_twob_group15(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1731 mem_region_read_t memread, mem_region_write_t memwrite __unused, 1732 void *memarg) 1733 { 1734 int error; 1735 uint64_t buf; 1736 1737 switch (vie->reg & 7) { 1738 case 0x7: /* CLFLUSH, CLFLUSHOPT, and SFENCE */ 1739 if (vie->mod == 0x3) { 1740 /* 1741 * SFENCE. Ignore it, VM exit provides enough 1742 * barriers on its own. 1743 */ 1744 error = 0; 1745 } else { 1746 /* 1747 * CLFLUSH, CLFLUSHOPT. Only check for access 1748 * rights. 1749 */ 1750 error = memread(vcpu, gpa, &buf, 1, memarg); 1751 } 1752 break; 1753 default: 1754 error = EINVAL; 1755 break; 1756 } 1757 1758 return (error); 1759 } 1760 1761 int 1762 vmm_emulate_instruction(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, 1763 struct vm_guest_paging *paging, mem_region_read_t memread, 1764 mem_region_write_t memwrite, void *memarg) 1765 { 1766 int error; 1767 1768 if (!vie->decoded) 1769 return (EINVAL); 1770 1771 switch (vie->op.op_type) { 1772 case VIE_OP_TYPE_GROUP1: 1773 error = emulate_group1(vcpu, gpa, vie, paging, memread, 1774 memwrite, memarg); 1775 break; 1776 case VIE_OP_TYPE_POP: 1777 error = emulate_pop(vcpu, gpa, vie, paging, memread, 1778 memwrite, memarg); 1779 break; 1780 case VIE_OP_TYPE_PUSH: 1781 error = emulate_push(vcpu, gpa, vie, paging, memread, 1782 memwrite, memarg); 1783 break; 1784 case VIE_OP_TYPE_CMP: 1785 error = emulate_cmp(vcpu, gpa, vie, 1786 memread, memwrite, memarg); 1787 break; 1788 case VIE_OP_TYPE_MOV: 1789 error = emulate_mov(vcpu, gpa, vie, 1790 memread, memwrite, memarg); 1791 break; 1792 case VIE_OP_TYPE_MOVSX: 1793 case VIE_OP_TYPE_MOVZX: 1794 error = emulate_movx(vcpu, gpa, vie, 1795 memread, memwrite, memarg); 1796 break; 1797 case VIE_OP_TYPE_MOVS: 1798 error = emulate_movs(vcpu, gpa, vie, paging, memread, 1799 memwrite, memarg); 1800 break; 1801 case VIE_OP_TYPE_STOS: 1802 error = emulate_stos(vcpu, gpa, vie, paging, memread, 1803 memwrite, memarg); 1804 break; 1805 case VIE_OP_TYPE_AND: 1806 error = emulate_and(vcpu, gpa, vie, 1807 memread, memwrite, memarg); 1808 break; 1809 case VIE_OP_TYPE_OR: 1810 error = emulate_or(vcpu, gpa, vie, 1811 memread, memwrite, memarg); 1812 break; 1813 case VIE_OP_TYPE_SUB: 1814 error = emulate_sub(vcpu, gpa, vie, 1815 memread, memwrite, memarg); 1816 break; 1817 case VIE_OP_TYPE_BITTEST: 1818 error = emulate_bittest(vcpu, gpa, vie, 1819 memread, memwrite, memarg); 1820 break; 1821 case VIE_OP_TYPE_TWOB_GRP15: 1822 error = emulate_twob_group15(vcpu, gpa, vie, 1823 memread, memwrite, memarg); 1824 break; 1825 case VIE_OP_TYPE_ADD: 1826 error = emulate_add(vcpu, gpa, vie, memread, 1827 memwrite, memarg); 1828 break; 1829 case VIE_OP_TYPE_TEST: 1830 error = emulate_test(vcpu, gpa, vie, 1831 memread, memwrite, memarg); 1832 break; 1833 case VIE_OP_TYPE_BEXTR: 1834 error = emulate_bextr(vcpu, gpa, vie, paging, 1835 memread, memwrite, memarg); 1836 break; 1837 default: 1838 error = EINVAL; 1839 break; 1840 } 1841 1842 return (error); 1843 } 1844 1845 int 1846 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) 1847 { 1848 KASSERT(size == 1 || size == 2 || size == 4 || size == 8, 1849 ("%s: invalid size %d", __func__, size)); 1850 KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); 1851 1852 if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) 1853 return (0); 1854 1855 return ((gla & (size - 1)) ? 1 : 0); 1856 } 1857 1858 int 1859 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) 1860 { 1861 uint64_t mask; 1862 1863 if (cpu_mode != CPU_MODE_64BIT) 1864 return (0); 1865 1866 /* 1867 * The value of the bit 47 in the 'gla' should be replicated in the 1868 * most significant 16 bits. 1869 */ 1870 mask = ~((1UL << 48) - 1); 1871 if (gla & (1UL << 47)) 1872 return ((gla & mask) != mask); 1873 else 1874 return ((gla & mask) != 0); 1875 } 1876 1877 uint64_t 1878 vie_size2mask(int size) 1879 { 1880 KASSERT(size == 1 || size == 2 || size == 4 || size == 8, 1881 ("vie_size2mask: invalid size %d", size)); 1882 return (size2mask[size]); 1883 } 1884 1885 int 1886 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, 1887 struct seg_desc *desc, uint64_t offset, int length, int addrsize, 1888 int prot, uint64_t *gla) 1889 { 1890 uint64_t firstoff, low_limit, high_limit, segbase; 1891 int glasize, type; 1892 1893 KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, 1894 ("%s: invalid segment %d", __func__, seg)); 1895 KASSERT(length == 1 || length == 2 || length == 4 || length == 8, 1896 ("%s: invalid operand size %d", __func__, length)); 1897 KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, 1898 ("%s: invalid prot %#x", __func__, prot)); 1899 1900 firstoff = offset; 1901 if (cpu_mode == CPU_MODE_64BIT) { 1902 KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " 1903 "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); 1904 glasize = 8; 1905 } else { 1906 KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " 1907 "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); 1908 glasize = 4; 1909 /* 1910 * If the segment selector is loaded with a NULL selector 1911 * then the descriptor is unusable and attempting to use 1912 * it results in a #GP(0). 1913 */ 1914 if (SEG_DESC_UNUSABLE(desc->access)) 1915 return (-1); 1916 1917 /* 1918 * The processor generates a #NP exception when a segment 1919 * register is loaded with a selector that points to a 1920 * descriptor that is not present. If this was the case then 1921 * it would have been checked before the VM-exit. 1922 */ 1923 KASSERT(SEG_DESC_PRESENT(desc->access), 1924 ("segment %d not present: %#x", seg, desc->access)); 1925 1926 /* 1927 * The descriptor type must indicate a code/data segment. 1928 */ 1929 type = SEG_DESC_TYPE(desc->access); 1930 KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " 1931 "descriptor type %#x", seg, type)); 1932 1933 if (prot & PROT_READ) { 1934 /* #GP on a read access to a exec-only code segment */ 1935 if ((type & 0xA) == 0x8) 1936 return (-1); 1937 } 1938 1939 if (prot & PROT_WRITE) { 1940 /* 1941 * #GP on a write access to a code segment or a 1942 * read-only data segment. 1943 */ 1944 if (type & 0x8) /* code segment */ 1945 return (-1); 1946 1947 if ((type & 0xA) == 0) /* read-only data seg */ 1948 return (-1); 1949 } 1950 1951 /* 1952 * 'desc->limit' is fully expanded taking granularity into 1953 * account. 1954 */ 1955 if ((type & 0xC) == 0x4) { 1956 /* expand-down data segment */ 1957 low_limit = desc->limit + 1; 1958 high_limit = SEG_DESC_DEF32(desc->access) ? 1959 0xffffffff : 0xffff; 1960 } else { 1961 /* code segment or expand-up data segment */ 1962 low_limit = 0; 1963 high_limit = desc->limit; 1964 } 1965 1966 while (length > 0) { 1967 offset &= vie_size2mask(addrsize); 1968 if (offset < low_limit || offset > high_limit) 1969 return (-1); 1970 offset++; 1971 length--; 1972 } 1973 } 1974 1975 /* 1976 * In 64-bit mode all segments except %fs and %gs have a segment 1977 * base address of 0. 1978 */ 1979 if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && 1980 seg != VM_REG_GUEST_GS) { 1981 segbase = 0; 1982 } else { 1983 segbase = desc->base; 1984 } 1985 1986 /* 1987 * Truncate 'firstoff' to the effective address size before adding 1988 * it to the segment base. 1989 */ 1990 firstoff &= vie_size2mask(addrsize); 1991 *gla = (segbase + firstoff) & vie_size2mask(glasize); 1992 return (0); 1993 } 1994 1995 /* 1996 * Prepare a partially decoded vie for a 2nd attempt. 1997 */ 1998 void 1999 vie_restart(struct vie *vie) 2000 { 2001 _Static_assert( 2002 offsetof(struct vie, inst) < offsetof(struct vie, vie_startzero) && 2003 offsetof(struct vie, num_valid) < offsetof(struct vie, vie_startzero), 2004 "restart should not erase instruction length or contents"); 2005 2006 memset((char *)vie + offsetof(struct vie, vie_startzero), 0, 2007 sizeof(*vie) - offsetof(struct vie, vie_startzero)); 2008 2009 vie->base_register = VM_REG_LAST; 2010 vie->index_register = VM_REG_LAST; 2011 vie->segment_register = VM_REG_LAST; 2012 } 2013 2014 void 2015 vie_init(struct vie *vie, const char *inst_bytes, int inst_length) 2016 { 2017 KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE, 2018 ("%s: invalid instruction length (%d)", __func__, inst_length)); 2019 2020 vie_restart(vie); 2021 memset(vie->inst, 0, sizeof(vie->inst)); 2022 if (inst_length != 0) 2023 memcpy(vie->inst, inst_bytes, inst_length); 2024 vie->num_valid = inst_length; 2025 } 2026 2027 #ifdef _KERNEL 2028 static int 2029 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) 2030 { 2031 int error_code = 0; 2032 2033 if (pte & PG_V) 2034 error_code |= PGEX_P; 2035 if (prot & VM_PROT_WRITE) 2036 error_code |= PGEX_W; 2037 if (usermode) 2038 error_code |= PGEX_U; 2039 if (rsvd) 2040 error_code |= PGEX_RSV; 2041 if (prot & VM_PROT_EXECUTE) 2042 error_code |= PGEX_I; 2043 2044 return (error_code); 2045 } 2046 2047 static void 2048 ptp_release(void **cookie) 2049 { 2050 if (*cookie != NULL) { 2051 vm_gpa_release(*cookie); 2052 *cookie = NULL; 2053 } 2054 } 2055 2056 static void * 2057 ptp_hold(struct vcpu *vcpu, vm_paddr_t ptpphys, size_t len, void **cookie) 2058 { 2059 void *ptr; 2060 2061 ptp_release(cookie); 2062 ptr = vm_gpa_hold(vcpu, ptpphys, len, VM_PROT_RW, cookie); 2063 return (ptr); 2064 } 2065 2066 static int 2067 _vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging, 2068 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only) 2069 { 2070 int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; 2071 u_int retries; 2072 uint64_t *ptpbase, ptpphys, pte, pgsize; 2073 uint32_t *ptpbase32, pte32; 2074 void *cookie; 2075 2076 *guest_fault = 0; 2077 2078 usermode = (paging->cpl == 3 ? 1 : 0); 2079 writable = prot & VM_PROT_WRITE; 2080 cookie = NULL; 2081 retval = 0; 2082 retries = 0; 2083 restart: 2084 ptpphys = paging->cr3; /* root of the page tables */ 2085 ptp_release(&cookie); 2086 if (retries++ > 0) 2087 maybe_yield(); 2088 2089 if (vie_canonical_check(paging->cpu_mode, gla)) { 2090 /* 2091 * XXX assuming a non-stack reference otherwise a stack fault 2092 * should be generated. 2093 */ 2094 if (!check_only) 2095 vm_inject_gp(vcpu); 2096 goto fault; 2097 } 2098 2099 if (paging->paging_mode == PAGING_MODE_FLAT) { 2100 *gpa = gla; 2101 goto done; 2102 } 2103 2104 if (paging->paging_mode == PAGING_MODE_32) { 2105 nlevels = 2; 2106 while (--nlevels >= 0) { 2107 /* Zero out the lower 12 bits. */ 2108 ptpphys &= ~0xfff; 2109 2110 ptpbase32 = ptp_hold(vcpu, ptpphys, PAGE_SIZE, 2111 &cookie); 2112 2113 if (ptpbase32 == NULL) 2114 goto error; 2115 2116 ptpshift = PAGE_SHIFT + nlevels * 10; 2117 ptpindex = (gla >> ptpshift) & 0x3FF; 2118 pgsize = 1UL << ptpshift; 2119 2120 pte32 = ptpbase32[ptpindex]; 2121 2122 if ((pte32 & PG_V) == 0 || 2123 (usermode && (pte32 & PG_U) == 0) || 2124 (writable && (pte32 & PG_RW) == 0)) { 2125 if (!check_only) { 2126 pfcode = pf_error_code(usermode, prot, 0, 2127 pte32); 2128 vm_inject_pf(vcpu, pfcode, gla); 2129 } 2130 goto fault; 2131 } 2132 2133 /* 2134 * Emulate the x86 MMU's management of the accessed 2135 * and dirty flags. While the accessed flag is set 2136 * at every level of the page table, the dirty flag 2137 * is only set at the last level providing the guest 2138 * physical address. 2139 */ 2140 if (!check_only && (pte32 & PG_A) == 0) { 2141 if (atomic_cmpset_32(&ptpbase32[ptpindex], 2142 pte32, pte32 | PG_A) == 0) { 2143 goto restart; 2144 } 2145 } 2146 2147 /* XXX must be ignored if CR4.PSE=0 */ 2148 if (nlevels > 0 && (pte32 & PG_PS) != 0) 2149 break; 2150 2151 ptpphys = pte32; 2152 } 2153 2154 /* Set the dirty bit in the page table entry if necessary */ 2155 if (!check_only && writable && (pte32 & PG_M) == 0) { 2156 if (atomic_cmpset_32(&ptpbase32[ptpindex], 2157 pte32, pte32 | PG_M) == 0) { 2158 goto restart; 2159 } 2160 } 2161 2162 /* Zero out the lower 'ptpshift' bits */ 2163 pte32 >>= ptpshift; pte32 <<= ptpshift; 2164 *gpa = pte32 | (gla & (pgsize - 1)); 2165 goto done; 2166 } 2167 2168 if (paging->paging_mode == PAGING_MODE_PAE) { 2169 /* Zero out the lower 5 bits and the upper 32 bits */ 2170 ptpphys &= 0xffffffe0UL; 2171 2172 ptpbase = ptp_hold(vcpu, ptpphys, sizeof(*ptpbase) * 4, 2173 &cookie); 2174 if (ptpbase == NULL) 2175 goto error; 2176 2177 ptpindex = (gla >> 30) & 0x3; 2178 2179 pte = ptpbase[ptpindex]; 2180 2181 if ((pte & PG_V) == 0) { 2182 if (!check_only) { 2183 pfcode = pf_error_code(usermode, prot, 0, pte); 2184 vm_inject_pf(vcpu, pfcode, gla); 2185 } 2186 goto fault; 2187 } 2188 2189 ptpphys = pte; 2190 2191 nlevels = 2; 2192 } else if (paging->paging_mode == PAGING_MODE_64_LA57) { 2193 nlevels = 5; 2194 } else { 2195 nlevels = 4; 2196 } 2197 2198 while (--nlevels >= 0) { 2199 /* Zero out the lower 12 bits and the upper 12 bits */ 2200 ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; 2201 2202 ptpbase = ptp_hold(vcpu, ptpphys, PAGE_SIZE, &cookie); 2203 if (ptpbase == NULL) 2204 goto error; 2205 2206 ptpshift = PAGE_SHIFT + nlevels * 9; 2207 ptpindex = (gla >> ptpshift) & 0x1FF; 2208 pgsize = 1UL << ptpshift; 2209 2210 pte = ptpbase[ptpindex]; 2211 2212 if ((pte & PG_V) == 0 || 2213 (usermode && (pte & PG_U) == 0) || 2214 (writable && (pte & PG_RW) == 0)) { 2215 if (!check_only) { 2216 pfcode = pf_error_code(usermode, prot, 0, pte); 2217 vm_inject_pf(vcpu, pfcode, gla); 2218 } 2219 goto fault; 2220 } 2221 2222 /* Set the accessed bit in the page table entry */ 2223 if (!check_only && (pte & PG_A) == 0) { 2224 if (atomic_cmpset_64(&ptpbase[ptpindex], 2225 pte, pte | PG_A) == 0) { 2226 goto restart; 2227 } 2228 } 2229 2230 if (nlevels > 0 && (pte & PG_PS) != 0) { 2231 if (pgsize > 1 * GB) { 2232 if (!check_only) { 2233 pfcode = pf_error_code(usermode, prot, 1, 2234 pte); 2235 vm_inject_pf(vcpu, pfcode, gla); 2236 } 2237 goto fault; 2238 } 2239 break; 2240 } 2241 2242 ptpphys = pte; 2243 } 2244 2245 /* Set the dirty bit in the page table entry if necessary */ 2246 if (!check_only && writable && (pte & PG_M) == 0) { 2247 if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) 2248 goto restart; 2249 } 2250 2251 /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ 2252 pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; 2253 *gpa = pte | (gla & (pgsize - 1)); 2254 done: 2255 ptp_release(&cookie); 2256 KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d", 2257 __func__, retval)); 2258 return (retval); 2259 error: 2260 retval = EFAULT; 2261 goto done; 2262 fault: 2263 *guest_fault = 1; 2264 goto done; 2265 } 2266 2267 int 2268 vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging, 2269 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) 2270 { 2271 2272 return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault, 2273 false)); 2274 } 2275 2276 int 2277 vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, 2278 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) 2279 { 2280 2281 return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault, 2282 true)); 2283 } 2284 2285 int 2286 vmm_fetch_instruction(struct vcpu *vcpu, struct vm_guest_paging *paging, 2287 uint64_t rip, int inst_length, struct vie *vie, int *faultptr) 2288 { 2289 struct vm_copyinfo copyinfo[2]; 2290 int error, prot; 2291 2292 if (inst_length > VIE_INST_SIZE) 2293 panic("vmm_fetch_instruction: invalid length %d", inst_length); 2294 2295 prot = PROT_READ | PROT_EXEC; 2296 error = vm_copy_setup(vcpu, paging, rip, inst_length, prot, 2297 copyinfo, nitems(copyinfo), faultptr); 2298 if (error || *faultptr) 2299 return (error); 2300 2301 vm_copyin(copyinfo, vie->inst, inst_length); 2302 vm_copy_teardown(copyinfo, nitems(copyinfo)); 2303 vie->num_valid = inst_length; 2304 return (0); 2305 } 2306 #endif /* _KERNEL */ 2307 2308 static int 2309 vie_peek(struct vie *vie, uint8_t *x) 2310 { 2311 2312 if (vie->num_processed < vie->num_valid) { 2313 *x = vie->inst[vie->num_processed]; 2314 return (0); 2315 } else 2316 return (-1); 2317 } 2318 2319 static void 2320 vie_advance(struct vie *vie) 2321 { 2322 2323 vie->num_processed++; 2324 } 2325 2326 static bool 2327 segment_override(uint8_t x, int *seg) 2328 { 2329 2330 switch (x) { 2331 case 0x2E: 2332 *seg = VM_REG_GUEST_CS; 2333 break; 2334 case 0x36: 2335 *seg = VM_REG_GUEST_SS; 2336 break; 2337 case 0x3E: 2338 *seg = VM_REG_GUEST_DS; 2339 break; 2340 case 0x26: 2341 *seg = VM_REG_GUEST_ES; 2342 break; 2343 case 0x64: 2344 *seg = VM_REG_GUEST_FS; 2345 break; 2346 case 0x65: 2347 *seg = VM_REG_GUEST_GS; 2348 break; 2349 default: 2350 return (false); 2351 } 2352 return (true); 2353 } 2354 2355 static int 2356 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) 2357 { 2358 uint8_t x; 2359 2360 while (1) { 2361 if (vie_peek(vie, &x)) 2362 return (-1); 2363 2364 if (x == 0x66) 2365 vie->opsize_override = 1; 2366 else if (x == 0x67) 2367 vie->addrsize_override = 1; 2368 else if (x == 0xF3) 2369 vie->repz_present = 1; 2370 else if (x == 0xF2) 2371 vie->repnz_present = 1; 2372 else if (segment_override(x, &vie->segment_register)) 2373 vie->segment_override = 1; 2374 else 2375 break; 2376 2377 vie_advance(vie); 2378 } 2379 2380 /* 2381 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: 2382 * - Only one REX prefix is allowed per instruction. 2383 * - The REX prefix must immediately precede the opcode byte or the 2384 * escape opcode byte. 2385 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) 2386 * the mandatory prefix must come before the REX prefix. 2387 */ 2388 if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { 2389 vie->rex_present = 1; 2390 vie->rex_w = x & 0x8 ? 1 : 0; 2391 vie->rex_r = x & 0x4 ? 1 : 0; 2392 vie->rex_x = x & 0x2 ? 1 : 0; 2393 vie->rex_b = x & 0x1 ? 1 : 0; 2394 vie_advance(vie); 2395 } 2396 2397 /* 2398 * § 2.3.5, "The VEX Prefix", SDM Vol 2. 2399 */ 2400 if ((cpu_mode == CPU_MODE_64BIT || cpu_mode == CPU_MODE_COMPATIBILITY) 2401 && x == 0xC4) { 2402 const struct vie_op *optab; 2403 2404 /* 3-byte VEX prefix. */ 2405 vie->vex_present = 1; 2406 2407 vie_advance(vie); 2408 if (vie_peek(vie, &x)) 2409 return (-1); 2410 2411 /* 2412 * 2nd byte: [R', X', B', mmmmm[4:0]]. Bits are inverted 2413 * relative to REX encoding. 2414 */ 2415 vie->rex_r = x & 0x80 ? 0 : 1; 2416 vie->rex_x = x & 0x40 ? 0 : 1; 2417 vie->rex_b = x & 0x20 ? 0 : 1; 2418 2419 switch (x & 0x1F) { 2420 case 0x2: 2421 /* 0F 38. */ 2422 optab = three_byte_opcodes_0f38; 2423 break; 2424 case 0x1: 2425 /* 0F class - nothing handled here yet. */ 2426 /* FALLTHROUGH */ 2427 case 0x3: 2428 /* 0F 3A class - nothing handled here yet. */ 2429 /* FALLTHROUGH */ 2430 default: 2431 /* Reserved (#UD). */ 2432 return (-1); 2433 } 2434 2435 vie_advance(vie); 2436 if (vie_peek(vie, &x)) 2437 return (-1); 2438 2439 /* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */ 2440 vie->rex_w = x & 0x80 ? 1 : 0; 2441 2442 vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3); 2443 vie->vex_l = !!(x & 0x4); 2444 vie->vex_pp = (x & 0x3); 2445 2446 /* PP: 1=66 2=F3 3=F2 prefixes. */ 2447 switch (vie->vex_pp) { 2448 case 0x1: 2449 vie->opsize_override = 1; 2450 break; 2451 case 0x2: 2452 vie->repz_present = 1; 2453 break; 2454 case 0x3: 2455 vie->repnz_present = 1; 2456 break; 2457 } 2458 2459 vie_advance(vie); 2460 2461 /* Opcode, sans literal prefix prefix. */ 2462 if (vie_peek(vie, &x)) 2463 return (-1); 2464 2465 vie->op = optab[x]; 2466 if (vie->op.op_type == VIE_OP_TYPE_NONE) 2467 return (-1); 2468 2469 vie_advance(vie); 2470 } 2471 2472 /* 2473 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 2474 */ 2475 if (cpu_mode == CPU_MODE_64BIT) { 2476 /* 2477 * Default address size is 64-bits and default operand size 2478 * is 32-bits. 2479 */ 2480 vie->addrsize = vie->addrsize_override ? 4 : 8; 2481 if (vie->rex_w) 2482 vie->opsize = 8; 2483 else if (vie->opsize_override) 2484 vie->opsize = 2; 2485 else 2486 vie->opsize = 4; 2487 } else if (cs_d) { 2488 /* Default address and operand sizes are 32-bits */ 2489 vie->addrsize = vie->addrsize_override ? 2 : 4; 2490 vie->opsize = vie->opsize_override ? 2 : 4; 2491 } else { 2492 /* Default address and operand sizes are 16-bits */ 2493 vie->addrsize = vie->addrsize_override ? 4 : 2; 2494 vie->opsize = vie->opsize_override ? 4 : 2; 2495 } 2496 return (0); 2497 } 2498 2499 static int 2500 decode_two_byte_opcode(struct vie *vie) 2501 { 2502 uint8_t x; 2503 2504 if (vie_peek(vie, &x)) 2505 return (-1); 2506 2507 vie->op = two_byte_opcodes[x]; 2508 2509 if (vie->op.op_type == VIE_OP_TYPE_NONE) 2510 return (-1); 2511 2512 vie_advance(vie); 2513 return (0); 2514 } 2515 2516 static int 2517 decode_opcode(struct vie *vie) 2518 { 2519 uint8_t x; 2520 2521 if (vie_peek(vie, &x)) 2522 return (-1); 2523 2524 /* Already did this via VEX prefix. */ 2525 if (vie->op.op_type != VIE_OP_TYPE_NONE) 2526 return (0); 2527 2528 vie->op = one_byte_opcodes[x]; 2529 2530 if (vie->op.op_type == VIE_OP_TYPE_NONE) 2531 return (-1); 2532 2533 vie_advance(vie); 2534 2535 if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) 2536 return (decode_two_byte_opcode(vie)); 2537 2538 return (0); 2539 } 2540 2541 static int 2542 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) 2543 { 2544 uint8_t x; 2545 2546 if (vie->op.op_flags & VIE_OP_F_NO_MODRM) 2547 return (0); 2548 2549 if (cpu_mode == CPU_MODE_REAL) 2550 return (-1); 2551 2552 if (vie_peek(vie, &x)) 2553 return (-1); 2554 2555 vie->mod = (x >> 6) & 0x3; 2556 vie->rm = (x >> 0) & 0x7; 2557 vie->reg = (x >> 3) & 0x7; 2558 2559 /* 2560 * A direct addressing mode makes no sense in the context of an EPT 2561 * fault. There has to be a memory access involved to cause the 2562 * EPT fault. 2563 */ 2564 if (vie->mod == VIE_MOD_DIRECT) 2565 return (-1); 2566 2567 if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || 2568 (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { 2569 /* 2570 * Table 2-5: Special Cases of REX Encodings 2571 * 2572 * mod=0, r/m=5 is used in the compatibility mode to 2573 * indicate a disp32 without a base register. 2574 * 2575 * mod!=3, r/m=4 is used in the compatibility mode to 2576 * indicate that the SIB byte is present. 2577 * 2578 * The 'b' bit in the REX prefix is don't care in 2579 * this case. 2580 */ 2581 } else { 2582 vie->rm |= (vie->rex_b << 3); 2583 } 2584 2585 vie->reg |= (vie->rex_r << 3); 2586 2587 /* SIB */ 2588 if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) 2589 goto done; 2590 2591 vie->base_register = gpr_map[vie->rm]; 2592 2593 switch (vie->mod) { 2594 case VIE_MOD_INDIRECT_DISP8: 2595 vie->disp_bytes = 1; 2596 break; 2597 case VIE_MOD_INDIRECT_DISP32: 2598 vie->disp_bytes = 4; 2599 break; 2600 case VIE_MOD_INDIRECT: 2601 if (vie->rm == VIE_RM_DISP32) { 2602 vie->disp_bytes = 4; 2603 /* 2604 * Table 2-7. RIP-Relative Addressing 2605 * 2606 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32 2607 * whereas in compatibility mode it just implies disp32. 2608 */ 2609 2610 if (cpu_mode == CPU_MODE_64BIT) 2611 vie->base_register = VM_REG_GUEST_RIP; 2612 else 2613 vie->base_register = VM_REG_LAST; 2614 } 2615 break; 2616 } 2617 2618 done: 2619 vie_advance(vie); 2620 2621 return (0); 2622 } 2623 2624 static int 2625 decode_sib(struct vie *vie) 2626 { 2627 uint8_t x; 2628 2629 /* Proceed only if SIB byte is present */ 2630 if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) 2631 return (0); 2632 2633 if (vie_peek(vie, &x)) 2634 return (-1); 2635 2636 /* De-construct the SIB byte */ 2637 vie->ss = (x >> 6) & 0x3; 2638 vie->index = (x >> 3) & 0x7; 2639 vie->base = (x >> 0) & 0x7; 2640 2641 /* Apply the REX prefix modifiers */ 2642 vie->index |= vie->rex_x << 3; 2643 vie->base |= vie->rex_b << 3; 2644 2645 switch (vie->mod) { 2646 case VIE_MOD_INDIRECT_DISP8: 2647 vie->disp_bytes = 1; 2648 break; 2649 case VIE_MOD_INDIRECT_DISP32: 2650 vie->disp_bytes = 4; 2651 break; 2652 } 2653 2654 if (vie->mod == VIE_MOD_INDIRECT && 2655 (vie->base == 5 || vie->base == 13)) { 2656 /* 2657 * Special case when base register is unused if mod = 0 2658 * and base = %rbp or %r13. 2659 * 2660 * Documented in: 2661 * Table 2-3: 32-bit Addressing Forms with the SIB Byte 2662 * Table 2-5: Special Cases of REX Encodings 2663 */ 2664 vie->disp_bytes = 4; 2665 } else { 2666 vie->base_register = gpr_map[vie->base]; 2667 } 2668 2669 /* 2670 * All encodings of 'index' are valid except for %rsp (4). 2671 * 2672 * Documented in: 2673 * Table 2-3: 32-bit Addressing Forms with the SIB Byte 2674 * Table 2-5: Special Cases of REX Encodings 2675 */ 2676 if (vie->index != 4) 2677 vie->index_register = gpr_map[vie->index]; 2678 2679 /* 'scale' makes sense only in the context of an index register */ 2680 if (vie->index_register < VM_REG_LAST) 2681 vie->scale = 1 << vie->ss; 2682 2683 vie_advance(vie); 2684 2685 return (0); 2686 } 2687 2688 static int 2689 decode_displacement(struct vie *vie) 2690 { 2691 int n, i; 2692 uint8_t x; 2693 2694 union { 2695 char buf[4]; 2696 int8_t signed8; 2697 int32_t signed32; 2698 } u; 2699 2700 if ((n = vie->disp_bytes) == 0) 2701 return (0); 2702 2703 if (n != 1 && n != 4) 2704 panic("decode_displacement: invalid disp_bytes %d", n); 2705 2706 for (i = 0; i < n; i++) { 2707 if (vie_peek(vie, &x)) 2708 return (-1); 2709 2710 u.buf[i] = x; 2711 vie_advance(vie); 2712 } 2713 2714 if (n == 1) 2715 vie->displacement = u.signed8; /* sign-extended */ 2716 else 2717 vie->displacement = u.signed32; /* sign-extended */ 2718 2719 return (0); 2720 } 2721 2722 static int 2723 decode_immediate(struct vie *vie) 2724 { 2725 int i, n; 2726 uint8_t x; 2727 union { 2728 char buf[4]; 2729 int8_t signed8; 2730 int16_t signed16; 2731 int32_t signed32; 2732 } u; 2733 2734 /* Figure out immediate operand size (if any) */ 2735 if (vie->op.op_flags & VIE_OP_F_IMM) { 2736 /* 2737 * Section 2.2.1.5 "Immediates", Intel SDM: 2738 * In 64-bit mode the typical size of immediate operands 2739 * remains 32-bits. When the operand size if 64-bits, the 2740 * processor sign-extends all immediates to 64-bits prior 2741 * to their use. 2742 */ 2743 if (vie->opsize == 4 || vie->opsize == 8) 2744 vie->imm_bytes = 4; 2745 else 2746 vie->imm_bytes = 2; 2747 } else if (vie->op.op_flags & VIE_OP_F_IMM8) { 2748 vie->imm_bytes = 1; 2749 } 2750 2751 if ((n = vie->imm_bytes) == 0) 2752 return (0); 2753 2754 KASSERT(n == 1 || n == 2 || n == 4, 2755 ("%s: invalid number of immediate bytes: %d", __func__, n)); 2756 2757 for (i = 0; i < n; i++) { 2758 if (vie_peek(vie, &x)) 2759 return (-1); 2760 2761 u.buf[i] = x; 2762 vie_advance(vie); 2763 } 2764 2765 /* sign-extend the immediate value before use */ 2766 if (n == 1) 2767 vie->immediate = u.signed8; 2768 else if (n == 2) 2769 vie->immediate = u.signed16; 2770 else 2771 vie->immediate = u.signed32; 2772 2773 return (0); 2774 } 2775 2776 static int 2777 decode_moffset(struct vie *vie) 2778 { 2779 int i, n; 2780 uint8_t x; 2781 union { 2782 char buf[8]; 2783 uint64_t u64; 2784 } u; 2785 2786 if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) 2787 return (0); 2788 2789 /* 2790 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: 2791 * The memory offset size follows the address-size of the instruction. 2792 */ 2793 n = vie->addrsize; 2794 KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); 2795 2796 u.u64 = 0; 2797 for (i = 0; i < n; i++) { 2798 if (vie_peek(vie, &x)) 2799 return (-1); 2800 2801 u.buf[i] = x; 2802 vie_advance(vie); 2803 } 2804 vie->displacement = u.u64; 2805 return (0); 2806 } 2807 2808 #ifdef _KERNEL 2809 /* 2810 * Verify that the 'guest linear address' provided as collateral of the nested 2811 * page table fault matches with our instruction decoding. 2812 */ 2813 static int 2814 verify_gla(struct vcpu *vcpu, uint64_t gla, struct vie *vie, 2815 enum vm_cpu_mode cpu_mode) 2816 { 2817 int error; 2818 uint64_t base, segbase, idx, gla2; 2819 enum vm_reg_name seg; 2820 struct seg_desc desc; 2821 2822 /* Skip 'gla' verification */ 2823 if (gla == VIE_INVALID_GLA) 2824 return (0); 2825 2826 base = 0; 2827 if (vie->base_register != VM_REG_LAST) { 2828 error = vm_get_register(vcpu, vie->base_register, &base); 2829 if (error) { 2830 printf("verify_gla: error %d getting base reg %d\n", 2831 error, vie->base_register); 2832 return (-1); 2833 } 2834 2835 /* 2836 * RIP-relative addressing starts from the following 2837 * instruction 2838 */ 2839 if (vie->base_register == VM_REG_GUEST_RIP) 2840 base += vie->num_processed; 2841 } 2842 2843 idx = 0; 2844 if (vie->index_register != VM_REG_LAST) { 2845 error = vm_get_register(vcpu, vie->index_register, &idx); 2846 if (error) { 2847 printf("verify_gla: error %d getting index reg %d\n", 2848 error, vie->index_register); 2849 return (-1); 2850 } 2851 } 2852 2853 /* 2854 * From "Specifying a Segment Selector", Intel SDM, Vol 1 2855 * 2856 * In 64-bit mode, segmentation is generally (but not 2857 * completely) disabled. The exceptions are the FS and GS 2858 * segments. 2859 * 2860 * In legacy IA-32 mode, when the ESP or EBP register is used 2861 * as the base, the SS segment is the default segment. For 2862 * other data references, except when relative to stack or 2863 * string destination the DS segment is the default. These 2864 * can be overridden to allow other segments to be accessed. 2865 */ 2866 if (vie->segment_override) 2867 seg = vie->segment_register; 2868 else if (vie->base_register == VM_REG_GUEST_RSP || 2869 vie->base_register == VM_REG_GUEST_RBP) 2870 seg = VM_REG_GUEST_SS; 2871 else 2872 seg = VM_REG_GUEST_DS; 2873 if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && 2874 seg != VM_REG_GUEST_GS) { 2875 segbase = 0; 2876 } else { 2877 error = vm_get_seg_desc(vcpu, seg, &desc); 2878 if (error) { 2879 printf("verify_gla: error %d getting segment" 2880 " descriptor %d", error, 2881 vie->segment_register); 2882 return (-1); 2883 } 2884 segbase = desc.base; 2885 } 2886 2887 gla2 = segbase + base + vie->scale * idx + vie->displacement; 2888 gla2 &= size2mask[vie->addrsize]; 2889 if (gla != gla2) { 2890 printf("verify_gla mismatch: segbase(0x%0lx)" 2891 "base(0x%0lx), scale(%d), index(0x%0lx), " 2892 "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", 2893 segbase, base, vie->scale, idx, vie->displacement, 2894 gla, gla2); 2895 return (-1); 2896 } 2897 2898 return (0); 2899 } 2900 #endif /* _KERNEL */ 2901 2902 int 2903 #ifdef _KERNEL 2904 vmm_decode_instruction(struct vcpu *vcpu, uint64_t gla, 2905 enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) 2906 #else 2907 vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) 2908 #endif 2909 { 2910 2911 if (decode_prefixes(vie, cpu_mode, cs_d)) 2912 return (-1); 2913 2914 if (decode_opcode(vie)) 2915 return (-1); 2916 2917 if (decode_modrm(vie, cpu_mode)) 2918 return (-1); 2919 2920 if (decode_sib(vie)) 2921 return (-1); 2922 2923 if (decode_displacement(vie)) 2924 return (-1); 2925 2926 if (decode_immediate(vie)) 2927 return (-1); 2928 2929 if (decode_moffset(vie)) 2930 return (-1); 2931 2932 #ifdef _KERNEL 2933 if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) { 2934 if (verify_gla(vcpu, gla, vie, cpu_mode)) 2935 return (-1); 2936 } 2937 #endif 2938 2939 vie->decoded = 1; /* success */ 2940 2941 return (0); 2942 } 2943