// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/objtool.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>
#include <asm/msr.h>

#include "x86.h"
#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "posted_intr.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
#include "smm.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap	(vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap	(vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force sync to shadow VMCS because
	 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
	 * fields and thus must be synced.
	 */
	if (nested_vmx_is_evmptr12_set(to_vmx(vcpu)))
		to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;

	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == INVALID_GPA &&
	    !nested_vmx_is_evmptr12_valid(vmx))
		return nested_vmx_failInvalid(vcpu);

	return nested_vmx_failValid(vcpu, vm_instruction_error);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: don't just reset the guest here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}
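
/*
 * Illustrative sketch (not used by KVM, values are made up): the low 32 bits
 * of a VMX control capability MSR hold the "allowed-0" settings (bits that
 * must be 1) and the high 32 bits hold the "allowed-1" settings (bits that
 * may be 1), so e.g.
 *
 *	vmx_control_msr(0x00000016, 0xfff9fffe) == 0xfff9fffe00000016ULL
 *
 * and, roughly speaking, a control value passes
 * vmx_control_verify(control, low, high) only if every bit set in 'low' is
 * also set in 'control' and 'control' sets no bit that is clear in 'high'.
 */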

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_KVM_HYPERV
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map);
	vmx->nested.hv_evmcs = NULL;
	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;

	if (hv_vcpu) {
		hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
		hv_vcpu->nested.vm_id = 0;
		hv_vcpu->nested.vp_id = 0;
	}
#endif
}

static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
{
#ifdef CONFIG_KVM_HYPERV
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	/*
	 * When Enlightened VMEntry is enabled on the calling CPU, we treat the
	 * memory area pointed to by vmptr as an Enlightened VMCS (as there's
	 * no good way to distinguish it from a VMCS12) and we must not corrupt
	 * it by writing to the non-existent 'launch_state' field. The area
	 * doesn't have to be the currently active EVMCS on the calling CPU and
	 * there's nothing KVM has to do to transition it from 'active' to
	 * 'non-active' state. It is possible that the area will stay mapped as
	 * vmx->nested.hv_evmcs but this shouldn't be a problem.
	 */
	if (!guest_cpu_cap_has_evmcs(vcpu) ||
	    !evmptr_is_valid(nested_get_evmptr(vcpu)))
		return false;

	if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr)
		nested_release_evmcs(vcpu);

	return true;
#else
	return false;
#endif
}

static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu, prev);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;

	/*
	 * All lazily updated registers will be reloaded from VMCS12 on both
	 * vmentry and vmexit.
	 */
	vcpu->arch.regs_dirty = 0;
}

static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map);
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map);
	vmx->nested.pi_desc = NULL;
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
		vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	vmx->nested.vmxon_ptr = INVALID_GPA;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = INVALID_GPA;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;

	nested_put_vmcs12_pages(vcpu);

	kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vcpu_put(vcpu);
}

#define EPTP_PA_MASK	GENMASK_ULL(51, 12)

static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
{
	return VALID_PAGE(root_hpa) &&
	       ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
}

static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
				       gpa_t addr)
{
	unsigned long roots = 0;
	uint i;
	struct kvm_mmu_root_info *cached_root;

	WARN_ON_ONCE(!mmu_is_nested(vcpu));

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
		cached_root = &vcpu->arch.mmu->prev_roots[i];

		if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
					    eptp))
			roots |= KVM_MMU_ROOT_PREVIOUS(i);
	}
	if (roots)
		kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
}
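
/*
 * For illustration only (example values are made up): an EPTP packs the
 * 4KB-aligned physical address of the EPT PML4 table (the "EP4TA") into bits
 * 51:12, with attribute bits (memory type, page-walk length, A/D enable, ...)
 * in the low bits.  E.g. 0x0000000123456058 and 0x000000012345601e share
 * EP4TA 0x123456000, so nested_ept_root_matches() treats them as the same
 * root even though their attribute bits differ.
 */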

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long exit_qualification;
	u32 vm_exit_reason;

	if (vmx->nested.pml_full) {
		vm_exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;

		/*
		 * It should be impossible to trigger a nested PML Full VM-Exit
		 * for anything other than an EPT Violation from L2.  KVM *can*
		 * trigger nEPT page fault injection in response to an EPT
		 * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT
		 * tables also changed, but KVM should not treat EPT Misconfig
		 * VM-Exits as writes.
		 */
		WARN_ON_ONCE(vmx->exit_reason.basic != EXIT_REASON_EPT_VIOLATION);

		/*
		 * PML Full and EPT Violation VM-Exits both use bit 12 to report
		 * "NMI unblocking due to IRET", i.e. the bit can be propagated
		 * as-is from the original EXIT_QUALIFICATION.
		 */
		exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI;
	} else {
		if (fault->error_code & PFERR_RSVD_MASK) {
			vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
			exit_qualification = 0;
		} else {
			exit_qualification = fault->exit_qualification;
			exit_qualification |= vmx_get_exit_qual(vcpu) &
					      (EPT_VIOLATION_GVA_IS_VALID |
					       EPT_VIOLATION_GVA_TRANSLATED);
			vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
		}

		/*
		 * Although the caller (kvm_inject_emulated_page_fault) would
		 * have already synced the faulting address in the shadow EPT
		 * tables for the current EPTP12, we also need to sync it for
		 * any other cached EPTP02s based on the same EP4TA, since the
		 * TLB associates mappings to the EP4TA rather than the full EPTP.
		 */
		nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
					   fault->address);
	}

	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
	int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);

	kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_eptp(vcpu));
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	nested_ept_new_eptp(vcpu);
	vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}
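
/*
 * Worked example (illustrative values): with PF_VECTOR set in the exception
 * bitmap, page_fault_error_code_mask = 0x1 and page_fault_error_code_match =
 * 0x1, a #PF with error code 0x3 gives (0x3 & 0x1) == 0x1, so "inequality"
 * is false and inequality ^ bit is true, i.e. the #PF is forwarded to L1.
 * A #PF with error code 0x2 flips "inequality" and is handled by L0 instead.
 * With the PF bit clear in the exception bitmap, the mask/match test selects
 * the opposite subset of #PFs to forward.
 */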

static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
					   u32 error_code)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	/*
	 * Drop bits 31:16 of the error code when performing the #PF mask+match
	 * check.  All VMCS fields involved are 32 bits, but Intel CPUs never
	 * set bits 31:16 and VMX disallows setting bits 31:16 in the injected
	 * error code.  Including the to-be-dropped bits in the check might
	 * result in an "impossible" or missed exit from L1's perspective.
	 */
	if (vector == PF_VECTOR)
		return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);

	return (vmcs12->exception_bitmap & (1u << vector));
}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	return 0;
}

/*
 * For x2APIC MSRs, ignore the vmcs01 bitmap.  L1 can enable x2APIC without L1
 * itself utilizing x2APIC.  All MSRs were previously set to be intercepted,
 * so only the "disable intercept" case needs to be handled.
 */
static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
							unsigned long *msr_bitmap_l0,
							u32 msr, int type)
{
	if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);

	if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
}

static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}
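
/*
 * Layout note, for illustration: the 4KB VMX MSR bitmap holds the read
 * bitmap for MSRs 0x00000000-0x00001fff in its first 1KB and the
 * corresponding write bitmap 2KB (0x800 bytes) later, which is why the loop
 * above sets both msr_bitmap[word] and msr_bitmap[word + 0x800/sizeof(long)]:
 * one store covers the read intercepts and one covers the write intercepts
 * for the same 64 (or 32) x2APIC MSRs.
 */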

#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw)					\
static inline									\
void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx,			\
					 unsigned long *msr_bitmap_l1,		\
					 unsigned long *msr_bitmap_l0, u32 msr)	\
{										\
	if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) ||		\
	    vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr))			\
		vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
	else									\
		vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
}
BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
BUILD_NVMX_MSR_INTERCEPT_HELPER(write)

static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
						    unsigned long *msr_bitmap_l1,
						    unsigned long *msr_bitmap_l0,
						    u32 msr, int types)
{
	if (types & MSR_TYPE_R)
		nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
						  msr_bitmap_l0, msr);
	if (types & MSR_TYPE_W)
		nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
						   msr_bitmap_l0, msr);
}

/*
 * Merge L0's and L1's MSR bitmaps; return false to indicate that we do not
 * use the hardware bitmap.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
	struct kvm_host_map map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	/*
	 * MSR bitmap update can be skipped when:
	 * - MSR bitmap for L1 hasn't changed.
	 * - Nested hypervisor (L1) is attempting to launch the same L2 as
	 *   before.
	 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
	 *   and tells KVM (L0) there were no changes in MSR bitmap for L2.
	 */
	if (!vmx->nested.force_msr_bitmap_recalc) {
		struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);

		if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap &&
		    evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
			return true;
	}

	if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map.hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively toggle those relevant to L2.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff; it just lets the processor take the value
			 * from the virtual-APIC page, so take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_x2apic_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/*
	 * Always check vmcs01's bitmap to honor userspace MSR filters and any
	 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
	 */
#ifdef CONFIG_X86_64
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
#endif
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PRED_CMD, MSR_TYPE_W);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_FLUSH_CMD, MSR_TYPE_W);

	kvm_vcpu_unmap(vcpu, &map);

	vmx->nested.force_msr_bitmap_recalc = false;

	return true;
}

static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			      VMCS12_SIZE);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			       VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT.
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv; the descriptor address
	 * has already been checked in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	    (CC(!nested_cpu_has_vid(vmcs12)) ||
	     CC(!nested_exit_intr_ack_set(vcpu)) ||
	     CC((vmcs12->posted_intr_nv & 0xff00)) ||
	     CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	if (count == 0)
		return 0;

	if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
	    !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
		return -EINVAL;

	return 0;
}
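
/*
 * For reference (illustrative; see the struct definition for the canonical
 * layout): each atomic switch list entry is a 16-byte struct vmx_msr_entry
 * { u32 index; u32 reserved; u64 value; }, which is why the check above
 * requires 'addr' to be 16-byte aligned and why the last byte of the list
 * sits at addr + count * sizeof(struct vmx_msr_entry) - 1.
 */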

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							 struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							  struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}
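
/*
 * Worked example, assuming VMX_MISC_MSR_LIST_MULTIPLIER is 512 (per the SDM's
 * "512 * (N + 1)" rule for the recommended maximum MSR-list length): if the
 * MSR-list field of IA32_VMX_MISC reads back as 0, the virtual CPU advertises
 * support for (0 + 1) * 512 = 512 entries per atomic switch list.
 */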

/*
 * Load guest's/host's msr at nested entry/exit.
 * return 0 for success, entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity.  To maintain compatibility with hardware as
 * much as possible, process all valid entries before failing rather than
 * prechecking for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
	return i + 1;
}

static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
	 */
	if (msr_index == MSR_IA32_TSC) {
		int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
						    MSR_IA32_TSC);

		if (i >= 0) {
			u64 val = vmx->msr_autostore.guest.val[i].value;

			*data = kvm_read_l1_tsc(vcpu, val);
			return true;
		}
	}

	if (kvm_get_msr_with_filter(vcpu, msr_index, data)) {
		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
				     msr_index);
		return false;
	}
	return true;
}

static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
				     struct vmx_msr_entry *e)
{
	if (kvm_vcpu_read_guest(vcpu,
				gpa + i * sizeof(*e),
				e, 2 * sizeof(u32))) {
		pr_debug_ratelimited(
			"%s cannot read MSR entry (%u, 0x%08llx)\n",
			__func__, i, gpa + i * sizeof(*e));
		return false;
	}
	if (nested_vmx_store_msr_check(vcpu, e)) {
		pr_debug_ratelimited(
			"%s check failed (%u, 0x%x, 0x%x)\n",
			__func__, i, e->index, e->reserved);
		return false;
	}
	return true;
}

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			return -EINVAL;

		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return -EINVAL;

		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
			return -EINVAL;

		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
						offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}

static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 count = vmcs12->vm_exit_msr_store_count;
	u64 gpa = vmcs12->vm_exit_msr_store_addr;
	struct vmx_msr_entry e;
	u32 i;

	for (i = 0; i < count; i++) {
		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return false;

		if (e.index == msr_index)
			return true;
	}
	return false;
}

static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
					   u32 msr_index)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
	bool in_vmcs12_store_list;
	int msr_autostore_slot;
	bool in_autostore_list;
	int last;

	msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
	in_autostore_list = msr_autostore_slot >= 0;
	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);

	if (in_vmcs12_store_list && !in_autostore_list) {
		if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
			/*
			 * Emulated VMEntry does not fail here.  Instead a less
			 * accurate value will be returned by
			 * nested_vmx_get_vmexit_msr_value() by reading KVM's
			 * internal MSR state instead of reading the value from
			 * the vmcs02 VMExit MSR-store area.
			 */
			pr_warn_ratelimited(
				"Not enough msr entries in msr_autostore. Can't add msr %x\n",
				msr_index);
			return;
		}
		last = autostore->nr++;
		autostore->val[last].index = msr_index;
	} else if (!in_vmcs12_store_list && in_autostore_list) {
		last = --autostore->nr;
		autostore->val[msr_autostore_slot] = autostore->val[last];
	}
}

/*
 * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we are
 * emulating VM-Entry into a guest with EPT enabled.  On failure, the expected
 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
 * @entry_failure_code.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_ept, bool reload_pdptrs,
			       enum vm_entry_failure_code *entry_failure_code)
{
	if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/*
	 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
	 * must not be dereferenced.
	 */
	if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
	    CC(!load_pdptrs(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_PDPTE;
		return -EINVAL;
	}

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);

	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
	kvm_init_mmu(vcpu);

	if (!nested_ept)
		kvm_mmu_new_pgd(vcpu, cr3);

	return 0;
}

/*
 * Returns true if KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently from TLB entries populated by L1.
 *
 * If L0 uses EPT, L1 and L2 run with different EPTPs because guest_mode is
 * part of kvm_mmu_page_role.  Thus, TLB entries are tagged with different
 * EPTPs.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged with
 * different VPIDs (L1 entries are tagged with vmx->vpid while L2 entries
 * are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return enable_ept ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
					    struct vmcs12 *vmcs12,
					    bool is_vmenter)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Handle pending Hyper-V TLB flush requests */
	kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);

	/*
	 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
	 * same VPID as the host, and so architecturally, linear and combined
	 * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit.  KVM
	 * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2,
	 * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01.  This
	 * is required if VPID is disabled in KVM, as a TLB flush (there are no
	 * VPIDs) still occurs from L1's perspective, and KVM may need to
	 * synchronize the MMU in response to the guest TLB flush.
	 *
	 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
	 * EPT is a special snowflake, as guest-physical mappings aren't
	 * flushed on VPID invalidations, including VM-Enter or VM-Exit with
	 * VPID disabled.  As a result, KVM _never_ needs to sync nEPT
	 * entries on VM-Enter because L1 can't rely on VM-Enter to flush
	 * those mappings.
	 */
	if (!nested_cpu_has_vpid(vmcs12)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/* L2 should never have a VPID if VPID is disabled. */
	WARN_ON(!enable_vpid);

	/*
	 * VPID is enabled and in use by vmcs12.  If vpid12 is changing, then
	 * emulate a guest TLB flush as KVM does not track vpid12 history nor
	 * is the VPID incorporated into the MMU context.  I.e. KVM must assume
	 * that the new vpid12 has never been used and thus represents a new
	 * guest ASID that cannot have entries in the TLB.
	 */
	if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/*
	 * If VPID is enabled, used by vmcs12, and vpid12 is not changing but
	 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
	 * KVM was unable to allocate a VPID for L2, flush the current context
	 * as the effective ASID is common to both L1 and L2.
	 */
	if (!nested_has_guest_tlb_tag(vcpu))
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}

static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}
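
/*
 * Illustrative example: is_bitwise_subset(0xf0, 0x30, 0xff) is true because
 * 0xf0 | 0x30 == 0xf0, i.e. every masked bit set in 'subset' is also set in
 * 'superset'; is_bitwise_subset(0xf0, 0x0c, 0xff) is false.  Bits outside
 * 'mask' are ignored entirely.
 */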

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
				 VMX_BASIC_INOUT |
				 VMX_BASIC_TRUE_CTLS;

	const u64 reserved_bits = GENMASK_ULL(63, 56) |
				  GENMASK_ULL(47, 45) |
				  BIT_ULL(31);

	u64 vmx_basic = vmcs_config.nested.basic;

	BUILD_BUG_ON(feature_bits & reserved_bits);

	/*
	 * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has
	 * inverted polarity), the incoming value must not set feature bits or
	 * reserved bits that aren't allowed/supported by KVM.  Fields, i.e.
	 * multi-bit values, are explicitly checked below.
	 */
	if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}

static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
				u32 **low, u32 **high)
{
	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		*low = &msrs->pinbased_ctls_low;
		*high = &msrs->pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		*low = &msrs->procbased_ctls_low;
		*high = &msrs->procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		*low = &msrs->exit_ctls_low;
		*high = &msrs->exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		*low = &msrs->entry_ctls_low;
		*high = &msrs->entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*low = &msrs->secondary_ctls_low;
		*high = &msrs->secondary_ctls_high;
		break;
	default:
		BUG();
	}
}

static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u32 *lowp, *highp;
	u64 supported;

	vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
	*lowp = data;
	*highp = data >> 32;
	return 0;
}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA |
				 VMX_MISC_ACTIVITY_HLT |
				 VMX_MISC_ACTIVITY_SHUTDOWN |
				 VMX_MISC_ACTIVITY_WAIT_SIPI |
				 VMX_MISC_INTEL_PT |
				 VMX_MISC_RDMSR_IN_SMM |
				 VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
				 VMX_MISC_VMXOFF_BLOCK_SMI |
				 VMX_MISC_ZERO_LEN_INS;

	const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9);

	u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
				       vmcs_config.nested.misc_high);

	BUILD_BUG_ON(feature_bits & reserved_bits);

	/*
	 * The incoming value must not set feature bits or reserved bits that
	 * aren't allowed/supported by KVM.  Fields, i.e. multi-bit values, are
	 * explicitly checked below.
	 */
	if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits))
		return -EINVAL;

	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
					       vmcs_config.nested.vpid_caps);

	/* Every bit is either reserved or a feature bit. */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
{
	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		return &msrs->cr0_fixed0;
	case MSR_IA32_VMX_CR4_FIXED0:
		return &msrs->cr4_fixed0;
	default:
		BUG();
	}
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);

	/*
	 * 1 bits (which indicate bits that "must be 1" during VMX operation)
	 * must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Don't allow changes to the VMX capability MSRs while the vCPU
	 * is in VMX operation.
	 */
	if (vmx->nested.vmxon)
		return -EBUSY;

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		/*
		 * The "non-true" VMX capability MSRs are generated from the
		 * "true" MSRs, so we do not support restoring them directly.
		 *
		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
		 * should restore the "true" MSRs with the must-be-1 bits
		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
		 * DEFAULT SETTINGS".
		 */
		return -EINVAL;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return vmx_restore_control_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_MISC:
		return vmx_restore_vmx_misc(vmx, data);
	case MSR_IA32_VMX_CR0_FIXED0:
	case MSR_IA32_VMX_CR4_FIXED0:
		return vmx_restore_fixed0_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_CR0_FIXED1:
	case MSR_IA32_VMX_CR4_FIXED1:
		/*
		 * These MSRs are generated based on the vCPU's CPUID, so we
		 * do not support restoring them directly.
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.msrs.vmcs_enum = data;
		return 0;
	case MSR_IA32_VMX_VMFUNC:
		if (data & ~vmcs_config.nested.vmfunc_controls)
			return -EINVAL;
		vmx->nested.msrs.vmfunc_controls = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/* Returns 0 on success, non-0 otherwise. */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = msrs->basic;
		break;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = msrs->cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = msrs->cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = msrs->cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = msrs->cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = msrs->vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		*pdata = msrs->ept_caps |
			((u64)msrs->vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}
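
/*
 * Side note (illustrative): the non-TRUE control MSRs above essentially
 * differ from their TRUE counterparts in that the "default1" class of control
 * bits is reported as must-be-1 via the *_ALWAYSON_WITHOUT_TRUE_MSR masks,
 * which is why KVM can synthesize them on read here and refuses to restore
 * them directly in vmx_set_vmx_msr().
 */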

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
 * been modified by the L1 guest.  Note, "writable" in this context means
 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
 * VM-exit information fields (which are actually writable if the vCPU is
 * configured to support "VMWRITE to any supported field in the VMCS").
 */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i;

	if (WARN_ON(!shadow_vmcs))
		return;

	preempt_disable();

	vmcs_load(shadow_vmcs);

	for (i = 0; i < max_shadow_read_write_fields; i++) {
		field = shadow_read_write_fields[i];
		val = __vmcs_readl(field.encoding);
		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);

	preempt_enable();
}

static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
	const struct shadow_vmcs_field *fields[] = {
		shadow_read_write_fields,
		shadow_read_only_fields
	};
	const int max_fields[] = {
		max_shadow_read_write_fields,
		max_shadow_read_only_fields
	};
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i, q;

	if (WARN_ON(!shadow_vmcs))
		return;

	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			val = vmcs12_read_any(vmcs12, field.encoding,
					      field.offset);
			__vmcs_writel(field.encoding, val);
		}
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
}

static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
{
#ifdef CONFIG_KVM_HYPERV
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);

	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
		hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
		hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
		hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->guest_ssp = evmcs->guest_ssp;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
		vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
		 * vmcs12->host_ssp = evmcs->host_ssp;
		 * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
		vmcs12->guest_es_selector = evmcs->guest_es_selector;
		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
		vmcs12->tsc_offset = evmcs->tsc_offset;
		vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
		vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap;
		vmcs12->tsc_multiplier = evmcs->tsc_multiplier;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
		vmcs12->guest_cr0 = evmcs->guest_cr0;
		vmcs12->guest_cr3 = evmcs->guest_cr3;
		vmcs12->guest_cr4 = evmcs->guest_cr4;
		vmcs12->guest_dr7 = evmcs->guest_dr7;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
		vmcs12->host_fs_base = evmcs->host_fs_base;
		vmcs12->host_gs_base = evmcs->host_gs_base;
		vmcs12->host_tr_base = evmcs->host_tr_base;
		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
		vmcs12->host_idtr_base = evmcs->host_idtr_base;
		vmcs12->host_rsp = evmcs->host_rsp;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
		vmcs12->ept_pointer = evmcs->ept_pointer;
		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
		vmcs12->guest_pending_dbg_exceptions =
			evmcs->guest_pending_dbg_exceptions;
		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
		vmcs12->guest_activity_state = evmcs->guest_activity_state;
		vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
		vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet;
vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; 1857 * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; 1858 */ 1859 } 1860 1861 /* 1862 * Not used? 1863 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1864 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1865 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1866 * vmcs12->page_fault_error_code_mask = 1867 * evmcs->page_fault_error_code_mask; 1868 * vmcs12->page_fault_error_code_match = 1869 * evmcs->page_fault_error_code_match; 1870 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1871 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1872 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1873 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1874 */ 1875 1876 /* 1877 * Read only fields: 1878 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1879 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1880 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1881 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1882 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1883 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1884 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1885 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1886 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1887 * vmcs12->exit_qualification = evmcs->exit_qualification; 1888 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1889 * 1890 * Not present in struct vmcs12: 1891 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; 1892 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; 1893 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; 1894 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; 1895 */ 1896 1897 return; 1898 #else /* CONFIG_KVM_HYPERV */ 1899 KVM_BUG_ON(1, vmx->vcpu.kvm); 1900 #endif /* CONFIG_KVM_HYPERV */ 1901 } 1902 1903 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) 1904 { 1905 #ifdef CONFIG_KVM_HYPERV 1906 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1907 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 1908 1909 /* 1910 * Should not be changed by KVM: 1911 * 1912 * evmcs->host_es_selector = vmcs12->host_es_selector; 1913 * evmcs->host_cs_selector = vmcs12->host_cs_selector; 1914 * evmcs->host_ss_selector = vmcs12->host_ss_selector; 1915 * evmcs->host_ds_selector = vmcs12->host_ds_selector; 1916 * evmcs->host_fs_selector = vmcs12->host_fs_selector; 1917 * evmcs->host_gs_selector = vmcs12->host_gs_selector; 1918 * evmcs->host_tr_selector = vmcs12->host_tr_selector; 1919 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; 1920 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; 1921 * evmcs->host_cr0 = vmcs12->host_cr0; 1922 * evmcs->host_cr3 = vmcs12->host_cr3; 1923 * evmcs->host_cr4 = vmcs12->host_cr4; 1924 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; 1925 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; 1926 * evmcs->host_rip = vmcs12->host_rip; 1927 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; 1928 * evmcs->host_fs_base = vmcs12->host_fs_base; 1929 * evmcs->host_gs_base = vmcs12->host_gs_base; 1930 * evmcs->host_tr_base = vmcs12->host_tr_base; 1931 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; 1932 * evmcs->host_idtr_base = 
vmcs12->host_idtr_base; 1933 * evmcs->host_rsp = vmcs12->host_rsp; 1934 * sync_vmcs02_to_vmcs12() doesn't read these: 1935 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; 1936 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; 1937 * evmcs->msr_bitmap = vmcs12->msr_bitmap; 1938 * evmcs->ept_pointer = vmcs12->ept_pointer; 1939 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1940 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1941 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1942 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1943 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1944 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1945 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1946 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1947 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1948 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1949 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1950 * evmcs->page_fault_error_code_mask = 1951 * vmcs12->page_fault_error_code_mask; 1952 * evmcs->page_fault_error_code_match = 1953 * vmcs12->page_fault_error_code_match; 1954 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1955 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1956 * evmcs->tsc_offset = vmcs12->tsc_offset; 1957 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1958 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1959 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1960 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1961 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1962 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1963 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1964 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1965 * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl; 1966 * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl; 1967 * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap; 1968 * evmcs->tsc_multiplier = vmcs12->tsc_multiplier; 1969 * 1970 * Not present in struct vmcs12: 1971 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1972 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1973 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1974 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1975 * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet; 1976 * evmcs->host_ssp = vmcs12->host_ssp; 1977 * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr; 1978 * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet; 1979 * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl; 1980 * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr; 1981 * evmcs->guest_ssp = vmcs12->guest_ssp; 1982 */ 1983 1984 evmcs->guest_es_selector = vmcs12->guest_es_selector; 1985 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 1986 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 1987 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 1988 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 1989 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 1990 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 1991 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 1992 1993 evmcs->guest_es_limit = vmcs12->guest_es_limit; 1994 evmcs->guest_cs_limit = 
vmcs12->guest_cs_limit; 1995 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 1996 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 1997 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 1998 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 1999 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 2000 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 2001 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 2002 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 2003 2004 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 2005 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 2006 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 2007 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 2008 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 2009 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 2010 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 2011 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 2012 2013 evmcs->guest_es_base = vmcs12->guest_es_base; 2014 evmcs->guest_cs_base = vmcs12->guest_cs_base; 2015 evmcs->guest_ss_base = vmcs12->guest_ss_base; 2016 evmcs->guest_ds_base = vmcs12->guest_ds_base; 2017 evmcs->guest_fs_base = vmcs12->guest_fs_base; 2018 evmcs->guest_gs_base = vmcs12->guest_gs_base; 2019 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 2020 evmcs->guest_tr_base = vmcs12->guest_tr_base; 2021 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 2022 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 2023 2024 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 2025 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 2026 2027 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 2028 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 2029 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 2030 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 2031 2032 evmcs->guest_pending_dbg_exceptions = 2033 vmcs12->guest_pending_dbg_exceptions; 2034 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 2035 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 2036 2037 evmcs->guest_activity_state = vmcs12->guest_activity_state; 2038 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 2039 2040 evmcs->guest_cr0 = vmcs12->guest_cr0; 2041 evmcs->guest_cr3 = vmcs12->guest_cr3; 2042 evmcs->guest_cr4 = vmcs12->guest_cr4; 2043 evmcs->guest_dr7 = vmcs12->guest_dr7; 2044 2045 evmcs->guest_physical_address = vmcs12->guest_physical_address; 2046 2047 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 2048 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 2049 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 2050 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 2051 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 2052 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 2053 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 2054 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 2055 2056 evmcs->exit_qualification = vmcs12->exit_qualification; 2057 2058 evmcs->guest_linear_address = vmcs12->guest_linear_address; 2059 evmcs->guest_rsp = vmcs12->guest_rsp; 2060 evmcs->guest_rflags = vmcs12->guest_rflags; 2061 2062 evmcs->guest_interruptibility_info = 2063 vmcs12->guest_interruptibility_info; 2064 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 2065 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 2066 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 2067 evmcs->vm_entry_exception_error_code = 2068 vmcs12->vm_entry_exception_error_code; 2069 
	evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;

	evmcs->guest_rip = vmcs12->guest_rip;

	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;

	return;
#else /* CONFIG_KVM_HYPERV */
	KVM_BUG_ON(1, vmx->vcpu.kvm);
#endif /* CONFIG_KVM_HYPERV */
}

/*
 * This is the equivalent of the nested hypervisor executing the VMPTRLD
 * instruction.
 */
static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
	struct kvm_vcpu *vcpu, bool from_launch)
{
#ifdef CONFIG_KVM_HYPERV
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool evmcs_gpa_changed = false;
	u64 evmcs_gpa;

	if (likely(!guest_cpu_cap_has_evmcs(vcpu)))
		return EVMPTRLD_DISABLED;

	evmcs_gpa = nested_get_evmptr(vcpu);
	if (!evmptr_is_valid(evmcs_gpa)) {
		nested_release_evmcs(vcpu);
		return EVMPTRLD_DISABLED;
	}

	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
		vmx->nested.current_vmptr = INVALID_GPA;

		nested_release_evmcs(vcpu);

		if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
				 &vmx->nested.hv_evmcs_map))
			return EVMPTRLD_ERROR;

		vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;

		/*
		 * Currently, KVM only supports eVMCS version 1
		 * (== KVM_EVMCS_VERSION), so the guest is expected to set the
		 * first u32 field of the eVMCS, which specifies the eVMCS
		 * VersionNumber, to that value.
		 *
		 * The guest learns the supported eVMCS versions from the host
		 * by examining CPUID.0x4000000A.EAX[0:15]. The host userspace
		 * VMM is expected to set this CPUID leaf according to the
		 * value returned in vmcs_version from nested_enable_evmcs().
		 *
		 * However, it turns out that Microsoft Hyper-V fails to comply
		 * with its own invented interface: when Hyper-V uses eVMCS, it
		 * sets the first u32 field of the eVMCS to the revision_id
		 * specified in MSR_IA32_VMX_BASIC instead of an eVMCS version
		 * number from the supported versions advertised in
		 * CPUID.0x4000000A.EAX[0:15].
		 *
		 * To work around the Hyper-V bug, accept either a supported
		 * eVMCS version or the VMCS12 revision_id as valid values for
		 * the first u32 field of the eVMCS.
		 */
		if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
		    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
			nested_release_evmcs(vcpu);
			return EVMPTRLD_VMFAIL;
		}

		vmx->nested.hv_evmcs_vmptr = evmcs_gpa;

		evmcs_gpa_changed = true;
		/*
		 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
		 * reloaded from guest memory (read-only fields, fields not
		 * present in struct hv_enlightened_vmcs, ...). Make sure there
		 * are no leftovers.
		 */
		if (from_launch) {
			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
			memset(vmcs12, 0, sizeof(*vmcs12));
			vmcs12->hdr.revision_id = VMCS12_REVISION;
		}

	}

	/*
	 * Clean fields data can't be used on VMLAUNCH and when we switch
	 * between different L2 guests as KVM keeps a single VMCS12 per L1.
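	 *
	 * For illustration: each field group is only re-read from the eVMCS
	 * when its clean bit is clear, i.e. copy_enlightened_to_vmcs12()
	 * above is built from blocks of the form
	 *
	 *	if (unlikely(!(hv_clean_fields &
	 *		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)))
	 *		... re-copy the GRP2 guest fields ...
	 *
	 * so clearing every clean bit below forces a full re-copy on the
	 * next sync.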
2161 */ 2162 if (from_launch || evmcs_gpa_changed) { 2163 vmx->nested.hv_evmcs->hv_clean_fields &= 2164 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2165 2166 vmx->nested.force_msr_bitmap_recalc = true; 2167 } 2168 2169 return EVMPTRLD_SUCCEEDED; 2170 #else 2171 return EVMPTRLD_DISABLED; 2172 #endif 2173 } 2174 2175 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2176 { 2177 struct vcpu_vmx *vmx = to_vmx(vcpu); 2178 2179 if (nested_vmx_is_evmptr12_valid(vmx)) 2180 copy_vmcs12_to_enlightened(vmx); 2181 else 2182 copy_vmcs12_to_shadow(vmx); 2183 2184 vmx->nested.need_vmcs12_to_shadow_sync = false; 2185 } 2186 2187 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2188 { 2189 struct vcpu_vmx *vmx = 2190 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2191 2192 vmx->nested.preemption_timer_expired = true; 2193 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2194 kvm_vcpu_kick(&vmx->vcpu); 2195 2196 return HRTIMER_NORESTART; 2197 } 2198 2199 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2200 { 2201 struct vcpu_vmx *vmx = to_vmx(vcpu); 2202 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2203 2204 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2205 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2206 2207 if (!vmx->nested.has_preemption_timer_deadline) { 2208 vmx->nested.preemption_timer_deadline = 2209 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2210 vmx->nested.has_preemption_timer_deadline = true; 2211 } 2212 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2213 } 2214 2215 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2216 u64 preemption_timeout) 2217 { 2218 struct vcpu_vmx *vmx = to_vmx(vcpu); 2219 2220 /* 2221 * A timer value of zero is architecturally guaranteed to cause 2222 * a VMExit prior to executing any instructions in the guest. 2223 */ 2224 if (preemption_timeout == 0) { 2225 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2226 return; 2227 } 2228 2229 if (vcpu->arch.virtual_tsc_khz == 0) 2230 return; 2231 2232 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2233 preemption_timeout *= 1000000; 2234 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2235 hrtimer_start(&vmx->nested.preemption_timer, 2236 ktime_add_ns(ktime_get(), preemption_timeout), 2237 HRTIMER_MODE_ABS_PINNED); 2238 } 2239 2240 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2241 { 2242 if (vmx->nested.nested_run_pending && 2243 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2244 return vmcs12->guest_ia32_efer; 2245 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2246 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2247 else 2248 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2249 } 2250 2251 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2252 { 2253 struct kvm *kvm = vmx->vcpu.kvm; 2254 2255 /* 2256 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2257 * according to L0's settings (vmcs12 is irrelevant here). Host 2258 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2259 * will be set as needed prior to VMLAUNCH/VMRESUME. 2260 */ 2261 if (vmx->nested.vmcs02_initialized) 2262 return; 2263 vmx->nested.vmcs02_initialized = true; 2264 2265 /* 2266 * We don't care what the EPTP value is we just need to guarantee 2267 * it's valid so we don't get a false positive when doing early 2268 * consistency checks. 
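	 *
	 * As a rough sketch (based on how construct_eptp() is used below),
	 * the dummy EPTP amounts to
	 *
	 *	VMX_EPTP_MT_WB | VMX_EPTP_PWL_4 | root_hpa (0 here)
	 *
	 * possibly plus the A/D-enable bit. The zero root HPA is never
	 * actually walked, because the early-check VM-Enter is made to fail
	 * its consistency checks before any guest access can happen.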
2269 */ 2270 if (enable_ept && nested_early_check) 2271 vmcs_write64(EPT_POINTER, 2272 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); 2273 2274 if (vmx->ve_info) 2275 vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); 2276 2277 /* All VMFUNCs are currently emulated through L0 vmexits. */ 2278 if (cpu_has_vmx_vmfunc()) 2279 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2280 2281 if (cpu_has_vmx_posted_intr()) 2282 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2283 2284 if (cpu_has_vmx_msr_bitmap()) 2285 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2286 2287 /* 2288 * PML is emulated for L2, but never enabled in hardware as the MMU 2289 * handles A/D emulation. Disabling PML for L2 also avoids having to 2290 * deal with filtering out L2 GPAs from the buffer. 2291 */ 2292 if (enable_pml) { 2293 vmcs_write64(PML_ADDRESS, 0); 2294 vmcs_write16(GUEST_PML_INDEX, -1); 2295 } 2296 2297 if (cpu_has_vmx_encls_vmexit()) 2298 vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); 2299 2300 if (kvm_notify_vmexit_enabled(kvm)) 2301 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 2302 2303 /* 2304 * Set the MSR load/store lists to match L0's settings. Only the 2305 * addresses are constant (for vmcs02), the counts can change based 2306 * on L2's behavior, e.g. switching to/from long mode. 2307 */ 2308 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2309 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2310 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2311 2312 vmx_set_constant_host_state(vmx); 2313 } 2314 2315 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2316 struct vmcs12 *vmcs12) 2317 { 2318 prepare_vmcs02_constant_state(vmx); 2319 2320 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 2321 2322 /* 2323 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the 2324 * same VPID as the host. Emulate this behavior by using vpid01 for L2 2325 * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter 2326 * and VM-Exit are architecturally required to flush VPID=0, but *only* 2327 * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the 2328 * required flushes), but doing so would cause KVM to over-flush. E.g. 2329 * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled, 2330 * and then runs L2 X again, then KVM can and should retain TLB entries 2331 * for VPID12=1. 2332 */ 2333 if (enable_vpid) { 2334 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2335 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2336 else 2337 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2338 } 2339 } 2340 2341 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, 2342 struct vmcs12 *vmcs12) 2343 { 2344 u32 exec_control; 2345 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2346 2347 if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) 2348 prepare_vmcs02_early_rare(vmx, vmcs12); 2349 2350 /* 2351 * PIN CONTROLS 2352 */ 2353 exec_control = __pin_controls_get(vmcs01); 2354 exec_control |= (vmcs12->pin_based_vm_exec_control & 2355 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2356 2357 /* Posted interrupts setting is only taken from vmcs12. 
*/ 2358 vmx->nested.pi_pending = false; 2359 if (nested_cpu_has_posted_intr(vmcs12)) { 2360 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2361 } else { 2362 vmx->nested.posted_intr_nv = -1; 2363 exec_control &= ~PIN_BASED_POSTED_INTR; 2364 } 2365 pin_controls_set(vmx, exec_control); 2366 2367 /* 2368 * EXEC CONTROLS 2369 */ 2370 exec_control = __exec_controls_get(vmcs01); /* L0's desires */ 2371 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2372 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2373 exec_control &= ~CPU_BASED_TPR_SHADOW; 2374 exec_control |= vmcs12->cpu_based_vm_exec_control; 2375 2376 vmx->nested.l1_tpr_threshold = -1; 2377 if (exec_control & CPU_BASED_TPR_SHADOW) 2378 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2379 #ifdef CONFIG_X86_64 2380 else 2381 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2382 CPU_BASED_CR8_STORE_EXITING; 2383 #endif 2384 2385 /* 2386 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2387 * for I/O port accesses. 2388 */ 2389 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2390 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2391 2392 /* 2393 * This bit will be computed in nested_get_vmcs12_pages, because 2394 * we do not have access to L1's MSR bitmap yet. For now, keep 2395 * the same bit as before, hoping to avoid multiple VMWRITEs that 2396 * only set/clear this bit. 2397 */ 2398 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2399 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2400 2401 exec_controls_set(vmx, exec_control); 2402 2403 /* 2404 * SECONDARY EXEC CONTROLS 2405 */ 2406 if (cpu_has_secondary_exec_ctrls()) { 2407 exec_control = __secondary_exec_controls_get(vmcs01); 2408 2409 /* Take the following fields only from vmcs12 */ 2410 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2411 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2412 SECONDARY_EXEC_ENABLE_INVPCID | 2413 SECONDARY_EXEC_ENABLE_RDTSCP | 2414 SECONDARY_EXEC_ENABLE_XSAVES | 2415 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2416 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2417 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2418 SECONDARY_EXEC_ENABLE_VMFUNC | 2419 SECONDARY_EXEC_DESC); 2420 2421 if (nested_cpu_has(vmcs12, 2422 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2423 exec_control |= vmcs12->secondary_vm_exec_control; 2424 2425 /* PML is emulated and never enabled in hardware for L2. */ 2426 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2427 2428 /* VMCS shadowing for L2 is emulated for now */ 2429 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2430 2431 /* 2432 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2433 * will not have to rewrite the controls just for this bit. 2434 */ 2435 if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2436 exec_control |= SECONDARY_EXEC_DESC; 2437 2438 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2439 vmcs_write16(GUEST_INTR_STATUS, 2440 vmcs12->guest_intr_status); 2441 2442 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2443 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2444 2445 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2446 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2447 2448 secondary_exec_controls_set(vmx, exec_control); 2449 } 2450 2451 /* 2452 * ENTRY CONTROLS 2453 * 2454 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2455 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2456 * on the related bits (if supported by the CPU) in the hope that 2457 * we can avoid VMWrites during vmx_set_efer(). 
2458 * 2459 * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is 2460 * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to 2461 * do the same for L2. 2462 */ 2463 exec_control = __vm_entry_controls_get(vmcs01); 2464 exec_control |= (vmcs12->vm_entry_controls & 2465 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 2466 exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); 2467 if (cpu_has_load_ia32_efer()) { 2468 if (guest_efer & EFER_LMA) 2469 exec_control |= VM_ENTRY_IA32E_MODE; 2470 if (guest_efer != kvm_host.efer) 2471 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2472 } 2473 vm_entry_controls_set(vmx, exec_control); 2474 2475 /* 2476 * EXIT CONTROLS 2477 * 2478 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2479 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2480 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 2481 */ 2482 exec_control = __vm_exit_controls_get(vmcs01); 2483 if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer) 2484 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2485 else 2486 exec_control &= ~VM_EXIT_LOAD_IA32_EFER; 2487 vm_exit_controls_set(vmx, exec_control); 2488 2489 /* 2490 * Interrupt/Exception Fields 2491 */ 2492 if (vmx->nested.nested_run_pending) { 2493 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2494 vmcs12->vm_entry_intr_info_field); 2495 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2496 vmcs12->vm_entry_exception_error_code); 2497 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2498 vmcs12->vm_entry_instruction_len); 2499 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2500 vmcs12->guest_interruptibility_info); 2501 vmx->loaded_vmcs->nmi_known_unmasked = 2502 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2503 } else { 2504 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2505 } 2506 } 2507 2508 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2509 { 2510 struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); 2511 2512 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2513 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2514 2515 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2516 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2517 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2518 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2519 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2520 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2521 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2522 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2523 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2524 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2525 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2526 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2527 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2528 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2529 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2530 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2531 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2532 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2533 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2534 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2535 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2536 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2537 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2538 
		vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
		vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
		vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
		vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
		vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
		vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
		vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
		vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
		vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
		vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
		vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
		vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
		vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);

		vmx_segment_cache_clear(vmx);
	}

	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
		vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
			    vmcs12->guest_pending_dbg_exceptions);
		vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
		vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);

		/*
		 * L1 may access L2's PDPTRs, so save them to construct
		 * vmcs12.
		 */
		if (enable_ept) {
			vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
			vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
			vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
			vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
		}

		if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
	}

	if (nested_cpu_has_xsaves(vmcs12))
		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);

	/*
	 * Whether page-faults are trapped is determined by a combination of
	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0
	 * doesn't care about page faults then we should set all of these to
	 * L1's desires. However, if L0 does care about (some) page faults,
	 * it is not easy (if at all possible?) to merge L0 and L1's desires,
	 * so we simply ask to exit on each and every L2 page fault. This is
	 * done by setting MASK=MATCH=0 and (see below) EB.PF=1.
	 * Note that below we don't need special code to set EB.PF beyond the
	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept is
	 * set, vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and
	 * when !enable_ept, EB.PF is 1, so the "or" will always be 1.
	 */
	if (vmx_need_pf_intercept(&vmx->vcpu)) {
		/*
		 * TODO: if both L0 and L1 need the same MASK and MATCH,
		 * go ahead and use it?
		 */
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
	} else {
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
	}

	if (cpu_has_vmx_apicv()) {
		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
		vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
	}

	/*
	 * Make sure the msr_autostore list is up to date before we set the
	 * count in the vmcs02.
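	 *
	 * The counts written below have to agree with these lists; the list
	 * addresses themselves are constant and were already written in
	 * prepare_vmcs02_constant_state(). A stale VM_EXIT_MSR_STORE_COUNT
	 * would make the CPU consume whatever happens to sit in the unused
	 * tail of msr_autostore.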
2617 */ 2618 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); 2619 2620 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); 2621 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2622 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2623 2624 set_cr4_guest_host_mask(vmx); 2625 } 2626 2627 /* 2628 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 2629 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 2630 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 2631 * guest in a way that will both be appropriate to L1's requests, and our 2632 * needs. In addition to modifying the active vmcs (which is vmcs02), this 2633 * function also has additional necessary side-effects, like setting various 2634 * vcpu->arch fields. 2635 * Returns 0 on success, 1 on failure. Invalid state exit qualification code 2636 * is assigned to entry_failure_code on failure. 2637 */ 2638 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 2639 bool from_vmentry, 2640 enum vm_entry_failure_code *entry_failure_code) 2641 { 2642 struct vcpu_vmx *vmx = to_vmx(vcpu); 2643 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 2644 bool load_guest_pdptrs_vmcs12 = false; 2645 2646 if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) { 2647 prepare_vmcs02_rare(vmx, vmcs12); 2648 vmx->nested.dirty_vmcs12 = false; 2649 2650 load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) || 2651 !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); 2652 } 2653 2654 if (vmx->nested.nested_run_pending && 2655 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 2656 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 2657 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 2658 } else { 2659 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2660 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); 2661 } 2662 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2663 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 2664 vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); 2665 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 2666 2667 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 2668 * bitwise-or of what L1 wants to trap for L2, and what we want to 2669 * trap. Note that CR0.TS also needs updating - we do this later. 
2670 */ 2671 vmx_update_exception_bitmap(vcpu); 2672 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 2673 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2674 2675 if (vmx->nested.nested_run_pending && 2676 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { 2677 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 2678 vcpu->arch.pat = vmcs12->guest_ia32_pat; 2679 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2680 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 2681 } 2682 2683 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( 2684 vcpu->arch.l1_tsc_offset, 2685 vmx_get_l2_tsc_offset(vcpu), 2686 vmx_get_l2_tsc_multiplier(vcpu)); 2687 2688 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( 2689 vcpu->arch.l1_tsc_scaling_ratio, 2690 vmx_get_l2_tsc_multiplier(vcpu)); 2691 2692 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2693 if (kvm_caps.has_tsc_control) 2694 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 2695 2696 nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); 2697 2698 if (nested_cpu_has_ept(vmcs12)) 2699 nested_ept_init_mmu_context(vcpu); 2700 2701 /* 2702 * Override the CR0/CR4 read shadows after setting the effective guest 2703 * CR0/CR4. The common helpers also set the shadows, but they don't 2704 * account for vmcs12's cr0/4_guest_host_mask. 2705 */ 2706 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 2707 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2708 2709 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 2710 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 2711 2712 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); 2713 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 2714 vmx_set_efer(vcpu, vcpu->arch.efer); 2715 2716 /* 2717 * Guest state is invalid and unrestricted guest is disabled, 2718 * which means L1 attempted VMEntry to L2 with invalid state. 2719 * Fail the VMEntry. 2720 * 2721 * However when force loading the guest state (SMM exit or 2722 * loading nested state after migration, it is possible to 2723 * have invalid guest state now, which will be later fixed by 2724 * restoring L2 register state 2725 */ 2726 if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) { 2727 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2728 return -EINVAL; 2729 } 2730 2731 /* Shadow page tables on either EPT or shadow page tables. */ 2732 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), 2733 from_vmentry, entry_failure_code)) 2734 return -EINVAL; 2735 2736 /* 2737 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 2738 * on nested VM-Exit, which can occur without actually running L2 and 2739 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with 2740 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the 2741 * transition to HLT instead of running L2. 2742 */ 2743 if (enable_ept) 2744 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); 2745 2746 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. 
*/ 2747 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2748 is_pae_paging(vcpu)) { 2749 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2750 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2751 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2752 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2753 } 2754 2755 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2756 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && 2757 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2758 vmcs12->guest_ia32_perf_global_ctrl))) { 2759 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2760 return -EINVAL; 2761 } 2762 2763 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2764 kvm_rip_write(vcpu, vmcs12->guest_rip); 2765 2766 /* 2767 * It was observed that genuine Hyper-V running in L1 doesn't reset 2768 * 'hv_clean_fields' by itself, it only sets the corresponding dirty 2769 * bits when it changes a field in eVMCS. Mark all fields as clean 2770 * here. 2771 */ 2772 if (nested_vmx_is_evmptr12_valid(vmx)) 2773 evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2774 2775 return 0; 2776 } 2777 2778 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2779 { 2780 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2781 nested_cpu_has_virtual_nmis(vmcs12))) 2782 return -EINVAL; 2783 2784 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2785 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2786 return -EINVAL; 2787 2788 return 0; 2789 } 2790 2791 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) 2792 { 2793 struct vcpu_vmx *vmx = to_vmx(vcpu); 2794 2795 /* Check for memory type validity */ 2796 switch (new_eptp & VMX_EPTP_MT_MASK) { 2797 case VMX_EPTP_MT_UC: 2798 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2799 return false; 2800 break; 2801 case VMX_EPTP_MT_WB: 2802 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2803 return false; 2804 break; 2805 default: 2806 return false; 2807 } 2808 2809 /* Page-walk levels validity. 
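	 * EPTP bits 5:3 encode the page-walk length minus one, so
	 * VMX_EPTP_PWL_4 and VMX_EPTP_PWL_5 select 4- and 5-level walks;
	 * support for each is advertised to L1 via VMX_EPT_PAGE_WALK_4_BIT
	 * and VMX_EPT_PAGE_WALK_5_BIT in the exposed EPT capabilities.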
*/ 2810 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2811 case VMX_EPTP_PWL_5: 2812 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2813 return false; 2814 break; 2815 case VMX_EPTP_PWL_4: 2816 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2817 return false; 2818 break; 2819 default: 2820 return false; 2821 } 2822 2823 /* Reserved bits should not be set */ 2824 if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2825 return false; 2826 2827 /* AD, if set, should be supported */ 2828 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2829 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2830 return false; 2831 } 2832 2833 return true; 2834 } 2835 2836 /* 2837 * Checks related to VM-Execution Control Fields 2838 */ 2839 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2840 struct vmcs12 *vmcs12) 2841 { 2842 struct vcpu_vmx *vmx = to_vmx(vcpu); 2843 2844 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2845 vmx->nested.msrs.pinbased_ctls_low, 2846 vmx->nested.msrs.pinbased_ctls_high)) || 2847 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2848 vmx->nested.msrs.procbased_ctls_low, 2849 vmx->nested.msrs.procbased_ctls_high))) 2850 return -EINVAL; 2851 2852 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2853 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2854 vmx->nested.msrs.secondary_ctls_low, 2855 vmx->nested.msrs.secondary_ctls_high))) 2856 return -EINVAL; 2857 2858 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2859 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2860 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2861 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2862 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2863 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2864 nested_vmx_check_nmi_controls(vmcs12) || 2865 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2866 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2867 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2868 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2869 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2870 return -EINVAL; 2871 2872 if (!nested_cpu_has_preemption_timer(vmcs12) && 2873 nested_cpu_has_save_preemption_timer(vmcs12)) 2874 return -EINVAL; 2875 2876 if (nested_cpu_has_ept(vmcs12) && 2877 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2878 return -EINVAL; 2879 2880 if (nested_cpu_has_vmfunc(vmcs12)) { 2881 if (CC(vmcs12->vm_function_control & 2882 ~vmx->nested.msrs.vmfunc_controls)) 2883 return -EINVAL; 2884 2885 if (nested_cpu_has_eptp_switching(vmcs12)) { 2886 if (CC(!nested_cpu_has_ept(vmcs12)) || 2887 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2888 return -EINVAL; 2889 } 2890 } 2891 2892 return 0; 2893 } 2894 2895 /* 2896 * Checks related to VM-Exit Control Fields 2897 */ 2898 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2899 struct vmcs12 *vmcs12) 2900 { 2901 struct vcpu_vmx *vmx = to_vmx(vcpu); 2902 2903 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2904 vmx->nested.msrs.exit_ctls_low, 2905 vmx->nested.msrs.exit_ctls_high)) || 2906 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2907 return -EINVAL; 2908 2909 return 0; 2910 } 2911 2912 /* 2913 * Checks related to VM-Entry Control Fields 2914 */ 2915 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2916 struct vmcs12 *vmcs12) 2917 { 2918 struct 
vcpu_vmx *vmx = to_vmx(vcpu); 2919 2920 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2921 vmx->nested.msrs.entry_ctls_low, 2922 vmx->nested.msrs.entry_ctls_high))) 2923 return -EINVAL; 2924 2925 /* 2926 * From the Intel SDM, volume 3: 2927 * Fields relevant to VM-entry event injection must be set properly. 2928 * These fields are the VM-entry interruption-information field, the 2929 * VM-entry exception error code, and the VM-entry instruction length. 2930 */ 2931 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2932 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2933 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2934 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2935 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2936 bool should_have_error_code; 2937 bool urg = nested_cpu_has2(vmcs12, 2938 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2939 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2940 2941 /* VM-entry interruption-info field: interruption type */ 2942 if (CC(intr_type == INTR_TYPE_RESERVED) || 2943 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2944 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2945 return -EINVAL; 2946 2947 /* VM-entry interruption-info field: vector */ 2948 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2949 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2950 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2951 return -EINVAL; 2952 2953 /* VM-entry interruption-info field: deliver error code */ 2954 should_have_error_code = 2955 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2956 x86_exception_has_error_code(vector); 2957 if (CC(has_error_code != should_have_error_code)) 2958 return -EINVAL; 2959 2960 /* VM-entry exception error code */ 2961 if (CC(has_error_code && 2962 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2963 return -EINVAL; 2964 2965 /* VM-entry interruption-info field: reserved bits */ 2966 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2967 return -EINVAL; 2968 2969 /* VM-entry instruction length */ 2970 switch (intr_type) { 2971 case INTR_TYPE_SOFT_EXCEPTION: 2972 case INTR_TYPE_SOFT_INTR: 2973 case INTR_TYPE_PRIV_SW_EXCEPTION: 2974 if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) || 2975 CC(vmcs12->vm_entry_instruction_len == 0 && 2976 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2977 return -EINVAL; 2978 } 2979 } 2980 2981 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2982 return -EINVAL; 2983 2984 return 0; 2985 } 2986 2987 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2988 struct vmcs12 *vmcs12) 2989 { 2990 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2991 nested_check_vm_exit_controls(vcpu, vmcs12) || 2992 nested_check_vm_entry_controls(vcpu, vmcs12)) 2993 return -EINVAL; 2994 2995 #ifdef CONFIG_KVM_HYPERV 2996 if (guest_cpu_cap_has_evmcs(vcpu)) 2997 return nested_evmcs_check_controls(vmcs12); 2998 #endif 2999 3000 return 0; 3001 } 3002 3003 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, 3004 struct vmcs12 *vmcs12) 3005 { 3006 #ifdef CONFIG_X86_64 3007 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != 3008 !!(vcpu->arch.efer & EFER_LMA))) 3009 return -EINVAL; 3010 #endif 3011 return 0; 3012 } 3013 3014 static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12) 3015 { 3016 /* 3017 * Check that the given linear address is canonical after a VM exit 3018 * from L2, based on HOST_CR4.LA57 value that will be loaded 
for L1. 3019 */ 3020 u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 57 : 48; 3021 3022 return !__is_canonical_address(la, l1_address_bits_on_exit); 3023 } 3024 3025 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 3026 struct vmcs12 *vmcs12) 3027 { 3028 bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); 3029 3030 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 3031 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 3032 CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) 3033 return -EINVAL; 3034 3035 if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 3036 CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 3037 return -EINVAL; 3038 3039 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 3040 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 3041 return -EINVAL; 3042 3043 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 3044 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3045 vmcs12->host_ia32_perf_global_ctrl))) 3046 return -EINVAL; 3047 3048 if (ia32e) { 3049 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 3050 return -EINVAL; 3051 } else { 3052 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 3053 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 3054 CC((vmcs12->host_rip) >> 32)) 3055 return -EINVAL; 3056 } 3057 3058 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3059 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3060 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3061 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3062 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3063 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3064 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3065 CC(vmcs12->host_cs_selector == 0) || 3066 CC(vmcs12->host_tr_selector == 0) || 3067 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 3068 return -EINVAL; 3069 3070 if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) || 3071 CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) || 3072 CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) || 3073 CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) || 3074 CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) || 3075 CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12))) 3076 return -EINVAL; 3077 3078 /* 3079 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 3080 * IA32_EFER MSR must be 0 in the field for that register. In addition, 3081 * the values of the LMA and LME bits in the field must each be that of 3082 * the host address-space size VM-exit control. 
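	 *
	 * Concretely: with VM_EXIT_HOST_ADDR_SPACE_SIZE set, host_ia32_efer
	 * must have both LMA and LME set, and with it clear both must be
	 * clear; that is exactly what the ia32e comparisons below enforce.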
3083 */ 3084 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 3085 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 3086 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 3087 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 3088 return -EINVAL; 3089 } 3090 3091 return 0; 3092 } 3093 3094 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 3095 struct vmcs12 *vmcs12) 3096 { 3097 struct vcpu_vmx *vmx = to_vmx(vcpu); 3098 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 3099 struct vmcs_hdr hdr; 3100 3101 if (vmcs12->vmcs_link_pointer == INVALID_GPA) 3102 return 0; 3103 3104 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 3105 return -EINVAL; 3106 3107 if (ghc->gpa != vmcs12->vmcs_link_pointer && 3108 CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 3109 vmcs12->vmcs_link_pointer, VMCS12_SIZE))) 3110 return -EINVAL; 3111 3112 if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 3113 offsetof(struct vmcs12, hdr), 3114 sizeof(hdr)))) 3115 return -EINVAL; 3116 3117 if (CC(hdr.revision_id != VMCS12_REVISION) || 3118 CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 3119 return -EINVAL; 3120 3121 return 0; 3122 } 3123 3124 /* 3125 * Checks related to Guest Non-register State 3126 */ 3127 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 3128 { 3129 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 3130 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 3131 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 3132 return -EINVAL; 3133 3134 return 0; 3135 } 3136 3137 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 3138 struct vmcs12 *vmcs12, 3139 enum vm_entry_failure_code *entry_failure_code) 3140 { 3141 bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); 3142 3143 *entry_failure_code = ENTRY_FAIL_DEFAULT; 3144 3145 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 3146 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 3147 return -EINVAL; 3148 3149 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 3150 CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 3151 return -EINVAL; 3152 3153 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 3154 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 3155 return -EINVAL; 3156 3157 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 3158 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 3159 return -EINVAL; 3160 } 3161 3162 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 3163 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3164 vmcs12->guest_ia32_perf_global_ctrl))) 3165 return -EINVAL; 3166 3167 if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) 3168 return -EINVAL; 3169 3170 if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || 3171 CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) 3172 return -EINVAL; 3173 3174 /* 3175 * If the load IA32_EFER VM-entry control is 1, the following checks 3176 * are performed on the field for the IA32_EFER MSR: 3177 * - Bits reserved in the IA32_EFER MSR must be 0. 3178 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 3179 * the IA-32e mode guest VM-exit control. It must also be identical 3180 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 3181 * CR0.PG) is 1. 
3182 */ 3183 if (to_vmx(vcpu)->nested.nested_run_pending && 3184 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 3185 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 3186 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 3187 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 3188 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 3189 return -EINVAL; 3190 } 3191 3192 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 3193 (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3194 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3195 return -EINVAL; 3196 3197 if (nested_check_guest_non_reg_state(vmcs12)) 3198 return -EINVAL; 3199 3200 return 0; 3201 } 3202 3203 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 3204 { 3205 struct vcpu_vmx *vmx = to_vmx(vcpu); 3206 unsigned long cr3, cr4; 3207 bool vm_fail; 3208 3209 if (!nested_early_check) 3210 return 0; 3211 3212 if (vmx->msr_autoload.host.nr) 3213 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3214 if (vmx->msr_autoload.guest.nr) 3215 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3216 3217 preempt_disable(); 3218 3219 vmx_prepare_switch_to_guest(vcpu); 3220 3221 /* 3222 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3223 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3224 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3225 * there is no need to preserve other bits or save/restore the field. 3226 */ 3227 vmcs_writel(GUEST_RFLAGS, 0); 3228 3229 cr3 = __get_current_cr3_fast(); 3230 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3231 vmcs_writel(HOST_CR3, cr3); 3232 vmx->loaded_vmcs->host_state.cr3 = cr3; 3233 } 3234 3235 cr4 = cr4_read_shadow(); 3236 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3237 vmcs_writel(HOST_CR4, cr4); 3238 vmx->loaded_vmcs->host_state.cr4 = cr4; 3239 } 3240 3241 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 3242 __vmx_vcpu_run_flags(vmx)); 3243 3244 if (vmx->msr_autoload.host.nr) 3245 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3246 if (vmx->msr_autoload.guest.nr) 3247 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3248 3249 if (vm_fail) { 3250 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3251 3252 preempt_enable(); 3253 3254 trace_kvm_nested_vmenter_failed( 3255 "early hardware check VM-instruction error: ", error); 3256 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3257 return 1; 3258 } 3259 3260 /* 3261 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3262 */ 3263 if (hw_breakpoint_active()) 3264 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3265 local_irq_enable(); 3266 preempt_enable(); 3267 3268 /* 3269 * A non-failing VMEntry means we somehow entered guest mode with 3270 * an illegal RIP, and that's just the tip of the iceberg. There 3271 * is no telling what memory has been modified or what state has 3272 * been exposed to unknown code. Hitting this all but guarantees 3273 * a (very critical) hardware issue. 3274 */ 3275 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3276 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3277 3278 return 0; 3279 } 3280 3281 #ifdef CONFIG_KVM_HYPERV 3282 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3283 { 3284 struct vcpu_vmx *vmx = to_vmx(vcpu); 3285 3286 /* 3287 * hv_evmcs may end up being not mapped after migration (when 3288 * L2 was running), map it here to make sure vmcs12 changes are 3289 * properly reflected. 
3290 */ 3291 if (guest_cpu_cap_has_evmcs(vcpu) && 3292 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { 3293 enum nested_evmptrld_status evmptrld_status = 3294 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3295 3296 if (evmptrld_status == EVMPTRLD_VMFAIL || 3297 evmptrld_status == EVMPTRLD_ERROR) 3298 return false; 3299 3300 /* 3301 * Post migration VMCS12 always provides the most actual 3302 * information, copy it to eVMCS upon entry. 3303 */ 3304 vmx->nested.need_vmcs12_to_shadow_sync = true; 3305 } 3306 3307 return true; 3308 } 3309 #endif 3310 3311 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3312 { 3313 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3314 struct vcpu_vmx *vmx = to_vmx(vcpu); 3315 struct kvm_host_map *map; 3316 3317 if (!vcpu->arch.pdptrs_from_userspace && 3318 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3319 /* 3320 * Reload the guest's PDPTRs since after a migration 3321 * the guest CR3 might be restored prior to setting the nested 3322 * state which can lead to a load of wrong PDPTRs. 3323 */ 3324 if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) 3325 return false; 3326 } 3327 3328 3329 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3330 map = &vmx->nested.apic_access_page_map; 3331 3332 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { 3333 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); 3334 } else { 3335 pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", 3336 __func__); 3337 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3338 vcpu->run->internal.suberror = 3339 KVM_INTERNAL_ERROR_EMULATION; 3340 vcpu->run->internal.ndata = 0; 3341 return false; 3342 } 3343 } 3344 3345 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3346 map = &vmx->nested.virtual_apic_map; 3347 3348 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3349 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3350 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3351 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3352 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3353 /* 3354 * The processor will never use the TPR shadow, simply 3355 * clear the bit from the execution control. Such a 3356 * configuration is useless, but it happens in tests. 3357 * For any other configuration, failing the vm entry is 3358 * _not_ what the processor does but it's basically the 3359 * only possibility we have. 3360 */ 3361 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3362 } else { 3363 /* 3364 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3365 * force VM-Entry to fail. 3366 */ 3367 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); 3368 } 3369 } 3370 3371 if (nested_cpu_has_posted_intr(vmcs12)) { 3372 map = &vmx->nested.pi_desc_map; 3373 3374 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3375 vmx->nested.pi_desc = 3376 (struct pi_desc *)(((void *)map->hva) + 3377 offset_in_page(vmcs12->posted_intr_desc_addr)); 3378 vmcs_write64(POSTED_INTR_DESC_ADDR, 3379 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3380 } else { 3381 /* 3382 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3383 * access the contents of the VMCS12 posted interrupt 3384 * descriptor. (Note that KVM may do this when it 3385 * should not, per the architectural specification.) 
3386 */ 3387 vmx->nested.pi_desc = NULL; 3388 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3389 } 3390 } 3391 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3392 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3393 else 3394 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3395 3396 return true; 3397 } 3398 3399 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3400 { 3401 #ifdef CONFIG_KVM_HYPERV 3402 /* 3403 * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy 3404 * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory 3405 * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post 3406 * migration. 3407 */ 3408 if (!nested_get_evmcs_page(vcpu)) { 3409 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3410 __func__); 3411 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3412 vcpu->run->internal.suberror = 3413 KVM_INTERNAL_ERROR_EMULATION; 3414 vcpu->run->internal.ndata = 0; 3415 3416 return false; 3417 } 3418 #endif 3419 3420 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3421 return false; 3422 3423 return true; 3424 } 3425 3426 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3427 { 3428 struct vmcs12 *vmcs12; 3429 struct vcpu_vmx *vmx = to_vmx(vcpu); 3430 gpa_t dst; 3431 3432 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3433 return 0; 3434 3435 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3436 return 1; 3437 3438 /* 3439 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3440 * set is already checked as part of A/D emulation. 3441 */ 3442 vmcs12 = get_vmcs12(vcpu); 3443 if (!nested_cpu_has_pml(vmcs12)) 3444 return 0; 3445 3446 if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) { 3447 vmx->nested.pml_full = true; 3448 return 1; 3449 } 3450 3451 gpa &= ~0xFFFull; 3452 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3453 3454 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3455 offset_in_page(dst), sizeof(gpa))) 3456 return 0; 3457 3458 vmcs12->guest_pml_index--; 3459 3460 return 0; 3461 } 3462 3463 /* 3464 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3465 * for running VMX instructions (except VMXON, whose prerequisites are 3466 * slightly different). It also specifies what exception to inject otherwise. 3467 * Note that many of these exceptions have priority over VM exits, so they 3468 * don't have to be checked again here. 3469 */ 3470 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3471 { 3472 if (!to_vmx(vcpu)->nested.vmxon) { 3473 kvm_queue_exception(vcpu, UD_VECTOR); 3474 return 0; 3475 } 3476 3477 if (vmx_get_cpl(vcpu)) { 3478 kvm_inject_gp(vcpu, 0); 3479 return 0; 3480 } 3481 3482 return 1; 3483 } 3484 3485 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3486 struct vmcs12 *vmcs12); 3487 3488 /* 3489 * If from_vmentry is false, this is being called from state restore (either RSM 3490 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 
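 *
 * When from_vmentry is false, the vmcs12 pages are not (re)mapped here and the
 * guest-state consistency checks and VM-entry MSR loads are skipped; that work
 * was already done by the original VM-Enter, or is deferred via
 * KVM_REQ_GET_NESTED_STATE_PAGES.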
3491 * 3492 * Returns: 3493 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3494 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3495 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3496 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3497 */ 3498 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3499 bool from_vmentry) 3500 { 3501 struct vcpu_vmx *vmx = to_vmx(vcpu); 3502 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3503 enum vm_entry_failure_code entry_failure_code; 3504 union vmx_exit_reason exit_reason = { 3505 .basic = EXIT_REASON_INVALID_STATE, 3506 .failed_vmentry = 1, 3507 }; 3508 u32 failed_index; 3509 3510 trace_kvm_nested_vmenter(kvm_rip_read(vcpu), 3511 vmx->nested.current_vmptr, 3512 vmcs12->guest_rip, 3513 vmcs12->guest_intr_status, 3514 vmcs12->vm_entry_intr_info_field, 3515 vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, 3516 vmcs12->ept_pointer, 3517 vmcs12->guest_cr3, 3518 KVM_ISA_VMX); 3519 3520 kvm_service_local_tlb_flush_requests(vcpu); 3521 3522 if (!vmx->nested.nested_run_pending || 3523 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3524 vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3525 if (kvm_mpx_supported() && 3526 (!vmx->nested.nested_run_pending || 3527 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 3528 vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3529 3530 /* 3531 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3532 * nested early checks are disabled. In the event of a "late" VM-Fail, 3533 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3534 * software model to the pre-VMEntry host state. When EPT is disabled, 3535 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3536 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3537 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3538 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3539 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3540 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3541 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3542 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3543 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3544 * path would need to manually save/restore vmcs01.GUEST_CR3. 
3545 */ 3546 if (!enable_ept && !nested_early_check) 3547 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3548 3549 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3550 3551 prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); 3552 3553 if (from_vmentry) { 3554 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3555 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3556 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3557 } 3558 3559 if (nested_vmx_check_vmentry_hw(vcpu)) { 3560 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3561 return NVMX_VMENTRY_VMFAIL; 3562 } 3563 3564 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3565 &entry_failure_code)) { 3566 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3567 vmcs12->exit_qualification = entry_failure_code; 3568 goto vmentry_fail_vmexit; 3569 } 3570 } 3571 3572 enter_guest_mode(vcpu); 3573 3574 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { 3575 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3576 vmcs12->exit_qualification = entry_failure_code; 3577 goto vmentry_fail_vmexit_guest_mode; 3578 } 3579 3580 if (from_vmentry) { 3581 failed_index = nested_vmx_load_msr(vcpu, 3582 vmcs12->vm_entry_msr_load_addr, 3583 vmcs12->vm_entry_msr_load_count); 3584 if (failed_index) { 3585 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3586 vmcs12->exit_qualification = failed_index; 3587 goto vmentry_fail_vmexit_guest_mode; 3588 } 3589 } else { 3590 /* 3591 * The MMU is not initialized to point at the right entities yet and 3592 * "get pages" would need to read data from the guest (i.e. we will 3593 * need to perform gpa to hpa translation). Request a call 3594 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3595 * have already been set at vmentry time and should not be reset. 3596 */ 3597 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3598 } 3599 3600 /* 3601 * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI 3602 * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can 3603 * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit 3604 * unconditionally. Take care to pull data from vmcs01 as appropriate, 3605 * e.g. when checking for interrupt windows, as vmcs02 is now loaded. 3606 */ 3607 if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING | 3608 CPU_BASED_NMI_WINDOW_EXITING)) || 3609 kvm_apic_has_pending_init_or_sipi(vcpu) || 3610 kvm_apic_has_interrupt(vcpu)) 3611 kvm_make_request(KVM_REQ_EVENT, vcpu); 3612 3613 /* 3614 * Do not start the preemption timer hrtimer until after we know 3615 * we are successful, so that only nested_vmx_vmexit needs to cancel 3616 * the timer. 3617 */ 3618 vmx->nested.preemption_timer_expired = false; 3619 if (nested_cpu_has_preemption_timer(vmcs12)) { 3620 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3621 vmx_start_preemption_timer(vcpu, timer_value); 3622 } 3623 3624 /* 3625 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3626 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3627 * returned as far as L1 is concerned. It will only return (and set 3628 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3629 */ 3630 return NVMX_VMENTRY_SUCCESS; 3631 3632 /* 3633 * A failed consistency check that leads to a VMExit during L1's 3634 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3635 * 26.7 "VM-entry failures during or after loading guest state". 
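	 * Note: the vmentry_fail_vmexit_guest_mode label additionally unwinds
	 * the TSC offset and leaves guest mode, since enter_guest_mode() and
	 * prepare_vmcs02() have already run by the time those failures are
	 * detected.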
3636 */ 3637 vmentry_fail_vmexit_guest_mode: 3638 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3639 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3640 leave_guest_mode(vcpu); 3641 3642 vmentry_fail_vmexit: 3643 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3644 3645 if (!from_vmentry) 3646 return NVMX_VMENTRY_VMEXIT; 3647 3648 load_vmcs12_host_state(vcpu, vmcs12); 3649 vmcs12->vm_exit_reason = exit_reason.full; 3650 if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)) 3651 vmx->nested.need_vmcs12_to_shadow_sync = true; 3652 return NVMX_VMENTRY_VMEXIT; 3653 } 3654 3655 /* 3656 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3657 * for running an L2 nested guest. 3658 */ 3659 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3660 { 3661 struct vmcs12 *vmcs12; 3662 enum nvmx_vmentry_status status; 3663 struct vcpu_vmx *vmx = to_vmx(vcpu); 3664 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3665 enum nested_evmptrld_status evmptrld_status; 3666 3667 if (!nested_vmx_check_permission(vcpu)) 3668 return 1; 3669 3670 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3671 if (evmptrld_status == EVMPTRLD_ERROR) { 3672 kvm_queue_exception(vcpu, UD_VECTOR); 3673 return 1; 3674 } 3675 3676 kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); 3677 3678 if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) 3679 return nested_vmx_failInvalid(vcpu); 3680 3681 if (CC(!nested_vmx_is_evmptr12_valid(vmx) && 3682 vmx->nested.current_vmptr == INVALID_GPA)) 3683 return nested_vmx_failInvalid(vcpu); 3684 3685 vmcs12 = get_vmcs12(vcpu); 3686 3687 /* 3688 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3689 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3690 * rather than RFLAGS.ZF, and no error number is stored to the 3691 * VM-instruction error field. 3692 */ 3693 if (CC(vmcs12->hdr.shadow_vmcs)) 3694 return nested_vmx_failInvalid(vcpu); 3695 3696 if (nested_vmx_is_evmptr12_valid(vmx)) { 3697 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 3698 3699 copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields); 3700 /* Enlightened VMCS doesn't have launch state */ 3701 vmcs12->launch_state = !launch; 3702 } else if (enable_shadow_vmcs) { 3703 copy_shadow_to_vmcs12(vmx); 3704 } 3705 3706 /* 3707 * The nested entry process starts with enforcing various prerequisites 3708 * on vmcs12 as required by the Intel SDM, and act appropriately when 3709 * they fail: As the SDM explains, some conditions should cause the 3710 * instruction to fail, while others will cause the instruction to seem 3711 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3712 * To speed up the normal (success) code path, we should avoid checking 3713 * for misconfigurations which will anyway be caught by the processor 3714 * when using the merged vmcs02. 3715 */ 3716 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) 3717 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3718 3719 if (CC(vmcs12->launch_state == launch)) 3720 return nested_vmx_fail(vcpu, 3721 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS 3722 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3723 3724 if (nested_vmx_check_controls(vcpu, vmcs12)) 3725 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3726 3727 if (nested_vmx_check_address_space_size(vcpu, vmcs12)) 3728 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3729 3730 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3731 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3732 3733 /* 3734 * We're finally done with prerequisite checking, and can start with 3735 * the nested entry. 3736 */ 3737 vmx->nested.nested_run_pending = 1; 3738 vmx->nested.has_preemption_timer_deadline = false; 3739 status = nested_vmx_enter_non_root_mode(vcpu, true); 3740 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3741 goto vmentry_failed; 3742 3743 /* Hide L1D cache contents from the nested guest. */ 3744 vmx->vcpu.arch.l1tf_flush_l1d = true; 3745 3746 /* 3747 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3748 * also be used as part of restoring nVMX state for 3749 * snapshot restore (migration). 3750 * 3751 * In this flow, it is assumed that vmcs12 cache was 3752 * transferred as part of captured nVMX state and should 3753 * therefore not be read from guest memory (which may not 3754 * exist on destination host yet). 3755 */ 3756 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3757 3758 switch (vmcs12->guest_activity_state) { 3759 case GUEST_ACTIVITY_HLT: 3760 /* 3761 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3762 * awakened by event injection or by an NMI-window VM-exit or 3763 * by an interrupt-window VM-exit, halt the vcpu. 3764 */ 3765 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3766 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3767 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3768 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3769 vmx->nested.nested_run_pending = 0; 3770 return kvm_emulate_halt_noskip(vcpu); 3771 } 3772 break; 3773 case GUEST_ACTIVITY_WAIT_SIPI: 3774 vmx->nested.nested_run_pending = 0; 3775 kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); 3776 break; 3777 default: 3778 break; 3779 } 3780 3781 return 1; 3782 3783 vmentry_failed: 3784 vmx->nested.nested_run_pending = 0; 3785 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3786 return 0; 3787 if (status == NVMX_VMENTRY_VMEXIT) 3788 return 1; 3789 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3790 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3791 } 3792 3793 /* 3794 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3795 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3796 * This function returns the new value we should put in vmcs12.guest_cr0. 3797 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3798 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3799 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3800 * didn't trap the bit, because if L1 did, so would L0). 3801 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3802 * been modified by L2, and L1 knows it. So just leave the old value of 3803 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 3804 * isn't relevant, because if L0 traps this bit it can set it to anything. 3805 * 3. Bits that L1 didn't trap, but L0 did. 
L1 believes the guest could have 3806 * changed these bits, and therefore they need to be updated, but L0 3807 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3808 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3809 */ 3810 static inline unsigned long 3811 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3812 { 3813 return 3814 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3815 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3816 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3817 vcpu->arch.cr0_guest_owned_bits)); 3818 } 3819 3820 static inline unsigned long 3821 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3822 { 3823 return 3824 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3825 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3826 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3827 vcpu->arch.cr4_guest_owned_bits)); 3828 } 3829 3830 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3831 struct vmcs12 *vmcs12, 3832 u32 vm_exit_reason, u32 exit_intr_info) 3833 { 3834 u32 idt_vectoring; 3835 unsigned int nr; 3836 3837 /* 3838 * Per the SDM, VM-Exits due to double and triple faults are never 3839 * considered to occur during event delivery, even if the double/triple 3840 * fault is the result of an escalating vectoring issue. 3841 * 3842 * Note, the SDM qualifies the double fault behavior with "The original 3843 * event results in a double-fault exception". It's unclear why the 3844 * qualification exists since exits due to double fault can occur only 3845 * while vectoring a different exception (injected events are never 3846 * subject to interception), i.e. there's _always_ an original event. 3847 * 3848 * The SDM also uses NMI as a confusing example for the "original event 3849 * causes the VM exit directly" clause. NMI isn't special in any way, 3850 * the same rule applies to all events that cause an exit directly. 3851 * NMI is an odd choice for the example because NMIs can only occur on 3852 * instruction boundaries, i.e. they _can't_ occur during vectoring. 
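	 * In all other cases the event being (re)injected, if any, is
	 * transferred to vmcs12's IDT_VECTORING_INFO_FIELD below, mirroring
	 * what hardware reports when a VM-Exit occurs during event delivery.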
3853 */ 3854 if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || 3855 ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && 3856 is_double_fault(exit_intr_info))) { 3857 vmcs12->idt_vectoring_info_field = 0; 3858 } else if (vcpu->arch.exception.injected) { 3859 nr = vcpu->arch.exception.vector; 3860 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3861 3862 if (kvm_exception_is_soft(nr)) { 3863 vmcs12->vm_exit_instruction_len = 3864 vcpu->arch.event_exit_inst_len; 3865 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3866 } else 3867 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3868 3869 if (vcpu->arch.exception.has_error_code) { 3870 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3871 vmcs12->idt_vectoring_error_code = 3872 vcpu->arch.exception.error_code; 3873 } 3874 3875 vmcs12->idt_vectoring_info_field = idt_vectoring; 3876 } else if (vcpu->arch.nmi_injected) { 3877 vmcs12->idt_vectoring_info_field = 3878 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3879 } else if (vcpu->arch.interrupt.injected) { 3880 nr = vcpu->arch.interrupt.nr; 3881 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3882 3883 if (vcpu->arch.interrupt.soft) { 3884 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3885 vmcs12->vm_entry_instruction_len = 3886 vcpu->arch.event_exit_inst_len; 3887 } else 3888 idt_vectoring |= INTR_TYPE_EXT_INTR; 3889 3890 vmcs12->idt_vectoring_info_field = idt_vectoring; 3891 } else { 3892 vmcs12->idt_vectoring_info_field = 0; 3893 } 3894 } 3895 3896 3897 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3898 { 3899 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3900 gfn_t gfn; 3901 3902 /* 3903 * Don't need to mark the APIC access page dirty; it is never 3904 * written to by the CPU during APIC virtualization. 3905 */ 3906 3907 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3908 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3909 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3910 } 3911 3912 if (nested_cpu_has_posted_intr(vmcs12)) { 3913 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3914 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3915 } 3916 } 3917 3918 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3919 { 3920 struct vcpu_vmx *vmx = to_vmx(vcpu); 3921 int max_irr; 3922 void *vapic_page; 3923 u16 status; 3924 3925 if (!vmx->nested.pi_pending) 3926 return 0; 3927 3928 if (!vmx->nested.pi_desc) 3929 goto mmio_needed; 3930 3931 vmx->nested.pi_pending = false; 3932 3933 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3934 return 0; 3935 3936 max_irr = pi_find_highest_vector(vmx->nested.pi_desc); 3937 if (max_irr > 0) { 3938 vapic_page = vmx->nested.virtual_apic_map.hva; 3939 if (!vapic_page) 3940 goto mmio_needed; 3941 3942 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3943 vapic_page, &max_irr); 3944 status = vmcs_read16(GUEST_INTR_STATUS); 3945 if ((u8)max_irr > ((u8)status & 0xff)) { 3946 status &= ~0xff; 3947 status |= (u8)max_irr; 3948 vmcs_write16(GUEST_INTR_STATUS, status); 3949 } 3950 } 3951 3952 nested_mark_vmcs12_pages_dirty(vcpu); 3953 return 0; 3954 3955 mmio_needed: 3956 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 3957 return -ENXIO; 3958 } 3959 3960 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) 3961 { 3962 struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; 3963 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 3964 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3965 unsigned long exit_qual; 3966 3967 if (ex->has_payload) { 3968 exit_qual = ex->payload; 3969 } else if (ex->vector == PF_VECTOR) { 3970 
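		/*
		 * No payload, e.g. the exception was injected by userspace
		 * without an exception payload: fall back to the current CR2
		 * value as the page-fault exit qualification.
		 */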
exit_qual = vcpu->arch.cr2; 3971 } else if (ex->vector == DB_VECTOR) { 3972 exit_qual = vcpu->arch.dr6; 3973 exit_qual &= ~DR6_BT; 3974 exit_qual ^= DR6_ACTIVE_LOW; 3975 } else { 3976 exit_qual = 0; 3977 } 3978 3979 /* 3980 * Unlike AMD's Paged Real Mode, which reports an error code on #PF 3981 * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the 3982 * "has error code" flags on VM-Exit if the CPU is in Real Mode. 3983 */ 3984 if (ex->has_error_code && is_protmode(vcpu)) { 3985 /* 3986 * Intel CPUs do not generate error codes with bits 31:16 set, 3987 * and more importantly VMX disallows setting bits 31:16 in the 3988 * injected error code for VM-Entry. Drop the bits to mimic 3989 * hardware and avoid inducing failure on nested VM-Entry if L1 3990 * chooses to inject the exception back to L2. AMD CPUs _do_ 3991 * generate "full" 32-bit error codes, so KVM allows userspace 3992 * to inject exception error codes with bits 31:16 set. 3993 */ 3994 vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; 3995 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3996 } 3997 3998 if (kvm_exception_is_soft(ex->vector)) 3999 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 4000 else 4001 intr_info |= INTR_TYPE_HARD_EXCEPTION; 4002 4003 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 4004 vmx_get_nmi_mask(vcpu)) 4005 intr_info |= INTR_INFO_UNBLOCK_NMI; 4006 4007 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 4008 } 4009 4010 /* 4011 * Returns true if a debug trap is (likely) pending delivery. Infer the class 4012 * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). 4013 * Using the payload is flawed because code breakpoints (fault-like) and data 4014 * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. 4015 * this will return false positives if a to-be-injected code breakpoint #DB is 4016 * pending (from KVM's perspective, but not "pending" across an instruction 4017 * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it 4018 * too is trap-like. 4019 * 4020 * KVM "works" despite these flaws as ICEBP isn't currently supported by the 4021 * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the 4022 * #DB has already happened), and MTF isn't marked pending on code breakpoints 4023 * from the emulator (because such #DBs are fault-like and thus don't trigger 4024 * actions that fire on instruction retire). 4025 */ 4026 static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) 4027 { 4028 if (!ex->pending || ex->vector != DB_VECTOR) 4029 return 0; 4030 4031 /* General Detect #DBs are always fault-like. */ 4032 return ex->payload & ~DR6_BD; 4033 } 4034 4035 /* 4036 * Returns true if there's a pending #DB exception that is lower priority than 4037 * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by 4038 * KVM, but could theoretically be injected by userspace. Note, this code is 4039 * imperfect, see above. 4040 */ 4041 static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) 4042 { 4043 return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; 4044 } 4045 4046 /* 4047 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 4048 * recognized #DB (data or single-step) that has yet to be delivered. 
Since KVM 4049 * represents these debug traps with a payload that is said to be compatible 4050 * with the 'pending debug exceptions' field, write the payload to the VMCS 4051 * field if a VM-exit is delivered before the debug trap. 4052 */ 4053 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 4054 { 4055 unsigned long pending_dbg; 4056 4057 pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); 4058 if (pending_dbg) 4059 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); 4060 } 4061 4062 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 4063 { 4064 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 4065 to_vmx(vcpu)->nested.preemption_timer_expired; 4066 } 4067 4068 static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection) 4069 { 4070 struct vcpu_vmx *vmx = to_vmx(vcpu); 4071 void *vapic = vmx->nested.virtual_apic_map.hva; 4072 int max_irr, vppr; 4073 4074 if (nested_vmx_preemption_timer_pending(vcpu) || 4075 vmx->nested.mtf_pending) 4076 return true; 4077 4078 /* 4079 * Virtual Interrupt Delivery doesn't require manual injection. Either 4080 * the interrupt is already in GUEST_RVI and will be recognized by CPU 4081 * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move 4082 * the interrupt from the PIR to RVI prior to entering the guest. 4083 */ 4084 if (for_injection) 4085 return false; 4086 4087 if (!nested_cpu_has_vid(get_vmcs12(vcpu)) || 4088 __vmx_interrupt_blocked(vcpu)) 4089 return false; 4090 4091 if (!vapic) 4092 return false; 4093 4094 vppr = *((u32 *)(vapic + APIC_PROCPRI)); 4095 4096 max_irr = vmx_get_rvi(); 4097 if ((max_irr & 0xf0) > (vppr & 0xf0)) 4098 return true; 4099 4100 if (vmx->nested.pi_pending && vmx->nested.pi_desc && 4101 pi_test_on(vmx->nested.pi_desc)) { 4102 max_irr = pi_find_highest_vector(vmx->nested.pi_desc); 4103 if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0)) 4104 return true; 4105 } 4106 4107 return false; 4108 } 4109 4110 /* 4111 * Per the Intel SDM's table "Priority Among Concurrent Events", with minor 4112 * edits to fill in missing examples, e.g. #DB due to split-lock accesses, 4113 * and less minor edits to splice in the priority of VMX Non-Root specific 4114 * events, e.g. MTF and NMI/INTR-window exiting. 
 *
 * 1 Hardware Reset and Machine Checks
 *	- RESET
 *	- Machine Check
 *
 * 2 Trap on Task Switch
 *	- T flag in TSS is set (on task switch)
 *
 * 3 External Hardware Interventions
 *	- FLUSH
 *	- STOPCLK
 *	- SMI
 *	- INIT
 *
 * 3.5 Monitor Trap Flag (MTF) VM-exit[1]
 *
 * 4 Traps on Previous Instruction
 *	- Breakpoints
 *	- Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O
 *	  breakpoint, or #DB due to a split-lock access)
 *
 * 4.3 VMX-preemption timer expired VM-exit[2]
 *
 * 4.6 NMI-window exiting VM-exit[3]
 *
 * 5 Nonmaskable Interrupts (NMI)
 *
 * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery[4]
 *
 * 6 Maskable Hardware Interrupts
 *
 * 7 Code Breakpoint Fault
 *
 * 8 Faults from Fetching Next Instruction
 *	- Code-Segment Limit Violation
 *	- Code Page Fault
 *	- Control protection exception (missing ENDBRANCH at target of indirect
 *	  call or jump)
 *
 * 9 Faults from Decoding Next Instruction
 *	- Instruction length > 15 bytes
 *	- Invalid Opcode
 *	- Coprocessor Not Available
 *
 *10 Faults on Executing Instruction
 *	- Overflow
 *	- Bound error
 *	- Invalid TSS
 *	- Segment Not Present
 *	- Stack fault
 *	- General Protection
 *	- Data Page Fault
 *	- Alignment Check
 *	- x86 FPU Floating-point exception
 *	- SIMD floating-point exception
 *	- Virtualization exception
 *	- Control protection exception
 *
 * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs),
 *     INIT signals, and higher priority events take priority over MTF VM exits.
 *     MTF VM exits take priority over debug-trap exceptions and lower priority
 *     events.
 *
 * [2] Debug-trap exceptions and higher priority events take priority over VM
 *     exits caused by the VMX-preemption timer. VM exits caused by the
 *     VMX-preemption timer take priority over VM exits caused by the
 *     "NMI-window exiting" VM-execution control and lower priority events.
 *
 * [3] Debug-trap exceptions and higher priority events take priority over VM
 *     exits caused by "NMI-window exiting". VM exits caused by this control
 *     take priority over non-maskable interrupts (NMIs) and lower priority
 *     events.
 *
 * [4] Virtual-interrupt delivery has the same priority as that of VM exits due
 *     to the 1-setting of the "interrupt-window exiting" VM-execution control.
 *     Thus, non-maskable interrupts (NMIs) and higher priority events take
 *     priority over delivery of a virtual interrupt; delivery of a virtual
 *     interrupt takes priority over external interrupts and lower priority
 *     events.
 */
static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
{
	struct kvm_lapic *apic = vcpu->arch.apic;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	/*
	 * Only a pending nested run blocks a pending exception. If there is a
	 * previously injected event, the pending exception occurred while said
	 * event was being delivered and thus needs to be handled.
	 */
	bool block_nested_exceptions = vmx->nested.nested_run_pending;
	/*
	 * Events that don't require injection, i.e. that are virtualized by
	 * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need
	 * to regain control in order to deliver the event, and hardware will
	 * handle event ordering, e.g.
with respect to injected exceptions. 4208 * 4209 * But, new events (not exceptions) are only recognized at instruction 4210 * boundaries. If an event needs reinjection, then KVM is handling a 4211 * VM-Exit that occurred _during_ instruction execution; new events, 4212 * irrespective of whether or not they're injected, are blocked until 4213 * the instruction completes. 4214 */ 4215 bool block_non_injected_events = kvm_event_needs_reinjection(vcpu); 4216 /* 4217 * Inject events are blocked by nested VM-Enter, as KVM is responsible 4218 * for managing priority between concurrent events, i.e. KVM needs to 4219 * wait until after VM-Enter completes to deliver injected events. 4220 */ 4221 bool block_nested_events = block_nested_exceptions || 4222 block_non_injected_events; 4223 4224 if (lapic_in_kernel(vcpu) && 4225 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 4226 if (block_nested_events) 4227 return -EBUSY; 4228 nested_vmx_update_pending_dbg(vcpu); 4229 clear_bit(KVM_APIC_INIT, &apic->pending_events); 4230 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 4231 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 4232 4233 /* MTF is discarded if the vCPU is in WFS. */ 4234 vmx->nested.mtf_pending = false; 4235 return 0; 4236 } 4237 4238 if (lapic_in_kernel(vcpu) && 4239 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 4240 if (block_nested_events) 4241 return -EBUSY; 4242 4243 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 4244 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 4245 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 4246 apic->sipi_vector & 0xFFUL); 4247 return 0; 4248 } 4249 /* Fallthrough, the SIPI is completely ignored. */ 4250 } 4251 4252 /* 4253 * Process exceptions that are higher priority than Monitor Trap Flag: 4254 * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but 4255 * could theoretically come in from userspace), and ICEBP (INT1). 4256 * 4257 * TODO: SMIs have higher priority than MTF and trap-like #DBs (except 4258 * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF 4259 * across SMI/RSM as it should; that needs to be addressed in order to 4260 * prioritize SMI over MTF and trap-like #DBs. 
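	 * The !vmx_is_low_priority_db_trap() check below filters out trap-like
	 * #DBs, which have lower priority than MTF and are therefore handled
	 * only after a potential MTF VM-Exit has been synthesized.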
4261 */ 4262 if (vcpu->arch.exception_vmexit.pending && 4263 !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { 4264 if (block_nested_exceptions) 4265 return -EBUSY; 4266 4267 nested_vmx_inject_exception_vmexit(vcpu); 4268 return 0; 4269 } 4270 4271 if (vcpu->arch.exception.pending && 4272 !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) { 4273 if (block_nested_exceptions) 4274 return -EBUSY; 4275 goto no_vmexit; 4276 } 4277 4278 if (vmx->nested.mtf_pending) { 4279 if (block_nested_events) 4280 return -EBUSY; 4281 nested_vmx_update_pending_dbg(vcpu); 4282 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 4283 return 0; 4284 } 4285 4286 if (vcpu->arch.exception_vmexit.pending) { 4287 if (block_nested_exceptions) 4288 return -EBUSY; 4289 4290 nested_vmx_inject_exception_vmexit(vcpu); 4291 return 0; 4292 } 4293 4294 if (vcpu->arch.exception.pending) { 4295 if (block_nested_exceptions) 4296 return -EBUSY; 4297 goto no_vmexit; 4298 } 4299 4300 if (nested_vmx_preemption_timer_pending(vcpu)) { 4301 if (block_nested_events) 4302 return -EBUSY; 4303 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 4304 return 0; 4305 } 4306 4307 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 4308 if (block_nested_events) 4309 return -EBUSY; 4310 goto no_vmexit; 4311 } 4312 4313 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 4314 if (block_nested_events) 4315 return -EBUSY; 4316 if (!nested_exit_on_nmi(vcpu)) 4317 goto no_vmexit; 4318 4319 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 4320 NMI_VECTOR | INTR_TYPE_NMI_INTR | 4321 INTR_INFO_VALID_MASK, 0); 4322 /* 4323 * The NMI-triggered VM exit counts as injection: 4324 * clear this one and block further NMIs. 4325 */ 4326 vcpu->arch.nmi_pending = 0; 4327 vmx_set_nmi_mask(vcpu, true); 4328 return 0; 4329 } 4330 4331 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 4332 int irq; 4333 4334 if (!nested_exit_on_intr(vcpu)) { 4335 if (block_nested_events) 4336 return -EBUSY; 4337 4338 goto no_vmexit; 4339 } 4340 4341 if (!nested_exit_intr_ack_set(vcpu)) { 4342 if (block_nested_events) 4343 return -EBUSY; 4344 4345 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 4346 return 0; 4347 } 4348 4349 irq = kvm_cpu_get_extint(vcpu); 4350 if (irq != -1) { 4351 if (block_nested_events) 4352 return -EBUSY; 4353 4354 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 4355 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); 4356 return 0; 4357 } 4358 4359 irq = kvm_apic_has_interrupt(vcpu); 4360 if (WARN_ON_ONCE(irq < 0)) 4361 goto no_vmexit; 4362 4363 /* 4364 * If the IRQ is L2's PI notification vector, process posted 4365 * interrupts for L2 instead of injecting VM-Exit, as the 4366 * detection/morphing architecturally occurs when the IRQ is 4367 * delivered to the CPU. Note, only interrupts that are routed 4368 * through the local APIC trigger posted interrupt processing, 4369 * and enabling posted interrupts requires ACK-on-exit. 4370 */ 4371 if (irq == vmx->nested.posted_intr_nv) { 4372 /* 4373 * Nested posted interrupts are delivered via RVI, i.e. 4374 * aren't injected by KVM, and so can be queued even if 4375 * manual event injection is disallowed. 
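			 * The IRR bit is cleared and the vector is instead
			 * tracked via pi_pending so that it is delivered to L2
			 * as a posted interrupt, see
			 * vmx_complete_nested_posted_interrupt().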
4376 */ 4377 if (block_non_injected_events) 4378 return -EBUSY; 4379 4380 vmx->nested.pi_pending = true; 4381 kvm_apic_clear_irr(vcpu, irq); 4382 goto no_vmexit; 4383 } 4384 4385 if (block_nested_events) 4386 return -EBUSY; 4387 4388 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 4389 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); 4390 4391 /* 4392 * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must 4393 * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI 4394 * if APICv is active. 4395 */ 4396 kvm_apic_ack_interrupt(vcpu, irq); 4397 return 0; 4398 } 4399 4400 no_vmexit: 4401 return vmx_complete_nested_posted_interrupt(vcpu); 4402 } 4403 4404 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 4405 { 4406 ktime_t remaining = 4407 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 4408 u64 value; 4409 4410 if (ktime_to_ns(remaining) <= 0) 4411 return 0; 4412 4413 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 4414 do_div(value, 1000000); 4415 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 4416 } 4417 4418 static bool is_vmcs12_ext_field(unsigned long field) 4419 { 4420 switch (field) { 4421 case GUEST_ES_SELECTOR: 4422 case GUEST_CS_SELECTOR: 4423 case GUEST_SS_SELECTOR: 4424 case GUEST_DS_SELECTOR: 4425 case GUEST_FS_SELECTOR: 4426 case GUEST_GS_SELECTOR: 4427 case GUEST_LDTR_SELECTOR: 4428 case GUEST_TR_SELECTOR: 4429 case GUEST_ES_LIMIT: 4430 case GUEST_CS_LIMIT: 4431 case GUEST_SS_LIMIT: 4432 case GUEST_DS_LIMIT: 4433 case GUEST_FS_LIMIT: 4434 case GUEST_GS_LIMIT: 4435 case GUEST_LDTR_LIMIT: 4436 case GUEST_TR_LIMIT: 4437 case GUEST_GDTR_LIMIT: 4438 case GUEST_IDTR_LIMIT: 4439 case GUEST_ES_AR_BYTES: 4440 case GUEST_DS_AR_BYTES: 4441 case GUEST_FS_AR_BYTES: 4442 case GUEST_GS_AR_BYTES: 4443 case GUEST_LDTR_AR_BYTES: 4444 case GUEST_TR_AR_BYTES: 4445 case GUEST_ES_BASE: 4446 case GUEST_CS_BASE: 4447 case GUEST_SS_BASE: 4448 case GUEST_DS_BASE: 4449 case GUEST_FS_BASE: 4450 case GUEST_GS_BASE: 4451 case GUEST_LDTR_BASE: 4452 case GUEST_TR_BASE: 4453 case GUEST_GDTR_BASE: 4454 case GUEST_IDTR_BASE: 4455 case GUEST_PENDING_DBG_EXCEPTIONS: 4456 case GUEST_BNDCFGS: 4457 return true; 4458 default: 4459 break; 4460 } 4461 4462 return false; 4463 } 4464 4465 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4466 struct vmcs12 *vmcs12) 4467 { 4468 struct vcpu_vmx *vmx = to_vmx(vcpu); 4469 4470 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 4471 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 4472 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 4473 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 4474 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 4475 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 4476 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 4477 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 4478 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 4479 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 4480 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 4481 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 4482 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 4483 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 4484 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 4485 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 4486 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 4487 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 4488 
vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 4489 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 4490 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 4491 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 4492 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 4493 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 4494 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 4495 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 4496 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 4497 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 4498 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 4499 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 4500 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 4501 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 4502 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 4503 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 4504 vmcs12->guest_pending_dbg_exceptions = 4505 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 4506 4507 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4508 } 4509 4510 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4511 struct vmcs12 *vmcs12) 4512 { 4513 struct vcpu_vmx *vmx = to_vmx(vcpu); 4514 int cpu; 4515 4516 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4517 return; 4518 4519 4520 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4521 4522 cpu = get_cpu(); 4523 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4524 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); 4525 4526 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4527 4528 vmx->loaded_vmcs = &vmx->vmcs01; 4529 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); 4530 put_cpu(); 4531 } 4532 4533 /* 4534 * Update the guest state fields of vmcs12 to reflect changes that 4535 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4536 * VM-entry controls is also updated, since this is really a guest 4537 * state bit.) 4538 */ 4539 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4540 { 4541 struct vcpu_vmx *vmx = to_vmx(vcpu); 4542 4543 if (nested_vmx_is_evmptr12_valid(vmx)) 4544 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4545 4546 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4547 !nested_vmx_is_evmptr12_valid(vmx); 4548 4549 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4550 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4551 4552 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4553 vmcs12->guest_rip = kvm_rip_read(vcpu); 4554 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4555 4556 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4557 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4558 4559 vmcs12->guest_interruptibility_info = 4560 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4561 4562 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4563 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4564 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4565 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4566 else 4567 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4568 4569 if (nested_cpu_has_preemption_timer(vmcs12) && 4570 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4571 !vmx->nested.nested_run_pending) 4572 vmcs12->vmx_preemption_timer_value = 4573 vmx_get_preemption_timer_value(vcpu); 4574 4575 /* 4576 * In some cases (usually, nested EPT), L2 is allowed to change its 4577 * own CR3 without exiting. If it has changed it, we must keep it. 
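	 * When L0 is using EPT, vmcs02.GUEST_CR3 holds L2's actual CR3 (L0
	 * does not shadow it), so it can be copied back to vmcs12 verbatim.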
4578 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4579 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4580 * 4581 * Additionally, restore L2's PDPTR to vmcs12. 4582 */ 4583 if (enable_ept) { 4584 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4585 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4586 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4587 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4588 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4589 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4590 } 4591 } 4592 4593 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4594 4595 if (nested_cpu_has_vid(vmcs12)) 4596 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4597 4598 vmcs12->vm_entry_controls = 4599 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4600 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4601 4602 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4603 vmcs12->guest_dr7 = vcpu->arch.dr7; 4604 4605 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4606 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4607 } 4608 4609 /* 4610 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4611 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4612 * and this function updates it to reflect the changes to the guest state while 4613 * L2 was running (and perhaps made some exits which were handled directly by L0 4614 * without going back to L1), and to reflect the exit reason. 4615 * Note that we do not have to copy here all VMCS fields, just those that 4616 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4617 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4618 * which already writes to vmcs12 directly. 4619 */ 4620 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4621 u32 vm_exit_reason, u32 exit_intr_info, 4622 unsigned long exit_qualification, u32 exit_insn_len) 4623 { 4624 /* update exit information fields: */ 4625 vmcs12->vm_exit_reason = vm_exit_reason; 4626 if (to_vmx(vcpu)->exit_reason.enclave_mode) 4627 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4628 vmcs12->exit_qualification = exit_qualification; 4629 4630 /* 4631 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched 4632 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other 4633 * exit info fields are unmodified. 4634 */ 4635 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4636 vmcs12->launch_state = 1; 4637 4638 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4639 * instead of reading the real value. */ 4640 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4641 4642 /* 4643 * Transfer the event that L0 or L1 may wanted to inject into 4644 * L2 to IDT_VECTORING_INFO_FIELD. 4645 */ 4646 vmcs12_save_pending_event(vcpu, vmcs12, 4647 vm_exit_reason, exit_intr_info); 4648 4649 vmcs12->vm_exit_intr_info = exit_intr_info; 4650 vmcs12->vm_exit_instruction_len = exit_insn_len; 4651 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4652 4653 /* 4654 * According to spec, there's no need to store the guest's 4655 * MSRs if the exit is due to a VM-entry failure that occurs 4656 * during or after loading the guest state. Since this exit 4657 * does not fall in that category, we need to save the MSRs. 
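		 * Note that a failure to store the MSRs triggers a VMX abort
		 * (below); there is no architected way to report the failure
		 * to L1 at this point of the VM-Exit.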
4658 */ 4659 if (nested_vmx_store_msr(vcpu, 4660 vmcs12->vm_exit_msr_store_addr, 4661 vmcs12->vm_exit_msr_store_count)) 4662 nested_vmx_abort(vcpu, 4663 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4664 } 4665 } 4666 4667 /* 4668 * A part of what we need to when the nested L2 guest exits and we want to 4669 * run its L1 parent, is to reset L1's guest state to the host state specified 4670 * in vmcs12. 4671 * This function is to be called not only on normal nested exit, but also on 4672 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4673 * Failures During or After Loading Guest State"). 4674 * This function should be called when the active VMCS is L1's (vmcs01). 4675 */ 4676 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4677 struct vmcs12 *vmcs12) 4678 { 4679 enum vm_entry_failure_code ignored; 4680 struct kvm_segment seg; 4681 4682 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4683 vcpu->arch.efer = vmcs12->host_ia32_efer; 4684 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4685 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4686 else 4687 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4688 vmx_set_efer(vcpu, vcpu->arch.efer); 4689 4690 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4691 kvm_rip_write(vcpu, vmcs12->host_rip); 4692 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4693 vmx_set_interrupt_shadow(vcpu, 0); 4694 4695 /* 4696 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4697 * actually changed, because vmx_set_cr0 refers to efer set above. 4698 * 4699 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4700 * (KVM doesn't change it); 4701 */ 4702 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4703 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4704 4705 /* Same as above - no reason to call set_cr4_guest_host_mask(). */ 4706 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4707 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4708 4709 nested_ept_uninit_mmu_context(vcpu); 4710 4711 /* 4712 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4713 * couldn't have changed. 4714 */ 4715 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4716 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4717 4718 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4719 4720 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4721 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4722 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4723 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4724 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4725 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4726 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4727 4728 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. 
*/ 4729 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4730 vmcs_write64(GUEST_BNDCFGS, 0); 4731 4732 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4733 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4734 vcpu->arch.pat = vmcs12->host_ia32_pat; 4735 } 4736 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 4737 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) 4738 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4739 vmcs12->host_ia32_perf_global_ctrl)); 4740 4741 /* Set L1 segment info according to Intel SDM 4742 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4743 seg = (struct kvm_segment) { 4744 .base = 0, 4745 .limit = 0xFFFFFFFF, 4746 .selector = vmcs12->host_cs_selector, 4747 .type = 11, 4748 .present = 1, 4749 .s = 1, 4750 .g = 1 4751 }; 4752 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4753 seg.l = 1; 4754 else 4755 seg.db = 1; 4756 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4757 seg = (struct kvm_segment) { 4758 .base = 0, 4759 .limit = 0xFFFFFFFF, 4760 .type = 3, 4761 .present = 1, 4762 .s = 1, 4763 .db = 1, 4764 .g = 1 4765 }; 4766 seg.selector = vmcs12->host_ds_selector; 4767 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4768 seg.selector = vmcs12->host_es_selector; 4769 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4770 seg.selector = vmcs12->host_ss_selector; 4771 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4772 seg.selector = vmcs12->host_fs_selector; 4773 seg.base = vmcs12->host_fs_base; 4774 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4775 seg.selector = vmcs12->host_gs_selector; 4776 seg.base = vmcs12->host_gs_base; 4777 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4778 seg = (struct kvm_segment) { 4779 .base = vmcs12->host_tr_base, 4780 .limit = 0x67, 4781 .selector = vmcs12->host_tr_selector, 4782 .type = 11, 4783 .present = 1 4784 }; 4785 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4786 4787 memset(&seg, 0, sizeof(seg)); 4788 seg.unusable = 1; 4789 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4790 4791 kvm_set_dr(vcpu, 7, 0x400); 4792 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4793 4794 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4795 vmcs12->vm_exit_msr_load_count)) 4796 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4797 4798 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4799 } 4800 4801 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4802 { 4803 struct vmx_uret_msr *efer_msr; 4804 unsigned int i; 4805 4806 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4807 return vmcs_read64(GUEST_IA32_EFER); 4808 4809 if (cpu_has_load_ia32_efer()) 4810 return kvm_host.efer; 4811 4812 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4813 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4814 return vmx->msr_autoload.guest.val[i].value; 4815 } 4816 4817 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4818 if (efer_msr) 4819 return efer_msr->data; 4820 4821 return kvm_host.efer; 4822 } 4823 4824 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4825 { 4826 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4827 struct vcpu_vmx *vmx = to_vmx(vcpu); 4828 struct vmx_msr_entry g, h; 4829 gpa_t gpa; 4830 u32 i, j; 4831 4832 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4833 4834 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4835 /* 4836 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4837 * as vmcs01.GUEST_DR7 contains a userspace defined value 4838 * and vcpu->arch.dr7 is not squirreled away 
before the 4839 * nested VMENTER (not worth adding a variable in nested_vmx). 4840 */ 4841 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4842 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4843 else 4844 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4845 } 4846 4847 /* 4848 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4849 * handle a variety of side effects to KVM's software model. 4850 */ 4851 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4852 4853 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4854 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4855 4856 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4857 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4858 4859 nested_ept_uninit_mmu_context(vcpu); 4860 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4861 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4862 4863 /* 4864 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4865 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4866 * VMFail, like everything else we just need to ensure our 4867 * software model is up-to-date. 4868 */ 4869 if (enable_ept && is_pae_paging(vcpu)) 4870 ept_save_pdptrs(vcpu); 4871 4872 kvm_mmu_reset_context(vcpu); 4873 4874 /* 4875 * This nasty bit of open coding is a compromise between blindly 4876 * loading L1's MSRs using the exit load lists (incorrect emulation 4877 * of VMFail), leaving the nested VM's MSRs in the software model 4878 * (incorrect behavior) and snapshotting the modified MSRs (too 4879 * expensive since the lists are unbound by hardware). For each 4880 * MSR that was (prematurely) loaded from the nested VMEntry load 4881 * list, reload it from the exit load list if it exists and differs 4882 * from the guest value. The intent is to stuff host state as 4883 * silently as possible, not to fully process the exit load list. 4884 */ 4885 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4886 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4887 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4888 pr_debug_ratelimited( 4889 "%s read MSR index failed (%u, 0x%08llx)\n", 4890 __func__, i, gpa); 4891 goto vmabort; 4892 } 4893 4894 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4895 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4896 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4897 pr_debug_ratelimited( 4898 "%s read MSR failed (%u, 0x%08llx)\n", 4899 __func__, j, gpa); 4900 goto vmabort; 4901 } 4902 if (h.index != g.index) 4903 continue; 4904 if (h.value == g.value) 4905 break; 4906 4907 if (nested_vmx_load_msr_check(vcpu, &h)) { 4908 pr_debug_ratelimited( 4909 "%s check failed (%u, 0x%x, 0x%x)\n", 4910 __func__, j, h.index, h.reserved); 4911 goto vmabort; 4912 } 4913 4914 if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) { 4915 pr_debug_ratelimited( 4916 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4917 __func__, j, h.index, h.value); 4918 goto vmabort; 4919 } 4920 } 4921 } 4922 4923 return; 4924 4925 vmabort: 4926 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4927 } 4928 4929 /* 4930 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4931 * and modify vmcs12 to make it see what it would expect to see there if 4932 * L2 was its real guest. 
Must only be called when in L2 (is_guest_mode()) 4933 */ 4934 void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 4935 u32 exit_intr_info, unsigned long exit_qualification, 4936 u32 exit_insn_len) 4937 { 4938 struct vcpu_vmx *vmx = to_vmx(vcpu); 4939 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4940 4941 /* Pending MTF traps are discarded on VM-Exit. */ 4942 vmx->nested.mtf_pending = false; 4943 4944 /* trying to cancel vmlaunch/vmresume is a bug */ 4945 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4946 4947 #ifdef CONFIG_KVM_HYPERV 4948 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 4949 /* 4950 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 4951 * Enlightened VMCS after migration and we still need to 4952 * do that when something is forcing L2->L1 exit prior to 4953 * the first L2 run. 4954 */ 4955 (void)nested_get_evmcs_page(vcpu); 4956 } 4957 #endif 4958 4959 /* Service pending TLB flush requests for L2 before switching to L1. */ 4960 kvm_service_local_tlb_flush_requests(vcpu); 4961 4962 /* 4963 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 4964 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 4965 * up-to-date before switching to L1. 4966 */ 4967 if (enable_ept && is_pae_paging(vcpu)) 4968 vmx_ept_load_pdptrs(vcpu); 4969 4970 leave_guest_mode(vcpu); 4971 4972 if (nested_cpu_has_preemption_timer(vmcs12)) 4973 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4974 4975 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 4976 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 4977 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 4978 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 4979 } 4980 4981 if (likely(!vmx->fail)) { 4982 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4983 4984 if (vm_exit_reason != -1) 4985 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 4986 exit_intr_info, exit_qualification, 4987 exit_insn_len); 4988 4989 /* 4990 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4991 * also be used to capture vmcs12 cache as part of 4992 * capturing nVMX state for snapshot (migration). 4993 * 4994 * Otherwise, this flush will dirty guest memory at a 4995 * point it is already assumed by user-space to be 4996 * immutable. 4997 */ 4998 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4999 } else { 5000 /* 5001 * The only expected VM-instruction error is "VM entry with 5002 * invalid control field(s)." Anything else indicates a 5003 * problem with L0. And we should never get here with a 5004 * VMFail of any type if early consistency checks are enabled. 5005 */ 5006 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 5007 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 5008 WARN_ON_ONCE(nested_early_check); 5009 } 5010 5011 /* 5012 * Drop events/exceptions that were queued for re-injection to L2 5013 * (picked up via vmx_complete_interrupts()), as well as exceptions 5014 * that were pending for L2. Note, this must NOT be hoisted above 5015 * prepare_vmcs12(), events/exceptions queued for re-injection need to 5016 * be captured in vmcs12 (see vmcs12_save_pending_event()). 5017 */ 5018 vcpu->arch.nmi_injected = false; 5019 kvm_clear_exception_queue(vcpu); 5020 kvm_clear_interrupt_queue(vcpu); 5021 5022 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 5023 5024 /* 5025 * If IBRS is advertised to the vCPU, KVM must flush the indirect 5026 * branch predictors when transitioning from L2 to L1, as L1 expects 5027 * hardware (KVM in this case) to provide separate predictor modes. 
5028 * Bare metal isolates VMX root (host) from VMX non-root (guest), but 5029 * doesn't isolate different VMCSs, i.e. in this case, doesn't provide 5030 * separate modes for L2 vs L1. 5031 */ 5032 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL)) 5033 indirect_branch_prediction_barrier(); 5034 5035 /* Update any VMCS fields that might have changed while L2 ran */ 5036 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 5037 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 5038 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 5039 if (kvm_caps.has_tsc_control) 5040 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 5041 5042 if (vmx->nested.l1_tpr_threshold != -1) 5043 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 5044 5045 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 5046 vmx->nested.change_vmcs01_virtual_apic_mode = false; 5047 vmx_set_virtual_apic_mode(vcpu); 5048 } 5049 5050 if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 5051 vmx->nested.update_vmcs01_cpu_dirty_logging = false; 5052 vmx_update_cpu_dirty_logging(vcpu); 5053 } 5054 5055 nested_put_vmcs12_pages(vcpu); 5056 5057 if (vmx->nested.reload_vmcs01_apic_access_page) { 5058 vmx->nested.reload_vmcs01_apic_access_page = false; 5059 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 5060 } 5061 5062 if (vmx->nested.update_vmcs01_apicv_status) { 5063 vmx->nested.update_vmcs01_apicv_status = false; 5064 kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); 5065 } 5066 5067 if (vmx->nested.update_vmcs01_hwapic_isr) { 5068 vmx->nested.update_vmcs01_hwapic_isr = false; 5069 kvm_apic_update_hwapic_isr(vcpu); 5070 } 5071 5072 if ((vm_exit_reason != -1) && 5073 (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) 5074 vmx->nested.need_vmcs12_to_shadow_sync = true; 5075 5076 /* in case we halted in L2 */ 5077 kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); 5078 5079 if (likely(!vmx->fail)) { 5080 if (vm_exit_reason != -1) 5081 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 5082 vmcs12->exit_qualification, 5083 vmcs12->idt_vectoring_info_field, 5084 vmcs12->vm_exit_intr_info, 5085 vmcs12->vm_exit_intr_error_code, 5086 KVM_ISA_VMX); 5087 5088 load_vmcs12_host_state(vcpu, vmcs12); 5089 5090 /* 5091 * Process events if an injectable IRQ or NMI is pending, even 5092 * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). 5093 * If an event became pending while L2 was active, KVM needs to 5094 * either inject the event or request an IRQ/NMI window. SMIs 5095 * don't need to be processed as SMM is mutually exclusive with 5096 * non-root mode. INIT/SIPI don't need to be checked as INIT 5097 * is blocked post-VMXON, and SIPIs are ignored. 5098 */ 5099 if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending) 5100 kvm_make_request(KVM_REQ_EVENT, vcpu); 5101 return; 5102 } 5103 5104 /* 5105 * After an early L2 VM-entry failure, we're now back 5106 * in L1 which thinks it just finished a VMLAUNCH or 5107 * VMRESUME instruction, so we need to set the failure 5108 * flag and the VM-instruction error field of the VMCS 5109 * accordingly, and skip the emulated instruction. 5110 */ 5111 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 5112 5113 /* 5114 * Restore L1's host state to KVM's software model. We're here 5115 * because a consistency check was caught by hardware, which 5116 * means some amount of guest state has been propagated to KVM's 5117 * model and needs to be unwound to the host's state. 
5118 */ 5119 nested_vmx_restore_host_state(vcpu); 5120 5121 vmx->fail = 0; 5122 } 5123 5124 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 5125 { 5126 kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); 5127 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 5128 } 5129 5130 /* 5131 * Decode the memory-address operand of a vmx instruction, as recorded on an 5132 * exit caused by such an instruction (run by a guest hypervisor). 5133 * On success, returns 0. When the operand is invalid, returns 1 and throws 5134 * #UD, #GP, or #SS. 5135 */ 5136 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 5137 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 5138 { 5139 gva_t off; 5140 bool exn; 5141 struct kvm_segment s; 5142 5143 /* 5144 * According to Vol. 3B, "Information for VM Exits Due to Instruction 5145 * Execution", on an exit, vmx_instruction_info holds most of the 5146 * addressing components of the operand. Only the displacement part 5147 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 5148 * For how an actual address is calculated from all these components, 5149 * refer to Vol. 1, "Operand Addressing". 5150 */ 5151 int scaling = vmx_instruction_info & 3; 5152 int addr_size = (vmx_instruction_info >> 7) & 7; 5153 bool is_reg = vmx_instruction_info & (1u << 10); 5154 int seg_reg = (vmx_instruction_info >> 15) & 7; 5155 int index_reg = (vmx_instruction_info >> 18) & 0xf; 5156 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 5157 int base_reg = (vmx_instruction_info >> 23) & 0xf; 5158 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 5159 5160 if (is_reg) { 5161 kvm_queue_exception(vcpu, UD_VECTOR); 5162 return 1; 5163 } 5164 5165 /* Addr = segment_base + offset */ 5166 /* offset = base + [index * scale] + displacement */ 5167 off = exit_qualification; /* holds the displacement */ 5168 if (addr_size == 1) 5169 off = (gva_t)sign_extend64(off, 31); 5170 else if (addr_size == 0) 5171 off = (gva_t)sign_extend64(off, 15); 5172 if (base_is_valid) 5173 off += kvm_register_read(vcpu, base_reg); 5174 if (index_is_valid) 5175 off += kvm_register_read(vcpu, index_reg) << scaling; 5176 vmx_get_segment(vcpu, &s, seg_reg); 5177 5178 /* 5179 * The effective address, i.e. @off, of a memory operand is truncated 5180 * based on the address size of the instruction. Note that this is 5181 * the *effective address*, i.e. the address prior to accounting for 5182 * the segment's base. 5183 */ 5184 if (addr_size == 1) /* 32 bit */ 5185 off &= 0xffffffff; 5186 else if (addr_size == 0) /* 16 bit */ 5187 off &= 0xffff; 5188 5189 /* Checks for #GP/#SS exceptions. */ 5190 exn = false; 5191 if (is_long_mode(vcpu)) { 5192 /* 5193 * The virtual/linear address is never truncated in 64-bit 5194 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 5195 * address when using FS/GS with a non-zero base. 5196 */ 5197 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 5198 *ret = s.base + off; 5199 else 5200 *ret = off; 5201 5202 *ret = vmx_get_untagged_addr(vcpu, *ret, 0); 5203 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 5204 * non-canonical form. This is the only check on the memory 5205 * destination for long mode! 5206 */ 5207 exn = is_noncanonical_address(*ret, vcpu, 0); 5208 } else { 5209 /* 5210 * When not in long mode, the virtual/linear address is 5211 * unconditionally truncated to 32 bits regardless of the 5212 * address size. 
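* As a small worked illustration of the truncation below (the numbers are hypothetical, chosen only for this example): with a 32-bit address size, s.base = 0xffff0000 and off = 0x20000 give (0xffff0000 + 0x20000) & 0xffffffff = 0x00010000, i.e. the linear address silently wraps within 4 GiB rather than growing to a 33-bit value.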
5213 */ 5214 *ret = (s.base + off) & 0xffffffff; 5215 5216 /* Protected mode: apply checks for segment validity in the 5217 * following order: 5218 * - segment type check (#GP(0) may be thrown) 5219 * - usability check (#GP(0)/#SS(0)) 5220 * - limit check (#GP(0)/#SS(0)) 5221 */ 5222 if (wr) 5223 /* #GP(0) if the destination operand is located in a 5224 * read-only data segment or any code segment. 5225 */ 5226 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 5227 else 5228 /* #GP(0) if the source operand is located in an 5229 * execute-only code segment 5230 */ 5231 exn = ((s.type & 0xa) == 8); 5232 if (exn) { 5233 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 5234 return 1; 5235 } 5236 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 5237 */ 5238 exn = (s.unusable != 0); 5239 5240 /* 5241 * Protected mode: #GP(0)/#SS(0) if the memory operand is 5242 * outside the segment limit. All CPUs that support VMX ignore 5243 * limit checks for flat segments, i.e. segments with base==0, 5244 * limit==0xffffffff and of type expand-up data or code. 5245 */ 5246 if (!(s.base == 0 && s.limit == 0xffffffff && 5247 ((s.type & 8) || !(s.type & 4)))) 5248 exn = exn || ((u64)off + len - 1 > s.limit); 5249 } 5250 if (exn) { 5251 kvm_queue_exception_e(vcpu, 5252 seg_reg == VCPU_SREG_SS ? 5253 SS_VECTOR : GP_VECTOR, 5254 0); 5255 return 1; 5256 } 5257 5258 return 0; 5259 } 5260 5261 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 5262 int *ret) 5263 { 5264 gva_t gva; 5265 struct x86_exception e; 5266 int r; 5267 5268 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5269 vmcs_read32(VMX_INSTRUCTION_INFO), false, 5270 sizeof(*vmpointer), &gva)) { 5271 *ret = 1; 5272 return -EINVAL; 5273 } 5274 5275 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 5276 if (r != X86EMUL_CONTINUE) { 5277 *ret = kvm_handle_memory_failure(vcpu, r, &e); 5278 return -EINVAL; 5279 } 5280 5281 return 0; 5282 } 5283 5284 /* 5285 * Allocate a shadow VMCS and associate it with the currently loaded 5286 * VMCS, unless such a shadow VMCS already exists. The newly allocated 5287 * VMCS is also VMCLEARed, so that it is ready for use. 5288 */ 5289 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 5290 { 5291 struct vcpu_vmx *vmx = to_vmx(vcpu); 5292 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 5293 5294 /* 5295 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it 5296 * when L1 executes VMXOFF or the vCPU is forced out of nested 5297 * operation. VMXON faults if the CPU is already post-VMXON, so it 5298 * should be impossible to already have an allocated shadow VMCS. KVM 5299 * doesn't support virtualization of VMCS shadowing, so vmcs01 should 5300 * always be the loaded VMCS. 
5301 */ 5302 if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) 5303 return loaded_vmcs->shadow_vmcs; 5304 5305 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 5306 if (loaded_vmcs->shadow_vmcs) 5307 vmcs_clear(loaded_vmcs->shadow_vmcs); 5308 5309 return loaded_vmcs->shadow_vmcs; 5310 } 5311 5312 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 5313 { 5314 struct vcpu_vmx *vmx = to_vmx(vcpu); 5315 int r; 5316 5317 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 5318 if (r < 0) 5319 goto out_vmcs02; 5320 5321 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5322 if (!vmx->nested.cached_vmcs12) 5323 goto out_cached_vmcs12; 5324 5325 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; 5326 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5327 if (!vmx->nested.cached_shadow_vmcs12) 5328 goto out_cached_shadow_vmcs12; 5329 5330 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 5331 goto out_shadow_vmcs; 5332 5333 hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC, 5334 HRTIMER_MODE_ABS_PINNED); 5335 5336 vmx->nested.vpid02 = allocate_vpid(); 5337 5338 vmx->nested.vmcs02_initialized = false; 5339 vmx->nested.vmxon = true; 5340 5341 if (vmx_pt_mode_is_host_guest()) { 5342 vmx->pt_desc.guest.ctl = 0; 5343 pt_update_intercept_for_msr(vcpu); 5344 } 5345 5346 return 0; 5347 5348 out_shadow_vmcs: 5349 kfree(vmx->nested.cached_shadow_vmcs12); 5350 5351 out_cached_shadow_vmcs12: 5352 kfree(vmx->nested.cached_vmcs12); 5353 5354 out_cached_vmcs12: 5355 free_loaded_vmcs(&vmx->nested.vmcs02); 5356 5357 out_vmcs02: 5358 return -ENOMEM; 5359 } 5360 5361 /* Emulate the VMXON instruction. */ 5362 static int handle_vmxon(struct kvm_vcpu *vcpu) 5363 { 5364 int ret; 5365 gpa_t vmptr; 5366 uint32_t revision; 5367 struct vcpu_vmx *vmx = to_vmx(vcpu); 5368 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 5369 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 5370 5371 /* 5372 * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter 5373 * the guest and so cannot rely on hardware to perform the check, 5374 * which has higher priority than VM-Exit (see Intel SDM's pseudocode 5375 * for VMXON). 5376 * 5377 * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 5378 * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't 5379 * force any of the relevant guest state. For a restricted guest, KVM 5380 * does force CR0.PE=1, but only to also force VM86 in order to emulate 5381 * Real Mode, and so there's no need to check CR0.PE manually. 5382 */ 5383 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { 5384 kvm_queue_exception(vcpu, UD_VECTOR); 5385 return 1; 5386 } 5387 5388 /* 5389 * The CPL is checked for "not in VMX operation" and for "in VMX root", 5390 * and has higher priority than the VM-Fail due to being post-VMXON, 5391 * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, 5392 * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits 5393 * from L2 to L1, i.e. there's no need to check for the vCPU being in 5394 * VMX non-root. 5395 * 5396 * Forwarding the VM-Exit unconditionally, i.e. without performing the 5397 * #UD checks (see above), is functionally ok because KVM doesn't allow 5398 * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's 5399 * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are 5400 * missed by hardware due to shadowing CR0 and/or CR4. 
5401 */ 5402 if (vmx_get_cpl(vcpu)) { 5403 kvm_inject_gp(vcpu, 0); 5404 return 1; 5405 } 5406 5407 if (vmx->nested.vmxon) 5408 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 5409 5410 /* 5411 * Invalid CR0/CR4 generates #GP. These checks are performed if and 5412 * only if the vCPU isn't already in VMX operation, i.e. effectively 5413 * have lower priority than the VM-Fail above. 5414 */ 5415 if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || 5416 !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { 5417 kvm_inject_gp(vcpu, 0); 5418 return 1; 5419 } 5420 5421 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 5422 != VMXON_NEEDED_FEATURES) { 5423 kvm_inject_gp(vcpu, 0); 5424 return 1; 5425 } 5426 5427 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 5428 return ret; 5429 5430 /* 5431 * SDM 3: 24.11.5 5432 * The first 4 bytes of VMXON region contain the supported 5433 * VMCS revision identifier 5434 * 5435 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 5436 * which replaces physical address width with 32 5437 */ 5438 if (!page_address_valid(vcpu, vmptr)) 5439 return nested_vmx_failInvalid(vcpu); 5440 5441 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 5442 revision != VMCS12_REVISION) 5443 return nested_vmx_failInvalid(vcpu); 5444 5445 vmx->nested.vmxon_ptr = vmptr; 5446 ret = enter_vmx_operation(vcpu); 5447 if (ret) 5448 return ret; 5449 5450 return nested_vmx_succeed(vcpu); 5451 } 5452 5453 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 5454 { 5455 struct vcpu_vmx *vmx = to_vmx(vcpu); 5456 5457 if (vmx->nested.current_vmptr == INVALID_GPA) 5458 return; 5459 5460 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 5461 5462 if (enable_shadow_vmcs) { 5463 /* copy to memory all shadowed fields in case 5464 they were modified */ 5465 copy_shadow_to_vmcs12(vmx); 5466 vmx_disable_shadow_vmcs(vmx); 5467 } 5468 vmx->nested.posted_intr_nv = -1; 5469 5470 /* Flush VMCS12 to guest memory */ 5471 kvm_vcpu_write_guest_page(vcpu, 5472 vmx->nested.current_vmptr >> PAGE_SHIFT, 5473 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 5474 5475 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 5476 5477 vmx->nested.current_vmptr = INVALID_GPA; 5478 } 5479 5480 /* Emulate the VMXOFF instruction */ 5481 static int handle_vmxoff(struct kvm_vcpu *vcpu) 5482 { 5483 if (!nested_vmx_check_permission(vcpu)) 5484 return 1; 5485 5486 free_nested(vcpu); 5487 5488 if (kvm_apic_has_pending_init_or_sipi(vcpu)) 5489 kvm_make_request(KVM_REQ_EVENT, vcpu); 5490 5491 return nested_vmx_succeed(vcpu); 5492 } 5493 5494 /* Emulate the VMCLEAR instruction */ 5495 static int handle_vmclear(struct kvm_vcpu *vcpu) 5496 { 5497 struct vcpu_vmx *vmx = to_vmx(vcpu); 5498 u32 zero = 0; 5499 gpa_t vmptr; 5500 int r; 5501 5502 if (!nested_vmx_check_permission(vcpu)) 5503 return 1; 5504 5505 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5506 return r; 5507 5508 if (!page_address_valid(vcpu, vmptr)) 5509 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 5510 5511 if (vmptr == vmx->nested.vmxon_ptr) 5512 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 5513 5514 if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { 5515 if (vmptr == vmx->nested.current_vmptr) 5516 nested_release_vmcs12(vcpu); 5517 5518 /* 5519 * Silently ignore memory errors on VMCLEAR, Intel's pseudocode 5520 * for VMCLEAR includes a "ensure that data for VMCS referenced 5521 * by the operand is in memory" clause that guards writes to 5522 * memory, 
i.e. doing nothing for I/O is architecturally valid. 5523 * 5524 * FIXME: Suppress failures if and only if no memslot is found, 5525 * i.e. exit to userspace if __copy_to_user() fails. 5526 */ 5527 (void)kvm_vcpu_write_guest(vcpu, 5528 vmptr + offsetof(struct vmcs12, 5529 launch_state), 5530 &zero, sizeof(zero)); 5531 } 5532 5533 return nested_vmx_succeed(vcpu); 5534 } 5535 5536 /* Emulate the VMLAUNCH instruction */ 5537 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 5538 { 5539 return nested_vmx_run(vcpu, true); 5540 } 5541 5542 /* Emulate the VMRESUME instruction */ 5543 static int handle_vmresume(struct kvm_vcpu *vcpu) 5544 { 5545 5546 return nested_vmx_run(vcpu, false); 5547 } 5548 5549 static int handle_vmread(struct kvm_vcpu *vcpu) 5550 { 5551 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5552 : get_vmcs12(vcpu); 5553 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5554 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5555 struct vcpu_vmx *vmx = to_vmx(vcpu); 5556 struct x86_exception e; 5557 unsigned long field; 5558 u64 value; 5559 gva_t gva = 0; 5560 short offset; 5561 int len, r; 5562 5563 if (!nested_vmx_check_permission(vcpu)) 5564 return 1; 5565 5566 /* Decode instruction info and find the field to read */ 5567 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5568 5569 if (!nested_vmx_is_evmptr12_valid(vmx)) { 5570 /* 5571 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5572 * any VMREAD sets the ALU flags for VMfailInvalid. 5573 */ 5574 if (vmx->nested.current_vmptr == INVALID_GPA || 5575 (is_guest_mode(vcpu) && 5576 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5577 return nested_vmx_failInvalid(vcpu); 5578 5579 offset = get_vmcs12_field_offset(field); 5580 if (offset < 0) 5581 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5582 5583 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5584 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5585 5586 /* Read the field, zero-extended to a u64 value */ 5587 value = vmcs12_read_any(vmcs12, field, offset); 5588 } else { 5589 /* 5590 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an 5591 * enlightened VMCS is active VMREAD/VMWRITE instructions are 5592 * unsupported. Unfortunately, certain versions of Windows 11 5593 * don't comply with this requirement which is not enforced in 5594 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a 5595 * workaround, as misbehaving guests will panic on VM-Fail. 5596 * Note, enlightened VMCS is incompatible with shadow VMCS so 5597 * all VMREADs from L2 should go to L1. 5598 */ 5599 if (WARN_ON_ONCE(is_guest_mode(vcpu))) 5600 return nested_vmx_failInvalid(vcpu); 5601 5602 offset = evmcs_field_offset(field, NULL); 5603 if (offset < 0) 5604 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5605 5606 /* Read the field, zero-extended to a u64 value */ 5607 value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset); 5608 } 5609 5610 /* 5611 * Now copy part of this value to register or memory, as requested. 5612 * Note that the number of bits actually copied is 32 or 64 depending 5613 * on the guest's mode (32 or 64 bit), not on the given field's length. 5614 */ 5615 if (instr_info & BIT(10)) { 5616 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5617 } else { 5618 len = is_64_bit_mode(vcpu) ? 
8 : 4; 5619 if (get_vmx_mem_address(vcpu, exit_qualification, 5620 instr_info, true, len, &gva)) 5621 return 1; 5622 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5623 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5624 if (r != X86EMUL_CONTINUE) 5625 return kvm_handle_memory_failure(vcpu, r, &e); 5626 } 5627 5628 return nested_vmx_succeed(vcpu); 5629 } 5630 5631 static bool is_shadow_field_rw(unsigned long field) 5632 { 5633 switch (field) { 5634 #define SHADOW_FIELD_RW(x, y) case x: 5635 #include "vmcs_shadow_fields.h" 5636 return true; 5637 default: 5638 break; 5639 } 5640 return false; 5641 } 5642 5643 static bool is_shadow_field_ro(unsigned long field) 5644 { 5645 switch (field) { 5646 #define SHADOW_FIELD_RO(x, y) case x: 5647 #include "vmcs_shadow_fields.h" 5648 return true; 5649 default: 5650 break; 5651 } 5652 return false; 5653 } 5654 5655 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5656 { 5657 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5658 : get_vmcs12(vcpu); 5659 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5660 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5661 struct vcpu_vmx *vmx = to_vmx(vcpu); 5662 struct x86_exception e; 5663 unsigned long field; 5664 short offset; 5665 gva_t gva; 5666 int len, r; 5667 5668 /* 5669 * The value to write might be 32 or 64 bits, depending on L1's long 5670 * mode, and eventually we need to write that into a field of several 5671 * possible lengths. The code below first zero-extends the value to 64 5672 * bit (value), and then copies only the appropriate number of 5673 * bits into the vmcs12 field. 5674 */ 5675 u64 value = 0; 5676 5677 if (!nested_vmx_check_permission(vcpu)) 5678 return 1; 5679 5680 /* 5681 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5682 * any VMWRITE sets the ALU flags for VMfailInvalid. 5683 */ 5684 if (vmx->nested.current_vmptr == INVALID_GPA || 5685 (is_guest_mode(vcpu) && 5686 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5687 return nested_vmx_failInvalid(vcpu); 5688 5689 if (instr_info & BIT(10)) 5690 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5691 else { 5692 len = is_64_bit_mode(vcpu) ? 8 : 4; 5693 if (get_vmx_mem_address(vcpu, exit_qualification, 5694 instr_info, false, len, &gva)) 5695 return 1; 5696 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5697 if (r != X86EMUL_CONTINUE) 5698 return kvm_handle_memory_failure(vcpu, r, &e); 5699 } 5700 5701 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5702 5703 offset = get_vmcs12_field_offset(field); 5704 if (offset < 0) 5705 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5706 5707 /* 5708 * If the vCPU supports "VMWRITE to any supported field in the 5709 * VMCS," then the "read-only" fields are actually read/write. 5710 */ 5711 if (vmcs_field_readonly(field) && 5712 !nested_cpu_has_vmwrite_any_field(vcpu)) 5713 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5714 5715 /* 5716 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5717 * vmcs12, else we may crush a field or consume a stale value. 5718 */ 5719 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5720 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5721 5722 /* 5723 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5724 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5725 * behavior regardless of the underlying hardware, e.g. 
if an AR_BYTE 5726 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5727 * from L1 will return a different value than VMREAD from L2 (L1 sees 5728 * the stripped down value, L2 sees the full value as stored by KVM). 5729 */ 5730 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5731 value &= 0x1f0ff; 5732 5733 vmcs12_write_any(vmcs12, field, offset, value); 5734 5735 /* 5736 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5737 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5738 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5739 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5740 */ 5741 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5742 /* 5743 * L1 can read these fields without exiting, ensure the 5744 * shadow VMCS is up-to-date. 5745 */ 5746 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5747 preempt_disable(); 5748 vmcs_load(vmx->vmcs01.shadow_vmcs); 5749 5750 __vmcs_writel(field, value); 5751 5752 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5753 vmcs_load(vmx->loaded_vmcs->vmcs); 5754 preempt_enable(); 5755 } 5756 vmx->nested.dirty_vmcs12 = true; 5757 } 5758 5759 return nested_vmx_succeed(vcpu); 5760 } 5761 5762 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5763 { 5764 vmx->nested.current_vmptr = vmptr; 5765 if (enable_shadow_vmcs) { 5766 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5767 vmcs_write64(VMCS_LINK_POINTER, 5768 __pa(vmx->vmcs01.shadow_vmcs)); 5769 vmx->nested.need_vmcs12_to_shadow_sync = true; 5770 } 5771 vmx->nested.dirty_vmcs12 = true; 5772 vmx->nested.force_msr_bitmap_recalc = true; 5773 } 5774 5775 /* Emulate the VMPTRLD instruction */ 5776 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5777 { 5778 struct vcpu_vmx *vmx = to_vmx(vcpu); 5779 gpa_t vmptr; 5780 int r; 5781 5782 if (!nested_vmx_check_permission(vcpu)) 5783 return 1; 5784 5785 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5786 return r; 5787 5788 if (!page_address_valid(vcpu, vmptr)) 5789 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5790 5791 if (vmptr == vmx->nested.vmxon_ptr) 5792 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); 5793 5794 /* Forbid normal VMPTRLD if Enlightened version was used */ 5795 if (nested_vmx_is_evmptr12_valid(vmx)) 5796 return 1; 5797 5798 if (vmx->nested.current_vmptr != vmptr) { 5799 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; 5800 struct vmcs_hdr hdr; 5801 5802 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { 5803 /* 5804 * Reads from an unbacked page return all 1s, 5805 * which means that the 32 bits located at the 5806 * given physical address won't match the required 5807 * VMCS12_REVISION identifier. 5808 */ 5809 return nested_vmx_fail(vcpu, 5810 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5811 } 5812 5813 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 5814 offsetof(struct vmcs12, hdr), 5815 sizeof(hdr))) { 5816 return nested_vmx_fail(vcpu, 5817 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5818 } 5819 5820 if (hdr.revision_id != VMCS12_REVISION || 5821 (hdr.shadow_vmcs && 5822 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5823 return nested_vmx_fail(vcpu, 5824 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5825 } 5826 5827 nested_release_vmcs12(vcpu); 5828 5829 /* 5830 * Load VMCS12 from guest memory since it is not already 5831 * cached. 
5832 */ 5833 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5834 VMCS12_SIZE)) { 5835 return nested_vmx_fail(vcpu, 5836 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5837 } 5838 5839 set_current_vmptr(vmx, vmptr); 5840 } 5841 5842 return nested_vmx_succeed(vcpu); 5843 } 5844 5845 /* Emulate the VMPTRST instruction */ 5846 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5847 { 5848 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5849 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5850 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5851 struct x86_exception e; 5852 gva_t gva; 5853 int r; 5854 5855 if (!nested_vmx_check_permission(vcpu)) 5856 return 1; 5857 5858 if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) 5859 return 1; 5860 5861 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5862 true, sizeof(gpa_t), &gva)) 5863 return 1; 5864 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5865 r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr, 5866 sizeof(gpa_t), &e); 5867 if (r != X86EMUL_CONTINUE) 5868 return kvm_handle_memory_failure(vcpu, r, &e); 5869 5870 return nested_vmx_succeed(vcpu); 5871 } 5872 5873 /* Emulate the INVEPT instruction */ 5874 static int handle_invept(struct kvm_vcpu *vcpu) 5875 { 5876 struct vcpu_vmx *vmx = to_vmx(vcpu); 5877 u32 vmx_instruction_info, types; 5878 unsigned long type, roots_to_free; 5879 struct kvm_mmu *mmu; 5880 gva_t gva; 5881 struct x86_exception e; 5882 struct { 5883 u64 eptp, gpa; 5884 } operand; 5885 int i, r, gpr_index; 5886 5887 if (!(vmx->nested.msrs.secondary_ctls_high & 5888 SECONDARY_EXEC_ENABLE_EPT) || 5889 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5890 kvm_queue_exception(vcpu, UD_VECTOR); 5891 return 1; 5892 } 5893 5894 if (!nested_vmx_check_permission(vcpu)) 5895 return 1; 5896 5897 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5898 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5899 type = kvm_register_read(vcpu, gpr_index); 5900 5901 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5902 5903 if (type >= 32 || !(types & (1 << type))) 5904 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5905 5906 /* According to the Intel VMX instruction reference, the memory 5907 * operand is read even if it isn't needed (e.g., for type==global) 5908 */ 5909 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5910 vmx_instruction_info, false, sizeof(operand), &gva)) 5911 return 1; 5912 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5913 if (r != X86EMUL_CONTINUE) 5914 return kvm_handle_memory_failure(vcpu, r, &e); 5915 5916 /* 5917 * Nested EPT roots are always held through guest_mmu, 5918 * not root_mmu.
5919 */ 5920 mmu = &vcpu->arch.guest_mmu; 5921 5922 switch (type) { 5923 case VMX_EPT_EXTENT_CONTEXT: 5924 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5925 return nested_vmx_fail(vcpu, 5926 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5927 5928 roots_to_free = 0; 5929 if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, 5930 operand.eptp)) 5931 roots_to_free |= KVM_MMU_ROOT_CURRENT; 5932 5933 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5934 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 5935 mmu->prev_roots[i].pgd, 5936 operand.eptp)) 5937 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5938 } 5939 break; 5940 case VMX_EPT_EXTENT_GLOBAL: 5941 roots_to_free = KVM_MMU_ROOTS_ALL; 5942 break; 5943 default: 5944 BUG(); 5945 break; 5946 } 5947 5948 if (roots_to_free) 5949 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); 5950 5951 return nested_vmx_succeed(vcpu); 5952 } 5953 5954 static int handle_invvpid(struct kvm_vcpu *vcpu) 5955 { 5956 struct vcpu_vmx *vmx = to_vmx(vcpu); 5957 u32 vmx_instruction_info; 5958 unsigned long type, types; 5959 gva_t gva; 5960 struct x86_exception e; 5961 struct { 5962 u64 vpid; 5963 u64 gla; 5964 } operand; 5965 u16 vpid02; 5966 int r, gpr_index; 5967 5968 if (!(vmx->nested.msrs.secondary_ctls_high & 5969 SECONDARY_EXEC_ENABLE_VPID) || 5970 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 5971 kvm_queue_exception(vcpu, UD_VECTOR); 5972 return 1; 5973 } 5974 5975 if (!nested_vmx_check_permission(vcpu)) 5976 return 1; 5977 5978 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5979 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5980 type = kvm_register_read(vcpu, gpr_index); 5981 5982 types = (vmx->nested.msrs.vpid_caps & 5983 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 5984 5985 if (type >= 32 || !(types & (1 << type))) 5986 return nested_vmx_fail(vcpu, 5987 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5988 5989 /* according to the intel vmx instruction reference, the memory 5990 * operand is read even if it isn't needed (e.g., for type==global) 5991 */ 5992 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5993 vmx_instruction_info, false, sizeof(operand), &gva)) 5994 return 1; 5995 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5996 if (r != X86EMUL_CONTINUE) 5997 return kvm_handle_memory_failure(vcpu, r, &e); 5998 5999 if (operand.vpid >> 16) 6000 return nested_vmx_fail(vcpu, 6001 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6002 6003 /* 6004 * Always flush the effective vpid02, i.e. never flush the current VPID 6005 * and never explicitly flush vpid01. INVVPID targets a VPID, not a 6006 * VMCS, and so whether or not the current vmcs12 has VPID enabled is 6007 * irrelevant (and there may not be a loaded vmcs12). 6008 */ 6009 vpid02 = nested_get_vpid02(vcpu); 6010 switch (type) { 6011 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 6012 /* 6013 * LAM doesn't apply to addresses that are inputs to TLB 6014 * invalidation. 
6015 */ 6016 if (!operand.vpid || 6017 is_noncanonical_invlpg_address(operand.gla, vcpu)) 6018 return nested_vmx_fail(vcpu, 6019 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6020 vpid_sync_vcpu_addr(vpid02, operand.gla); 6021 break; 6022 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 6023 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 6024 if (!operand.vpid) 6025 return nested_vmx_fail(vcpu, 6026 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6027 vpid_sync_context(vpid02); 6028 break; 6029 case VMX_VPID_EXTENT_ALL_CONTEXT: 6030 vpid_sync_context(vpid02); 6031 break; 6032 default: 6033 WARN_ON_ONCE(1); 6034 return kvm_skip_emulated_instruction(vcpu); 6035 } 6036 6037 /* 6038 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 6039 * linear mappings for L2 (tagged with L2's VPID). Free all guest 6040 * roots as VPIDs are not tracked in the MMU role. 6041 * 6042 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 6043 * an MMU when EPT is disabled. 6044 * 6045 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 6046 */ 6047 if (!enable_ept) 6048 kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); 6049 6050 return nested_vmx_succeed(vcpu); 6051 } 6052 6053 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 6054 struct vmcs12 *vmcs12) 6055 { 6056 u32 index = kvm_rcx_read(vcpu); 6057 u64 new_eptp; 6058 6059 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 6060 return 1; 6061 if (index >= VMFUNC_EPTP_ENTRIES) 6062 return 1; 6063 6064 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 6065 &new_eptp, index * 8, 8)) 6066 return 1; 6067 6068 /* 6069 * If the (L2) guest does a vmfunc to the currently 6070 * active ept pointer, we don't have to do anything else 6071 */ 6072 if (vmcs12->ept_pointer != new_eptp) { 6073 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 6074 return 1; 6075 6076 vmcs12->ept_pointer = new_eptp; 6077 nested_ept_new_eptp(vcpu); 6078 6079 if (!nested_cpu_has_vpid(vmcs12)) 6080 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 6081 } 6082 6083 return 0; 6084 } 6085 6086 static int handle_vmfunc(struct kvm_vcpu *vcpu) 6087 { 6088 struct vcpu_vmx *vmx = to_vmx(vcpu); 6089 struct vmcs12 *vmcs12; 6090 u32 function = kvm_rax_read(vcpu); 6091 6092 /* 6093 * VMFUNC should never execute cleanly while L1 is active; KVM supports 6094 * VMFUNC for nested VMs, but not for L1. 6095 */ 6096 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { 6097 kvm_queue_exception(vcpu, UD_VECTOR); 6098 return 1; 6099 } 6100 6101 vmcs12 = get_vmcs12(vcpu); 6102 6103 /* 6104 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 6105 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 6106 */ 6107 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 6108 kvm_queue_exception(vcpu, UD_VECTOR); 6109 return 1; 6110 } 6111 6112 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 6113 goto fail; 6114 6115 switch (function) { 6116 case 0: 6117 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 6118 goto fail; 6119 break; 6120 default: 6121 goto fail; 6122 } 6123 return kvm_skip_emulated_instruction(vcpu); 6124 6125 fail: 6126 /* 6127 * This is effectively a reflected VM-Exit, as opposed to a synthesized 6128 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 6129 * EXIT_REASON_VMFUNC as the exit reason. 
6130 */ 6131 nested_vmx_vmexit(vcpu, vmx->exit_reason.full, 6132 vmx_get_intr_info(vcpu), 6133 vmx_get_exit_qual(vcpu)); 6134 return 1; 6135 } 6136 6137 /* 6138 * Return true if an IO instruction with the specified port and size should cause 6139 * a VM-exit into L1. 6140 */ 6141 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 6142 int size) 6143 { 6144 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6145 gpa_t bitmap, last_bitmap; 6146 u8 b; 6147 6148 last_bitmap = INVALID_GPA; 6149 b = -1; 6150 6151 while (size > 0) { 6152 if (port < 0x8000) 6153 bitmap = vmcs12->io_bitmap_a; 6154 else if (port < 0x10000) 6155 bitmap = vmcs12->io_bitmap_b; 6156 else 6157 return true; 6158 bitmap += (port & 0x7fff) / 8; 6159 6160 if (last_bitmap != bitmap) 6161 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 6162 return true; 6163 if (b & (1 << (port & 7))) 6164 return true; 6165 6166 port++; 6167 size--; 6168 last_bitmap = bitmap; 6169 } 6170 6171 return false; 6172 } 6173 6174 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 6175 struct vmcs12 *vmcs12) 6176 { 6177 unsigned long exit_qualification; 6178 unsigned short port; 6179 int size; 6180 6181 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 6182 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 6183 6184 exit_qualification = vmx_get_exit_qual(vcpu); 6185 6186 port = exit_qualification >> 16; 6187 size = (exit_qualification & 7) + 1; 6188 6189 return nested_vmx_check_io_bitmaps(vcpu, port, size); 6190 } 6191 6192 /* 6193 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 6194 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 6195 * disinterest in the current event (read or write a specific MSR) by using an 6196 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 6197 */ 6198 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 6199 struct vmcs12 *vmcs12, 6200 union vmx_exit_reason exit_reason) 6201 { 6202 u32 msr_index = kvm_rcx_read(vcpu); 6203 gpa_t bitmap; 6204 6205 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 6206 return true; 6207 6208 /* 6209 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 6210 * for the four combinations of read/write and low/high MSR numbers. 6211 * First we need to figure out which of the four to use: 6212 */ 6213 bitmap = vmcs12->msr_bitmap; 6214 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 6215 bitmap += 2048; 6216 if (msr_index >= 0xc0000000) { 6217 msr_index -= 0xc0000000; 6218 bitmap += 1024; 6219 } 6220 6221 /* Then read the msr_index'th bit from this bitmap: */ 6222 if (msr_index < 1024*8) { 6223 unsigned char b; 6224 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 6225 return true; 6226 return 1 & (b >> (msr_index & 7)); 6227 } else 6228 return true; /* let L1 handle the wrong parameter */ 6229 } 6230 6231 /* 6232 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 6233 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 6234 * intercept (via guest_host_mask etc.) the current event. 
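* As an illustrative example (hypothetical values): if L1 sets cr0_guest_host_mask = X86_CR0_TS and CR0.TS = 1 in cr0_read_shadow, a MOV to CR0 that clears TS changes an L1-owned bit relative to the shadow (mask & (val ^ shadow) != 0) and is reflected to L1, whereas a write that leaves TS set is handled in L0 without a nested exit.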
6235 */ 6236 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 6237 struct vmcs12 *vmcs12) 6238 { 6239 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 6240 int cr = exit_qualification & 15; 6241 int reg; 6242 unsigned long val; 6243 6244 switch ((exit_qualification >> 4) & 3) { 6245 case 0: /* mov to cr */ 6246 reg = (exit_qualification >> 8) & 15; 6247 val = kvm_register_read(vcpu, reg); 6248 switch (cr) { 6249 case 0: 6250 if (vmcs12->cr0_guest_host_mask & 6251 (val ^ vmcs12->cr0_read_shadow)) 6252 return true; 6253 break; 6254 case 3: 6255 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 6256 return true; 6257 break; 6258 case 4: 6259 if (vmcs12->cr4_guest_host_mask & 6260 (vmcs12->cr4_read_shadow ^ val)) 6261 return true; 6262 break; 6263 case 8: 6264 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 6265 return true; 6266 break; 6267 } 6268 break; 6269 case 2: /* clts */ 6270 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 6271 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 6272 return true; 6273 break; 6274 case 1: /* mov from cr */ 6275 switch (cr) { 6276 case 3: 6277 if (vmcs12->cpu_based_vm_exec_control & 6278 CPU_BASED_CR3_STORE_EXITING) 6279 return true; 6280 break; 6281 case 8: 6282 if (vmcs12->cpu_based_vm_exec_control & 6283 CPU_BASED_CR8_STORE_EXITING) 6284 return true; 6285 break; 6286 } 6287 break; 6288 case 3: /* lmsw */ 6289 /* 6290 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 6291 * cr0. Other attempted changes are ignored, with no exit. 6292 */ 6293 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 6294 if (vmcs12->cr0_guest_host_mask & 0xe & 6295 (val ^ vmcs12->cr0_read_shadow)) 6296 return true; 6297 if ((vmcs12->cr0_guest_host_mask & 0x1) && 6298 !(vmcs12->cr0_read_shadow & 0x1) && 6299 (val & 0x1)) 6300 return true; 6301 break; 6302 } 6303 return false; 6304 } 6305 6306 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 6307 struct vmcs12 *vmcs12) 6308 { 6309 u32 encls_leaf; 6310 6311 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) || 6312 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 6313 return false; 6314 6315 encls_leaf = kvm_rax_read(vcpu); 6316 if (encls_leaf > 62) 6317 encls_leaf = 63; 6318 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 6319 } 6320 6321 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 6322 struct vmcs12 *vmcs12, gpa_t bitmap) 6323 { 6324 u32 vmx_instruction_info; 6325 unsigned long field; 6326 u8 b; 6327 6328 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 6329 return true; 6330 6331 /* Decode instruction info and find the field to access */ 6332 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6333 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 6334 6335 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 6336 if (field >> 15) 6337 return true; 6338 6339 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 6340 return true; 6341 6342 return 1 & (b >> (field & 7)); 6343 } 6344 6345 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 6346 { 6347 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 6348 6349 if (nested_cpu_has_mtf(vmcs12)) 6350 return true; 6351 6352 /* 6353 * An MTF VM-exit may be injected into the guest by setting the 6354 * interruption-type to 7 (other event) and the vector field to 0. Such 6355 * is the case regardless of the 'monitor trap flag' VM-execution 6356 * control. 
6357 */ 6358 return entry_intr_info == (INTR_INFO_VALID_MASK 6359 | INTR_TYPE_OTHER_EVENT); 6360 } 6361 6362 /* 6363 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 6364 * L1 wants the exit. Only call this when in is_guest_mode (L2). 6365 */ 6366 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 6367 union vmx_exit_reason exit_reason) 6368 { 6369 u32 intr_info; 6370 6371 switch ((u16)exit_reason.basic) { 6372 case EXIT_REASON_EXCEPTION_NMI: 6373 intr_info = vmx_get_intr_info(vcpu); 6374 if (is_nmi(intr_info)) 6375 return true; 6376 else if (is_page_fault(intr_info)) 6377 return vcpu->arch.apf.host_apf_flags || 6378 vmx_need_pf_intercept(vcpu); 6379 else if (is_debug(intr_info) && 6380 vcpu->guest_debug & 6381 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 6382 return true; 6383 else if (is_breakpoint(intr_info) && 6384 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 6385 return true; 6386 else if (is_alignment_check(intr_info) && 6387 !vmx_guest_inject_ac(vcpu)) 6388 return true; 6389 else if (is_ve_fault(intr_info)) 6390 return true; 6391 return false; 6392 case EXIT_REASON_EXTERNAL_INTERRUPT: 6393 return true; 6394 case EXIT_REASON_MCE_DURING_VMENTRY: 6395 return true; 6396 case EXIT_REASON_EPT_VIOLATION: 6397 /* 6398 * L0 always deals with the EPT violation. If nested EPT is 6399 * used, and the nested mmu code discovers that the address is 6400 * missing in the guest EPT table (EPT12), the EPT violation 6401 * will be injected with nested_ept_inject_page_fault() 6402 */ 6403 return true; 6404 case EXIT_REASON_EPT_MISCONFIG: 6405 /* 6406 * L2 never uses directly L1's EPT, but rather L0's own EPT 6407 * table (shadow on EPT) or a merged EPT table that L0 built 6408 * (EPT on EPT). So any problems with the structure of the 6409 * table is L0's fault. 6410 */ 6411 return true; 6412 case EXIT_REASON_PREEMPTION_TIMER: 6413 return true; 6414 case EXIT_REASON_PML_FULL: 6415 /* 6416 * PML is emulated for an L1 VMM and should never be enabled in 6417 * vmcs02, always "handle" PML_FULL by exiting to userspace. 6418 */ 6419 return true; 6420 case EXIT_REASON_VMFUNC: 6421 /* VM functions are emulated through L2->L0 vmexits. */ 6422 return true; 6423 case EXIT_REASON_BUS_LOCK: 6424 /* 6425 * At present, bus lock VM exit is never exposed to L1. 6426 * Handle L2's bus locks in L0 directly. 6427 */ 6428 return true; 6429 #ifdef CONFIG_KVM_HYPERV 6430 case EXIT_REASON_VMCALL: 6431 /* Hyper-V L2 TLB flush hypercall is handled by L0 */ 6432 return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && 6433 nested_evmcs_l2_tlb_flush_enabled(vcpu) && 6434 kvm_hv_is_tlb_flush_hcall(vcpu); 6435 #endif 6436 default: 6437 break; 6438 } 6439 return false; 6440 } 6441 6442 /* 6443 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 6444 * is_guest_mode (L2). 
6445 */ 6446 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 6447 union vmx_exit_reason exit_reason) 6448 { 6449 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6450 u32 intr_info; 6451 6452 switch ((u16)exit_reason.basic) { 6453 case EXIT_REASON_EXCEPTION_NMI: 6454 intr_info = vmx_get_intr_info(vcpu); 6455 if (is_nmi(intr_info)) 6456 return true; 6457 else if (is_page_fault(intr_info)) 6458 return true; 6459 return vmcs12->exception_bitmap & 6460 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 6461 case EXIT_REASON_EXTERNAL_INTERRUPT: 6462 return nested_exit_on_intr(vcpu); 6463 case EXIT_REASON_TRIPLE_FAULT: 6464 return true; 6465 case EXIT_REASON_INTERRUPT_WINDOW: 6466 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 6467 case EXIT_REASON_NMI_WINDOW: 6468 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 6469 case EXIT_REASON_TASK_SWITCH: 6470 return true; 6471 case EXIT_REASON_CPUID: 6472 return true; 6473 case EXIT_REASON_HLT: 6474 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 6475 case EXIT_REASON_INVD: 6476 return true; 6477 case EXIT_REASON_INVLPG: 6478 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6479 case EXIT_REASON_RDPMC: 6480 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 6481 case EXIT_REASON_RDRAND: 6482 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 6483 case EXIT_REASON_RDSEED: 6484 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 6485 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 6486 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 6487 case EXIT_REASON_VMREAD: 6488 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6489 vmcs12->vmread_bitmap); 6490 case EXIT_REASON_VMWRITE: 6491 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6492 vmcs12->vmwrite_bitmap); 6493 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 6494 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 6495 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 6496 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6497 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 6498 /* 6499 * VMX instructions trap unconditionally. This allows L1 to 6500 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
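* (For instance, when L2 itself executes VMLAUNCH, the resulting exit is reflected to L1, which can emulate the instruction and run its own nested guest, i.e. an L3.)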
6501 */ 6502 return true; 6503 case EXIT_REASON_CR_ACCESS: 6504 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 6505 case EXIT_REASON_DR_ACCESS: 6506 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 6507 case EXIT_REASON_IO_INSTRUCTION: 6508 return nested_vmx_exit_handled_io(vcpu, vmcs12); 6509 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 6510 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 6511 case EXIT_REASON_MSR_READ: 6512 case EXIT_REASON_MSR_WRITE: 6513 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 6514 case EXIT_REASON_INVALID_STATE: 6515 return true; 6516 case EXIT_REASON_MWAIT_INSTRUCTION: 6517 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 6518 case EXIT_REASON_MONITOR_TRAP_FLAG: 6519 return nested_vmx_exit_handled_mtf(vmcs12); 6520 case EXIT_REASON_MONITOR_INSTRUCTION: 6521 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 6522 case EXIT_REASON_PAUSE_INSTRUCTION: 6523 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 6524 nested_cpu_has2(vmcs12, 6525 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 6526 case EXIT_REASON_MCE_DURING_VMENTRY: 6527 return true; 6528 case EXIT_REASON_TPR_BELOW_THRESHOLD: 6529 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 6530 case EXIT_REASON_APIC_ACCESS: 6531 case EXIT_REASON_APIC_WRITE: 6532 case EXIT_REASON_EOI_INDUCED: 6533 /* 6534 * The controls for "virtualize APIC accesses," "APIC- 6535 * register virtualization," and "virtual-interrupt 6536 * delivery" only come from vmcs12. 6537 */ 6538 return true; 6539 case EXIT_REASON_INVPCID: 6540 return 6541 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 6542 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6543 case EXIT_REASON_WBINVD: 6544 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6545 case EXIT_REASON_XSETBV: 6546 return true; 6547 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 6548 /* 6549 * This should never happen, since it is not possible to 6550 * set XSS to a non-zero value---neither in L1 nor in L2. 6551 * If if it were, XSS would have to be checked against 6552 * the XSS exit bitmap in vmcs12. 6553 */ 6554 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); 6555 case EXIT_REASON_UMWAIT: 6556 case EXIT_REASON_TPAUSE: 6557 return nested_cpu_has2(vmcs12, 6558 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 6559 case EXIT_REASON_ENCLS: 6560 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 6561 case EXIT_REASON_NOTIFY: 6562 /* Notify VM exit is not exposed to L1 */ 6563 return false; 6564 default: 6565 return true; 6566 } 6567 } 6568 6569 /* 6570 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 6571 * reflected into L1. 6572 */ 6573 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 6574 { 6575 struct vcpu_vmx *vmx = to_vmx(vcpu); 6576 union vmx_exit_reason exit_reason = vmx->exit_reason; 6577 unsigned long exit_qual; 6578 u32 exit_intr_info; 6579 6580 WARN_ON_ONCE(vmx->nested.nested_run_pending); 6581 6582 /* 6583 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 6584 * has already loaded L2's state. 6585 */ 6586 if (unlikely(vmx->fail)) { 6587 trace_kvm_nested_vmenter_failed( 6588 "hardware VM-instruction error: ", 6589 vmcs_read32(VM_INSTRUCTION_ERROR)); 6590 exit_intr_info = 0; 6591 exit_qual = 0; 6592 goto reflect_vmexit; 6593 } 6594 6595 trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); 6596 6597 /* If L0 (KVM) wants the exit, it trumps L1's desires. 
*/ 6598 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6599 return false; 6600 6601 /* If L1 doesn't want the exit, handle it in L0. */ 6602 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6603 return false; 6604 6605 /* 6606 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6607 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6608 * need to be synthesized by querying the in-kernel LAPIC, but external 6609 * interrupts are never reflected to L1 so it's a non-issue. 6610 */ 6611 exit_intr_info = vmx_get_intr_info(vcpu); 6612 if (is_exception_with_error_code(exit_intr_info)) { 6613 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6614 6615 vmcs12->vm_exit_intr_error_code = 6616 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6617 } 6618 exit_qual = vmx_get_exit_qual(vcpu); 6619 6620 reflect_vmexit: 6621 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6622 return true; 6623 } 6624 6625 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6626 struct kvm_nested_state __user *user_kvm_nested_state, 6627 u32 user_data_size) 6628 { 6629 struct vcpu_vmx *vmx; 6630 struct vmcs12 *vmcs12; 6631 struct kvm_nested_state kvm_state = { 6632 .flags = 0, 6633 .format = KVM_STATE_NESTED_FORMAT_VMX, 6634 .size = sizeof(kvm_state), 6635 .hdr.vmx.flags = 0, 6636 .hdr.vmx.vmxon_pa = INVALID_GPA, 6637 .hdr.vmx.vmcs12_pa = INVALID_GPA, 6638 .hdr.vmx.preemption_timer_deadline = 0, 6639 }; 6640 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6641 &user_kvm_nested_state->data.vmx[0]; 6642 6643 if (!vcpu) 6644 return kvm_state.size + sizeof(*user_vmx_nested_state); 6645 6646 vmx = to_vmx(vcpu); 6647 vmcs12 = get_vmcs12(vcpu); 6648 6649 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) && 6650 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6651 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6652 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6653 6654 if (vmx_has_valid_vmcs12(vcpu)) { 6655 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6656 6657 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ 6658 if (nested_vmx_is_evmptr12_set(vmx)) 6659 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6660 6661 if (is_guest_mode(vcpu) && 6662 nested_cpu_has_shadow_vmcs(vmcs12) && 6663 vmcs12->vmcs_link_pointer != INVALID_GPA) 6664 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6665 } 6666 6667 if (vmx->nested.smm.vmxon) 6668 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6669 6670 if (vmx->nested.smm.guest_mode) 6671 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6672 6673 if (is_guest_mode(vcpu)) { 6674 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6675 6676 if (vmx->nested.nested_run_pending) 6677 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6678 6679 if (vmx->nested.mtf_pending) 6680 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6681 6682 if (nested_cpu_has_preemption_timer(vmcs12) && 6683 vmx->nested.has_preemption_timer_deadline) { 6684 kvm_state.hdr.vmx.flags |= 6685 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6686 kvm_state.hdr.vmx.preemption_timer_deadline = 6687 vmx->nested.preemption_timer_deadline; 6688 } 6689 } 6690 } 6691 6692 if (user_data_size < kvm_state.size) 6693 goto out; 6694 6695 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6696 return -EFAULT; 6697 6698 if (!vmx_has_valid_vmcs12(vcpu)) 6699 goto out; 6700 6701 /* 6702 * When running L2, the authoritative vmcs12 state is in the 6703 * vmcs02. 
static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{
	struct vcpu_vmx *vmx;
	struct vmcs12 *vmcs12;
	struct kvm_nested_state kvm_state = {
		.flags = 0,
		.format = KVM_STATE_NESTED_FORMAT_VMX,
		.size = sizeof(kvm_state),
		.hdr.vmx.flags = 0,
		.hdr.vmx.vmxon_pa = INVALID_GPA,
		.hdr.vmx.vmcs12_pa = INVALID_GPA,
		.hdr.vmx.preemption_timer_deadline = 0,
	};
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];

	if (!vcpu)
		return kvm_state.size + sizeof(*user_vmx_nested_state);

	vmx = to_vmx(vcpu);
	vmcs12 = get_vmcs12(vcpu);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) &&
	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
		kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
		kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;

		if (vmx_has_valid_vmcs12(vcpu)) {
			kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);

			/* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
			if (nested_vmx_is_evmptr12_set(vmx))
				kvm_state.flags |= KVM_STATE_NESTED_EVMCS;

			if (is_guest_mode(vcpu) &&
			    nested_cpu_has_shadow_vmcs(vmcs12) &&
			    vmcs12->vmcs_link_pointer != INVALID_GPA)
				kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
		}

		if (vmx->nested.smm.vmxon)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;

		if (vmx->nested.smm.guest_mode)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;

		if (is_guest_mode(vcpu)) {
			kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;

			if (vmx->nested.nested_run_pending)
				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;

			if (vmx->nested.mtf_pending)
				kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;

			if (nested_cpu_has_preemption_timer(vmcs12) &&
			    vmx->nested.has_preemption_timer_deadline) {
				kvm_state.hdr.vmx.flags |=
					KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
				kvm_state.hdr.vmx.preemption_timer_deadline =
					vmx->nested.preemption_timer_deadline;
			}
		}
	}

	if (user_data_size < kvm_state.size)
		goto out;

	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
		return -EFAULT;

	if (!vmx_has_valid_vmcs12(vcpu))
		goto out;

	/*
	 * When running L2, the authoritative vmcs12 state is in the
	 * vmcs02. When running L1, the authoritative vmcs12 state is
	 * in the shadow or enlightened vmcs linked to vmcs01, unless
	 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
	 * vmcs12 state is in the vmcs12 already.
	 */
	if (is_guest_mode(vcpu)) {
		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
	} else {
		copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
		if (!vmx->nested.need_vmcs12_to_shadow_sync) {
			if (nested_vmx_is_evmptr12_valid(vmx))
				/*
				 * The L1 hypervisor is not obliged to keep the
				 * eVMCS clean fields up to date while it is not
				 * in guest mode; 'hv_clean_fields' is only
				 * guaranteed to be accurate at VM-entry, so
				 * ignore it here and do a full copy.
				 */
				copy_enlightened_to_vmcs12(vmx, 0);
			else if (enable_shadow_vmcs)
				copy_shadow_to_vmcs12(vmx);
		}
	}

	BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
	BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);

	/*
	 * Copy over the full allocated size of vmcs12 rather than just the size
	 * of the struct.
	 */
	if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
		return -EFAULT;

	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
				 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
			return -EFAULT;
	}
out:
	return kvm_state.size;
}

void vmx_leave_nested(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu)) {
		to_vmx(vcpu)->nested.nested_run_pending = 0;
		nested_vmx_vmexit(vcpu, -1, 0, 0);
	}
	free_nested(vcpu);
}

static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	enum vm_entry_failure_code ignored;
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];
	int ret;

	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
		return -EINVAL;

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
		if (kvm_state->hdr.vmx.smm.flags)
			return -EINVAL;

		if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
			return -EINVAL;

		/*
		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
		 * enable the eVMCS capability on the vCPU.  The code has
		 * since been changed so that the flag signals that vmcs12
		 * should be copied into the eVMCS in guest memory.
		 *
		 * To preserve backwards compatibility, allow userspace to
		 * set this flag even when there is no VMXON region.
		 */
		if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
			return -EINVAL;
	} else {
		if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
			return -EINVAL;

		if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
			return -EINVAL;
	}

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	if (kvm_state->hdr.vmx.smm.flags &
	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
		return -EINVAL;

	/*
	 * SMM temporarily disables VMX, so we cannot be in guest mode,
	 * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
	 * must be zero.
	 */
	if (is_smm(vcpu) ?
		(kvm_state->flags &
		 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
		: kvm_state->hdr.vmx.smm.flags)
		return -EINVAL;

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
	    (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) ||
	     !vmx->nested.enlightened_vmcs_enabled))
		return -EINVAL;

	vmx_leave_nested(vcpu);

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
		return 0;

	vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
	ret = enter_vmx_operation(vcpu);
	if (ret)
		return ret;

	/* Empty 'VMXON' state is permitted if no VMCS loaded */
	if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
		/* See vmx_has_valid_vmcs12. */
		if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
		    (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
		    (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
			return -EINVAL;
		else
			return 0;
	}

	if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
		if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
		    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
			return -EINVAL;

		set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
#ifdef CONFIG_KVM_HYPERV
	} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
		/*
		 * nested_vmx_handle_enlightened_vmptrld() cannot be called
		 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
		 * restored yet. EVMCS will be mapped from
		 * nested_get_vmcs12_pages().
		 */
		vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
#endif
	} else {
		return -EINVAL;
	}

	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
		vmx->nested.smm.vmxon = true;
		vmx->nested.vmxon = false;

		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
			vmx->nested.smm.guest_mode = true;
	}

	vmcs12 = get_vmcs12(vcpu);
	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
		return -EFAULT;

	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return 0;

	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	vmx->nested.mtf_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);

	ret = -EINVAL;
	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);

		if (kvm_state->size <
		    sizeof(*kvm_state) +
		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
			goto error_guest_mode;

		if (copy_from_user(shadow_vmcs12,
				   user_vmx_nested_state->shadow_vmcs12,
				   sizeof(*shadow_vmcs12))) {
			ret = -EFAULT;
			goto error_guest_mode;
		}

		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    !shadow_vmcs12->hdr.shadow_vmcs)
			goto error_guest_mode;
	}

	vmx->nested.has_preemption_timer_deadline = false;
	if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
		vmx->nested.has_preemption_timer_deadline = true;
		vmx->nested.preemption_timer_deadline =
			kvm_state->hdr.vmx.preemption_timer_deadline;
	}

	if (nested_vmx_check_controls(vcpu, vmcs12) ||
	    nested_vmx_check_host_state(vcpu, vmcs12) ||
	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
		goto error_guest_mode;

	vmx->nested.dirty_vmcs12 = true;
	vmx->nested.force_msr_bitmap_recalc = true;
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
	if (ret)
		goto error_guest_mode;

	if (vmx->nested.mtf_pending)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	return 0;

error_guest_mode:
	vmx->nested.nested_run_pending = 0;
	return ret;
}

void nested_vmx_set_vmcs_shadowing_bitmap(void)
{
	if (enable_shadow_vmcs) {
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
	}
}

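/*
 * Worked example of the rotation that the macro below undoes: the 16-bit
 * encoding 0x2000 (IO_BITMAP_A) rotated left by 6 gives 0x0008, which is
 * the index used for vmcs12_field_offsets[]; VMCS12_IDX_TO_ENC(0x0008)
 * rotates it back and recovers 0x2000.
 */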
/*
 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo
 * that madness to get the encoding for comparison.
 */
#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))

static u64 nested_vmx_calc_vmcs_enum_msr(void)
{
	/*
	 * Note these are the so-called "index" of the VMCS field encoding, not
	 * the index into vmcs12.
	 */
	unsigned int max_idx, idx;
	int i;

	/*
	 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
	 * vmcs12, regardless of whether or not the associated feature is
	 * exposed to L1.  Simply find the field with the highest index.
	 */
	max_idx = 0;
	for (i = 0; i < nr_vmcs12_fields; i++) {
		/* The vmcs12 table is very, very sparsely populated. */
		if (!vmcs12_field_offsets[i])
			continue;

		idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
		if (idx > max_idx)
			max_idx = idx;
	}

	return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
}

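/*
 * The nested_vmx_setup_*() helpers below compute the virtual VMX capability
 * MSRs advertised to L1.  For each control MSR, bits set in the low half
 * must be 1 in the corresponding vmcs12 control field and bits clear in the
 * high half must be 0; a control value 'val' is acceptable roughly when
 *
 *	(val & low) == low && (val & ~high) == 0
 *
 * which is what nested VM-Enter enforces via vmx_control_verify() (see also
 * the comment above nested_vmx_setup_ctls_msrs() below).  For example, a
 * pin-based control value of 0 is rejected because the ALWAYSON bits in the
 * low half are missing.
 */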
static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->pinbased_ctls_low =
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(enable_apicv ? PIN_BASED_POSTED_INTR : 0);
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;
}

static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
		VM_EXIT_CLEAR_BNDCFGS;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
		VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
}

static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf,
					struct nested_vmx_msrs *msrs)
{
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
		 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);

	/* We support free control of debug control loading. */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
}

static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
	msrs->procbased_ctls_high &=
		CPU_BASED_INTR_WINDOW_EXITING |
		CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware. For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
}

static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
					    struct vmcs_config *vmcs_conf,
					    struct nested_vmx_msrs *msrs)
{
	msrs->secondary_ctls_low = 0;

	msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_ENABLE_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_ENABLE_VMFUNC |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_ENABLE_XSAVES |
		SECONDARY_EXEC_TSC_SCALING |
		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps =
			VMX_EPT_PAGE_WALK_4_BIT |
			VMX_EPT_PAGE_WALK_5_BIT |
			VMX_EPTP_WB_BIT |
			VMX_EPT_INVEPT_BIT |
			VMX_EPT_EXECUTE_ONLY_BIT;

		msrs->ept_caps &= ept_caps;
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}

		/*
		 * Advertise EPTP switching irrespective of hardware support,
		 * KVM emulates it in software so long as VMFUNC is supported.
		 */
		if (cpu_has_vmx_vmfunc())
			msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version without
	 * checking for support, so declare that it is supported even
	 * though it is treated as global context.  The alternative is
	 * not failing the single-context invvpid, and it is worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	if (enable_sgx)
		msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
}

static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT |
		VMX_MISC_ACTIVITY_WAIT_SIPI;
	msrs->misc_high = 0;
}

static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
{
	/*
	 * This MSR reports some information about VMX support. We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE,
						 X86_MEMTYPE_WB);

	msrs->basic |= VMX_BASIC_TRUE_CTLS;
	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;
}

static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
{
	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/* These MSRs specify bits which the guest must keep fixed off. */
	rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);

	if (vmx_umip_emulated())
		msrs->cr4_fixed1 |= X86_CR4_UMIP;
}

/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields are
 * valid during nested entry from L1 to L2.
 * Each of these control msrs has a low and high 32-bit half: A low bit is on
 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 * bit in the high half is on if the corresponding bit in the control field
 * may be on. See also vmx_control_verify().
 */
void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
{
	struct nested_vmx_msrs *msrs = &vmcs_conf->nested;

	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits. The
	 * reason is that if one of these bits is needed by L0, it will appear
	 * in vmcs01, and prepare_vmcs02, which bitwise-OR's the control
	 * fields of vmcs01 and vmcs12, will keep it set in vmcs02; exits it
	 * causes that L1 did not ask for are then handled in L0 rather than
	 * reflected (see nested_vmx_l1_wants_exit()).
	 * These rules have exceptions below.
	 */
	nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_exit_ctls(vmcs_conf, msrs);

	nested_vmx_setup_entry_ctls(vmcs_conf, msrs);

	nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs);

	nested_vmx_setup_misc_data(vmcs_conf, msrs);

	nested_vmx_setup_basic(msrs);

	nested_vmx_setup_cr_fixed(msrs);

	msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
}

void nested_vmx_hardware_unsetup(void)
{
	int i;

	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++)
			free_page((unsigned long)vmx_bitmap[i]);
	}
}

__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
	int i;

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++) {
			/*
			 * The vmx_bitmap is not tied to a VM and so should
			 * not be charged to a memcg.
			 */
			vmx_bitmap[i] = (unsigned long *)
				__get_free_page(GFP_KERNEL);
			if (!vmx_bitmap[i]) {
				nested_vmx_hardware_unsetup();
				return -ENOMEM;
			}
		}

		init_vmcs_shadow_fields();
	}

	exit_handlers[EXIT_REASON_VMCLEAR]	= handle_vmclear;
	exit_handlers[EXIT_REASON_VMLAUNCH]	= handle_vmlaunch;
	exit_handlers[EXIT_REASON_VMPTRLD]	= handle_vmptrld;
	exit_handlers[EXIT_REASON_VMPTRST]	= handle_vmptrst;
	exit_handlers[EXIT_REASON_VMREAD]	= handle_vmread;
	exit_handlers[EXIT_REASON_VMRESUME]	= handle_vmresume;
	exit_handlers[EXIT_REASON_VMWRITE]	= handle_vmwrite;
	exit_handlers[EXIT_REASON_VMOFF]	= handle_vmxoff;
	exit_handlers[EXIT_REASON_VMON]		= handle_vmxon;
	exit_handlers[EXIT_REASON_INVEPT]	= handle_invept;
	exit_handlers[EXIT_REASON_INVVPID]	= handle_invvpid;
	exit_handlers[EXIT_REASON_VMFUNC]	= handle_vmfunc;

	return 0;
}

struct kvm_x86_nested_ops vmx_nested_ops = {
	.leave_nested = vmx_leave_nested,
	.is_exception_vmexit = nested_vmx_is_exception_vmexit,
	.check_events = vmx_check_nested_events,
	.has_events = vmx_has_nested_events,
	.triple_fault = nested_vmx_triple_fault,
	.get_state = vmx_get_nested_state,
	.set_state = vmx_set_nested_state,
	.get_nested_state_pages = vmx_get_nested_state_pages,
	.write_log_dirty = nested_vmx_write_pml_buffer,
#ifdef CONFIG_KVM_HYPERV
	.enable_evmcs = nested_enable_evmcs,
	.get_evmcs_version = nested_get_evmcs_version,
	.hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
#endif
};
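/*
 * The ops table above is installed as kvm_x86_ops.nested_ops by the VMX
 * module setup code; common x86 code drives all nested-specific behavior
 * through it, e.g. KVM_{GET,SET}_NESTED_STATE land in
 * vmx_get_nested_state()/vmx_set_nested_state(), and the
 * KVM_REQ_GET_NESTED_STATE_PAGES request queued above is serviced via
 * .get_nested_state_pages.
 */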