1 // SPDX-License-Identifier: GPL-2.0 2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3 4 #include <linux/objtool.h> 5 #include <linux/percpu.h> 6 7 #include <asm/debugreg.h> 8 #include <asm/mmu_context.h> 9 10 #include "cpuid.h" 11 #include "hyperv.h" 12 #include "mmu.h" 13 #include "nested.h" 14 #include "pmu.h" 15 #include "posted_intr.h" 16 #include "sgx.h" 17 #include "trace.h" 18 #include "vmx.h" 19 #include "x86.h" 20 #include "smm.h" 21 22 static bool __read_mostly enable_shadow_vmcs = 1; 23 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); 24 25 static bool __read_mostly nested_early_check = 0; 26 module_param(nested_early_check, bool, S_IRUGO); 27 28 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK 29 30 /* 31 * Hyper-V requires all of these, so mark them as supported even though 32 * they are just treated the same as all-context. 33 */ 34 #define VMX_VPID_EXTENT_SUPPORTED_MASK \ 35 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ 36 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ 37 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ 38 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) 39 40 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 41 42 enum { 43 VMX_VMREAD_BITMAP, 44 VMX_VMWRITE_BITMAP, 45 VMX_BITMAP_NR 46 }; 47 static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; 48 49 #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) 50 #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) 51 52 struct shadow_vmcs_field { 53 u16 encoding; 54 u16 offset; 55 }; 56 static struct shadow_vmcs_field shadow_read_only_fields[] = { 57 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, 58 #include "vmcs_shadow_fields.h" 59 }; 60 static int max_shadow_read_only_fields = 61 ARRAY_SIZE(shadow_read_only_fields); 62 63 static struct shadow_vmcs_field shadow_read_write_fields[] = { 64 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, 65 #include "vmcs_shadow_fields.h" 66 }; 67 static int max_shadow_read_write_fields = 68 ARRAY_SIZE(shadow_read_write_fields); 69 70 static void init_vmcs_shadow_fields(void) 71 { 72 int i, j; 73 74 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 75 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 76 77 for (i = j = 0; i < max_shadow_read_only_fields; i++) { 78 struct shadow_vmcs_field entry = shadow_read_only_fields[i]; 79 u16 field = entry.encoding; 80 81 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 82 (i + 1 == max_shadow_read_only_fields || 83 shadow_read_only_fields[i + 1].encoding != field + 1)) 84 pr_err("Missing field from shadow_read_only_field %x\n", 85 field + 1); 86 87 clear_bit(field, vmx_vmread_bitmap); 88 if (field & 1) 89 #ifdef CONFIG_X86_64 90 continue; 91 #else 92 entry.offset += sizeof(u32); 93 #endif 94 shadow_read_only_fields[j++] = entry; 95 } 96 max_shadow_read_only_fields = j; 97 98 for (i = j = 0; i < max_shadow_read_write_fields; i++) { 99 struct shadow_vmcs_field entry = shadow_read_write_fields[i]; 100 u16 field = entry.encoding; 101 102 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 103 (i + 1 == max_shadow_read_write_fields || 104 shadow_read_write_fields[i + 1].encoding != field + 1)) 105 pr_err("Missing field from shadow_read_write_field %x\n", 106 field + 1); 107 108 WARN_ONCE(field >= GUEST_ES_AR_BYTES && 109 field <= GUEST_TR_AR_BYTES, 110 "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); 111 112 /* 113 * PML and the preemption timer can be emulated, but the 114 * processor cannot vmwrite to fields that don't exist 115 * on bare metal. 
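		 *
		 * Leaving such fields out of the shadow bitmaps below means
		 * L1's VMREAD/VMWRITE of them always exits to L0, where the
		 * access is emulated against the cached vmcs12 instead.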
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force a sync to the shadow VMCS because
	 * VM_INSTRUCTION_ERROR is not shadowed. The Enlightened VMCS,
	 * however, 'shadows' all fields and thus must be synced.
	 */
	if (nested_vmx_is_evmptr12_set(to_vmx(vcpu)))
		to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;

	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == INVALID_GPA &&
	    !nested_vmx_is_evmptr12_valid(vmx))
		return nested_vmx_failInvalid(vcpu);

	return nested_vmx_failValid(vcpu, vm_instruction_error);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: don't simply reset the guest here.
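	 * A VMX abort (see SDM vol. 3, "VMX Aborts") is architecturally
	 * supposed to leave the logical processor in a shutdown state;
	 * signalling a triple fault is only a rough approximation of that.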
*/ 207 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 208 pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator); 209 } 210 211 static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 212 { 213 return fixed_bits_valid(control, low, high); 214 } 215 216 static inline u64 vmx_control_msr(u32 low, u32 high) 217 { 218 return low | ((u64)high << 32); 219 } 220 221 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) 222 { 223 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 224 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 225 vmx->nested.need_vmcs12_to_shadow_sync = false; 226 } 227 228 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) 229 { 230 #ifdef CONFIG_KVM_HYPERV 231 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); 232 struct vcpu_vmx *vmx = to_vmx(vcpu); 233 234 if (nested_vmx_is_evmptr12_valid(vmx)) { 235 kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); 236 vmx->nested.hv_evmcs = NULL; 237 } 238 239 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 240 241 if (hv_vcpu) { 242 hv_vcpu->nested.pa_page_gpa = INVALID_GPA; 243 hv_vcpu->nested.vm_id = 0; 244 hv_vcpu->nested.vp_id = 0; 245 } 246 #endif 247 } 248 249 static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) 250 { 251 #ifdef CONFIG_KVM_HYPERV 252 struct vcpu_vmx *vmx = to_vmx(vcpu); 253 /* 254 * When Enlightened VMEntry is enabled on the calling CPU we treat 255 * memory area pointer by vmptr as Enlightened VMCS (as there's no good 256 * way to distinguish it from VMCS12) and we must not corrupt it by 257 * writing to the non-existent 'launch_state' field. The area doesn't 258 * have to be the currently active EVMCS on the calling CPU and there's 259 * nothing KVM has to do to transition it from 'active' to 'non-active' 260 * state. It is possible that the area will stay mapped as 261 * vmx->nested.hv_evmcs but this shouldn't be a problem. 262 */ 263 if (!guest_cpuid_has_evmcs(vcpu) || 264 !evmptr_is_valid(nested_get_evmptr(vcpu))) 265 return false; 266 267 if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr) 268 nested_release_evmcs(vcpu); 269 270 return true; 271 #else 272 return false; 273 #endif 274 } 275 276 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, 277 struct loaded_vmcs *prev) 278 { 279 struct vmcs_host_state *dest, *src; 280 281 if (unlikely(!vmx->guest_state_loaded)) 282 return; 283 284 src = &prev->host_state; 285 dest = &vmx->loaded_vmcs->host_state; 286 287 vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); 288 dest->ldt_sel = src->ldt_sel; 289 #ifdef CONFIG_X86_64 290 dest->ds_sel = src->ds_sel; 291 dest->es_sel = src->es_sel; 292 #endif 293 } 294 295 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) 296 { 297 struct vcpu_vmx *vmx = to_vmx(vcpu); 298 struct loaded_vmcs *prev; 299 int cpu; 300 301 if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) 302 return; 303 304 cpu = get_cpu(); 305 prev = vmx->loaded_vmcs; 306 vmx->loaded_vmcs = vmcs; 307 vmx_vcpu_load_vmcs(vcpu, cpu, prev); 308 vmx_sync_vmcs_host_state(vmx, prev); 309 put_cpu(); 310 311 vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET; 312 313 /* 314 * All lazily updated registers will be reloaded from VMCS12 on both 315 * vmentry and vmexit. 316 */ 317 vcpu->arch.regs_dirty = 0; 318 } 319 320 /* 321 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 322 * just stops using VMX. 
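 *
 * free_nested() must not be called with vmcs02 still loaded; the
 * WARN_ON_ONCE() at the top switches back to vmcs01 as a best-effort
 * recovery if that ever happens.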
323 */ 324 static void free_nested(struct kvm_vcpu *vcpu) 325 { 326 struct vcpu_vmx *vmx = to_vmx(vcpu); 327 328 if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) 329 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 330 331 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) 332 return; 333 334 kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 335 336 vmx->nested.vmxon = false; 337 vmx->nested.smm.vmxon = false; 338 vmx->nested.vmxon_ptr = INVALID_GPA; 339 free_vpid(vmx->nested.vpid02); 340 vmx->nested.posted_intr_nv = -1; 341 vmx->nested.current_vmptr = INVALID_GPA; 342 if (enable_shadow_vmcs) { 343 vmx_disable_shadow_vmcs(vmx); 344 vmcs_clear(vmx->vmcs01.shadow_vmcs); 345 free_vmcs(vmx->vmcs01.shadow_vmcs); 346 vmx->vmcs01.shadow_vmcs = NULL; 347 } 348 kfree(vmx->nested.cached_vmcs12); 349 vmx->nested.cached_vmcs12 = NULL; 350 kfree(vmx->nested.cached_shadow_vmcs12); 351 vmx->nested.cached_shadow_vmcs12 = NULL; 352 /* 353 * Unpin physical memory we referred to in the vmcs02. The APIC access 354 * page's backing page (yeah, confusing) shouldn't actually be accessed, 355 * and if it is written, the contents are irrelevant. 356 */ 357 kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); 358 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 359 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 360 vmx->nested.pi_desc = NULL; 361 362 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 363 364 nested_release_evmcs(vcpu); 365 366 free_loaded_vmcs(&vmx->nested.vmcs02); 367 } 368 369 /* 370 * Ensure that the current vmcs of the logical processor is the 371 * vmcs01 of the vcpu before calling free_nested(). 372 */ 373 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) 374 { 375 vcpu_load(vcpu); 376 vmx_leave_nested(vcpu); 377 vcpu_put(vcpu); 378 } 379 380 #define EPTP_PA_MASK GENMASK_ULL(51, 12) 381 382 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) 383 { 384 return VALID_PAGE(root_hpa) && 385 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); 386 } 387 388 static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, 389 gpa_t addr) 390 { 391 unsigned long roots = 0; 392 uint i; 393 struct kvm_mmu_root_info *cached_root; 394 395 WARN_ON_ONCE(!mmu_is_nested(vcpu)); 396 397 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 398 cached_root = &vcpu->arch.mmu->prev_roots[i]; 399 400 if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, 401 eptp)) 402 roots |= KVM_MMU_ROOT_PREVIOUS(i); 403 } 404 if (roots) 405 kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots); 406 } 407 408 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 409 struct x86_exception *fault) 410 { 411 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 412 struct vcpu_vmx *vmx = to_vmx(vcpu); 413 unsigned long exit_qualification; 414 u32 vm_exit_reason; 415 416 if (vmx->nested.pml_full) { 417 vm_exit_reason = EXIT_REASON_PML_FULL; 418 vmx->nested.pml_full = false; 419 420 /* 421 * It should be impossible to trigger a nested PML Full VM-Exit 422 * for anything other than an EPT Violation from L2. KVM *can* 423 * trigger nEPT page fault injection in response to an EPT 424 * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT 425 * tables also changed, but KVM should not treat EPT Misconfig 426 * VM-Exits as writes. 427 */ 428 WARN_ON_ONCE(vmx->exit_reason.basic != EXIT_REASON_EPT_VIOLATION); 429 430 /* 431 * PML Full and EPT Violation VM-Exits both use bit 12 to report 432 * "NMI unblocking due to IRET", i.e. 
the bit can be propagated 433 * as-is from the original EXIT_QUALIFICATION. 434 */ 435 exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI; 436 } else { 437 if (fault->error_code & PFERR_RSVD_MASK) { 438 vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; 439 exit_qualification = 0; 440 } else { 441 exit_qualification = fault->exit_qualification; 442 exit_qualification |= vmx_get_exit_qual(vcpu) & 443 (EPT_VIOLATION_GVA_IS_VALID | 444 EPT_VIOLATION_GVA_TRANSLATED); 445 vm_exit_reason = EXIT_REASON_EPT_VIOLATION; 446 } 447 448 /* 449 * Although the caller (kvm_inject_emulated_page_fault) would 450 * have already synced the faulting address in the shadow EPT 451 * tables for the current EPTP12, we also need to sync it for 452 * any other cached EPTP02s based on the same EP4TA, since the 453 * TLB associates mappings to the EP4TA rather than the full EPTP. 454 */ 455 nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, 456 fault->address); 457 } 458 459 nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); 460 vmcs12->guest_physical_address = fault->address; 461 } 462 463 static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) 464 { 465 struct vcpu_vmx *vmx = to_vmx(vcpu); 466 bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT; 467 int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps); 468 469 kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level, 470 nested_ept_ad_enabled(vcpu), 471 nested_ept_get_eptp(vcpu)); 472 } 473 474 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 475 { 476 WARN_ON(mmu_is_nested(vcpu)); 477 478 vcpu->arch.mmu = &vcpu->arch.guest_mmu; 479 nested_ept_new_eptp(vcpu); 480 vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; 481 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; 482 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; 483 484 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 485 } 486 487 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) 488 { 489 vcpu->arch.mmu = &vcpu->arch.root_mmu; 490 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 491 } 492 493 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, 494 u16 error_code) 495 { 496 bool inequality, bit; 497 498 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; 499 inequality = 500 (error_code & vmcs12->page_fault_error_code_mask) != 501 vmcs12->page_fault_error_code_match; 502 return inequality ^ bit; 503 } 504 505 static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, 506 u32 error_code) 507 { 508 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 509 510 /* 511 * Drop bits 31:16 of the error code when performing the #PF mask+match 512 * check. All VMCS fields involved are 32 bits, but Intel CPUs never 513 * set bits 31:16 and VMX disallows setting bits 31:16 in the injected 514 * error code. Including the to-be-dropped bits in the check might 515 * result in an "impossible" or missed exit from L1's perspective. 
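	 *
	 * As an illustration, with page_fault_error_code_mask ==
	 * page_fault_error_code_match == PFERR_USER_MASK, only user-mode #PFs
	 * "match"; matching faults then follow the PF bit in the exception
	 * bitmap, while non-matching faults get the inverted treatment (see
	 * nested_vmx_is_page_fault_vmexit() above).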
516 */ 517 if (vector == PF_VECTOR) 518 return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code); 519 520 return (vmcs12->exception_bitmap & (1u << vector)); 521 } 522 523 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, 524 struct vmcs12 *vmcs12) 525 { 526 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 527 return 0; 528 529 if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || 530 CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) 531 return -EINVAL; 532 533 return 0; 534 } 535 536 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, 537 struct vmcs12 *vmcs12) 538 { 539 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 540 return 0; 541 542 if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) 543 return -EINVAL; 544 545 return 0; 546 } 547 548 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, 549 struct vmcs12 *vmcs12) 550 { 551 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 552 return 0; 553 554 if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) 555 return -EINVAL; 556 557 return 0; 558 } 559 560 /* 561 * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 562 * itself utilizing x2APIC. All MSRs were previously set to be intercepted, 563 * only the "disable intercept" case needs to be handled. 564 */ 565 static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, 566 unsigned long *msr_bitmap_l0, 567 u32 msr, int type) 568 { 569 if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) 570 vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); 571 572 if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) 573 vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); 574 } 575 576 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) 577 { 578 int msr; 579 580 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 581 unsigned word = msr / BITS_PER_LONG; 582 583 msr_bitmap[word] = ~0; 584 msr_bitmap[word + (0x800 / sizeof(long))] = ~0; 585 } 586 } 587 588 #define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ 589 static inline \ 590 void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ 591 unsigned long *msr_bitmap_l1, \ 592 unsigned long *msr_bitmap_l0, u32 msr) \ 593 { \ 594 if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ 595 vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ 596 vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 597 else \ 598 vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 599 } 600 BUILD_NVMX_MSR_INTERCEPT_HELPER(read) 601 BUILD_NVMX_MSR_INTERCEPT_HELPER(write) 602 603 static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, 604 unsigned long *msr_bitmap_l1, 605 unsigned long *msr_bitmap_l0, 606 u32 msr, int types) 607 { 608 if (types & MSR_TYPE_R) 609 nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, 610 msr_bitmap_l0, msr); 611 if (types & MSR_TYPE_W) 612 nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, 613 msr_bitmap_l0, msr); 614 } 615 616 /* 617 * Merge L0's and L1's MSR bitmap, return false to indicate that 618 * we do not use the hardware. 619 */ 620 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 621 struct vmcs12 *vmcs12) 622 { 623 struct vcpu_vmx *vmx = to_vmx(vcpu); 624 int msr; 625 unsigned long *msr_bitmap_l1; 626 unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; 627 struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; 628 629 /* Nothing to do if the MSR bitmap is not in use. 
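	 * The merged vmcs02 bitmap is built only when both KVM itself and the
	 * vmcs12 use MSR bitmaps; otherwise every MSR access from L2 is
	 * intercepted anyway.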
*/ 630 if (!cpu_has_vmx_msr_bitmap() || 631 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 632 return false; 633 634 /* 635 * MSR bitmap update can be skipped when: 636 * - MSR bitmap for L1 hasn't changed. 637 * - Nested hypervisor (L1) is attempting to launch the same L2 as 638 * before. 639 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature 640 * and tells KVM (L0) there were no changes in MSR bitmap for L2. 641 */ 642 if (!vmx->nested.force_msr_bitmap_recalc) { 643 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 644 645 if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap && 646 evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) 647 return true; 648 } 649 650 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) 651 return false; 652 653 msr_bitmap_l1 = (unsigned long *)map->hva; 654 655 /* 656 * To keep the control flow simple, pay eight 8-byte writes (sixteen 657 * 4-byte writes on 32-bit systems) up front to enable intercepts for 658 * the x2APIC MSR range and selectively toggle those relevant to L2. 659 */ 660 enable_x2apic_msr_intercepts(msr_bitmap_l0); 661 662 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { 663 if (nested_cpu_has_apic_reg_virt(vmcs12)) { 664 /* 665 * L0 need not intercept reads for MSRs between 0x800 666 * and 0x8ff, it just lets the processor take the value 667 * from the virtual-APIC page; take those 256 bits 668 * directly from the L1 bitmap. 669 */ 670 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 671 unsigned word = msr / BITS_PER_LONG; 672 673 msr_bitmap_l0[word] = msr_bitmap_l1[word]; 674 } 675 } 676 677 nested_vmx_disable_intercept_for_x2apic_msr( 678 msr_bitmap_l1, msr_bitmap_l0, 679 X2APIC_MSR(APIC_TASKPRI), 680 MSR_TYPE_R | MSR_TYPE_W); 681 682 if (nested_cpu_has_vid(vmcs12)) { 683 nested_vmx_disable_intercept_for_x2apic_msr( 684 msr_bitmap_l1, msr_bitmap_l0, 685 X2APIC_MSR(APIC_EOI), 686 MSR_TYPE_W); 687 nested_vmx_disable_intercept_for_x2apic_msr( 688 msr_bitmap_l1, msr_bitmap_l0, 689 X2APIC_MSR(APIC_SELF_IPI), 690 MSR_TYPE_W); 691 } 692 } 693 694 /* 695 * Always check vmcs01's bitmap to honor userspace MSR filters and any 696 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. 
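	 * The nested_vmx_set_msr_{read,write}_intercept() helpers below merge
	 * the two bitmaps: the vmcs02 bit is cleared (the MSR is passed
	 * through to L2) only if both vmcs01 and the vmcs12 bitmap allow
	 * direct access.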
697 */ 698 #ifdef CONFIG_X86_64 699 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 700 MSR_FS_BASE, MSR_TYPE_RW); 701 702 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 703 MSR_GS_BASE, MSR_TYPE_RW); 704 705 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 706 MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 707 #endif 708 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 709 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); 710 711 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 712 MSR_IA32_PRED_CMD, MSR_TYPE_W); 713 714 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 715 MSR_IA32_FLUSH_CMD, MSR_TYPE_W); 716 717 kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); 718 719 vmx->nested.force_msr_bitmap_recalc = false; 720 721 return true; 722 } 723 724 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, 725 struct vmcs12 *vmcs12) 726 { 727 struct vcpu_vmx *vmx = to_vmx(vcpu); 728 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 729 730 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 731 vmcs12->vmcs_link_pointer == INVALID_GPA) 732 return; 733 734 if (ghc->gpa != vmcs12->vmcs_link_pointer && 735 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 736 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 737 return; 738 739 kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), 740 VMCS12_SIZE); 741 } 742 743 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, 744 struct vmcs12 *vmcs12) 745 { 746 struct vcpu_vmx *vmx = to_vmx(vcpu); 747 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 748 749 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 750 vmcs12->vmcs_link_pointer == INVALID_GPA) 751 return; 752 753 if (ghc->gpa != vmcs12->vmcs_link_pointer && 754 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 755 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 756 return; 757 758 kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), 759 VMCS12_SIZE); 760 } 761 762 /* 763 * In nested virtualization, check if L1 has set 764 * VM_EXIT_ACK_INTR_ON_EXIT 765 */ 766 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) 767 { 768 return get_vmcs12(vcpu)->vm_exit_controls & 769 VM_EXIT_ACK_INTR_ON_EXIT; 770 } 771 772 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, 773 struct vmcs12 *vmcs12) 774 { 775 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && 776 CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) 777 return -EINVAL; 778 else 779 return 0; 780 } 781 782 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, 783 struct vmcs12 *vmcs12) 784 { 785 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && 786 !nested_cpu_has_apic_reg_virt(vmcs12) && 787 !nested_cpu_has_vid(vmcs12) && 788 !nested_cpu_has_posted_intr(vmcs12)) 789 return 0; 790 791 /* 792 * If virtualize x2apic mode is enabled, 793 * virtualize apic access must be disabled. 794 */ 795 if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && 796 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) 797 return -EINVAL; 798 799 /* 800 * If virtual interrupt delivery is enabled, 801 * we must exit on external interrupts. 802 */ 803 if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) 804 return -EINVAL; 805 806 /* 807 * bits 15:8 should be zero in posted_intr_nv, 808 * the descriptor address has been already checked 809 * in nested_get_vmcs12_pages. 810 * 811 * bits 5:0 of posted_intr_desc_addr should be zero. 
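	 * The kvm_vcpu_is_legal_aligned_gpa(..., 64) check below is what
	 * enforces the 64-byte alignment of the posted-interrupt descriptor.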
812 */ 813 if (nested_cpu_has_posted_intr(vmcs12) && 814 (CC(!nested_cpu_has_vid(vmcs12)) || 815 CC(!nested_exit_intr_ack_set(vcpu)) || 816 CC((vmcs12->posted_intr_nv & 0xff00)) || 817 CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) 818 return -EINVAL; 819 820 /* tpr shadow is needed by all apicv features. */ 821 if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) 822 return -EINVAL; 823 824 return 0; 825 } 826 827 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, 828 u32 count, u64 addr) 829 { 830 if (count == 0) 831 return 0; 832 833 if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || 834 !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) 835 return -EINVAL; 836 837 return 0; 838 } 839 840 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, 841 struct vmcs12 *vmcs12) 842 { 843 if (CC(nested_vmx_check_msr_switch(vcpu, 844 vmcs12->vm_exit_msr_load_count, 845 vmcs12->vm_exit_msr_load_addr)) || 846 CC(nested_vmx_check_msr_switch(vcpu, 847 vmcs12->vm_exit_msr_store_count, 848 vmcs12->vm_exit_msr_store_addr))) 849 return -EINVAL; 850 851 return 0; 852 } 853 854 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, 855 struct vmcs12 *vmcs12) 856 { 857 if (CC(nested_vmx_check_msr_switch(vcpu, 858 vmcs12->vm_entry_msr_load_count, 859 vmcs12->vm_entry_msr_load_addr))) 860 return -EINVAL; 861 862 return 0; 863 } 864 865 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, 866 struct vmcs12 *vmcs12) 867 { 868 if (!nested_cpu_has_pml(vmcs12)) 869 return 0; 870 871 if (CC(!nested_cpu_has_ept(vmcs12)) || 872 CC(!page_address_valid(vcpu, vmcs12->pml_address))) 873 return -EINVAL; 874 875 return 0; 876 } 877 878 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, 879 struct vmcs12 *vmcs12) 880 { 881 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && 882 !nested_cpu_has_ept(vmcs12))) 883 return -EINVAL; 884 return 0; 885 } 886 887 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, 888 struct vmcs12 *vmcs12) 889 { 890 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && 891 !nested_cpu_has_ept(vmcs12))) 892 return -EINVAL; 893 return 0; 894 } 895 896 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, 897 struct vmcs12 *vmcs12) 898 { 899 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 900 return 0; 901 902 if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || 903 CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) 904 return -EINVAL; 905 906 return 0; 907 } 908 909 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, 910 struct vmx_msr_entry *e) 911 { 912 /* x2APIC MSR accesses are not allowed */ 913 if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) 914 return -EINVAL; 915 if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ 916 CC(e->index == MSR_IA32_UCODE_REV)) 917 return -EINVAL; 918 if (CC(e->reserved != 0)) 919 return -EINVAL; 920 return 0; 921 } 922 923 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, 924 struct vmx_msr_entry *e) 925 { 926 if (CC(e->index == MSR_FS_BASE) || 927 CC(e->index == MSR_GS_BASE) || 928 CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ 929 nested_vmx_msr_check_common(vcpu, e)) 930 return -EINVAL; 931 return 0; 932 } 933 934 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, 935 struct vmx_msr_entry *e) 936 { 937 if (CC(e->index == MSR_IA32_SMBASE) 
	    || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

/*
 * Load the guest's/host's MSRs at nested entry/exit.
 * Return 0 on success; on failure, return the 1-based index of the failing
 * entry.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity. To match hardware behavior as closely as
 * possible, process all valid entries before failing rather than prechecking
 * for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
	return i + 1;
}

static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
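	 *
	 * The value in msr_autostore.guest is captured by the CPU in the
	 * vmcs02 VM-exit MSR-store area at the moment of the hardware
	 * VM-exit; kvm_read_l1_tsc() then converts that raw TSC value into
	 * L1's view by applying L1's TSC offset and scaling.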
1007 */ 1008 if (msr_index == MSR_IA32_TSC) { 1009 int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, 1010 MSR_IA32_TSC); 1011 1012 if (i >= 0) { 1013 u64 val = vmx->msr_autostore.guest.val[i].value; 1014 1015 *data = kvm_read_l1_tsc(vcpu, val); 1016 return true; 1017 } 1018 } 1019 1020 if (kvm_get_msr(vcpu, msr_index, data)) { 1021 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, 1022 msr_index); 1023 return false; 1024 } 1025 return true; 1026 } 1027 1028 static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, 1029 struct vmx_msr_entry *e) 1030 { 1031 if (kvm_vcpu_read_guest(vcpu, 1032 gpa + i * sizeof(*e), 1033 e, 2 * sizeof(u32))) { 1034 pr_debug_ratelimited( 1035 "%s cannot read MSR entry (%u, 0x%08llx)\n", 1036 __func__, i, gpa + i * sizeof(*e)); 1037 return false; 1038 } 1039 if (nested_vmx_store_msr_check(vcpu, e)) { 1040 pr_debug_ratelimited( 1041 "%s check failed (%u, 0x%x, 0x%x)\n", 1042 __func__, i, e->index, e->reserved); 1043 return false; 1044 } 1045 return true; 1046 } 1047 1048 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 1049 { 1050 u64 data; 1051 u32 i; 1052 struct vmx_msr_entry e; 1053 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 1054 1055 for (i = 0; i < count; i++) { 1056 if (unlikely(i >= max_msr_list_size)) 1057 return -EINVAL; 1058 1059 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1060 return -EINVAL; 1061 1062 if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) 1063 return -EINVAL; 1064 1065 if (kvm_vcpu_write_guest(vcpu, 1066 gpa + i * sizeof(e) + 1067 offsetof(struct vmx_msr_entry, value), 1068 &data, sizeof(data))) { 1069 pr_debug_ratelimited( 1070 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 1071 __func__, i, e.index, data); 1072 return -EINVAL; 1073 } 1074 } 1075 return 0; 1076 } 1077 1078 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) 1079 { 1080 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1081 u32 count = vmcs12->vm_exit_msr_store_count; 1082 u64 gpa = vmcs12->vm_exit_msr_store_addr; 1083 struct vmx_msr_entry e; 1084 u32 i; 1085 1086 for (i = 0; i < count; i++) { 1087 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1088 return false; 1089 1090 if (e.index == msr_index) 1091 return true; 1092 } 1093 return false; 1094 } 1095 1096 static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, 1097 u32 msr_index) 1098 { 1099 struct vcpu_vmx *vmx = to_vmx(vcpu); 1100 struct vmx_msrs *autostore = &vmx->msr_autostore.guest; 1101 bool in_vmcs12_store_list; 1102 int msr_autostore_slot; 1103 bool in_autostore_list; 1104 int last; 1105 1106 msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); 1107 in_autostore_list = msr_autostore_slot >= 0; 1108 in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); 1109 1110 if (in_vmcs12_store_list && !in_autostore_list) { 1111 if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { 1112 /* 1113 * Emulated VMEntry does not fail here. Instead a less 1114 * accurate value will be returned by 1115 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() 1116 * instead of reading the value from the vmcs02 VMExit 1117 * MSR-store area. 1118 */ 1119 pr_warn_ratelimited( 1120 "Not enough msr entries in msr_autostore. 
Can't add msr %x\n", 1121 msr_index); 1122 return; 1123 } 1124 last = autostore->nr++; 1125 autostore->val[last].index = msr_index; 1126 } else if (!in_vmcs12_store_list && in_autostore_list) { 1127 last = --autostore->nr; 1128 autostore->val[msr_autostore_slot] = autostore->val[last]; 1129 } 1130 } 1131 1132 /* 1133 * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are 1134 * emulating VM-Entry into a guest with EPT enabled. On failure, the expected 1135 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to 1136 * @entry_failure_code. 1137 */ 1138 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, 1139 bool nested_ept, bool reload_pdptrs, 1140 enum vm_entry_failure_code *entry_failure_code) 1141 { 1142 if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) { 1143 *entry_failure_code = ENTRY_FAIL_DEFAULT; 1144 return -EINVAL; 1145 } 1146 1147 /* 1148 * If PAE paging and EPT are both on, CR3 is not used by the CPU and 1149 * must not be dereferenced. 1150 */ 1151 if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && 1152 CC(!load_pdptrs(vcpu, cr3))) { 1153 *entry_failure_code = ENTRY_FAIL_PDPTE; 1154 return -EINVAL; 1155 } 1156 1157 vcpu->arch.cr3 = cr3; 1158 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); 1159 1160 /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ 1161 kvm_init_mmu(vcpu); 1162 1163 if (!nested_ept) 1164 kvm_mmu_new_pgd(vcpu, cr3); 1165 1166 return 0; 1167 } 1168 1169 /* 1170 * Returns if KVM is able to config CPU to tag TLB entries 1171 * populated by L2 differently than TLB entries populated 1172 * by L1. 1173 * 1174 * If L0 uses EPT, L1 and L2 run with different EPTP because 1175 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries 1176 * are tagged with different EPTP. 1177 * 1178 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged 1179 * with different VPID (L1 entries are tagged with vmx->vpid 1180 * while L2 entries are tagged with vmx->nested.vpid02). 1181 */ 1182 static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) 1183 { 1184 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1185 1186 return enable_ept || 1187 (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); 1188 } 1189 1190 static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, 1191 struct vmcs12 *vmcs12, 1192 bool is_vmenter) 1193 { 1194 struct vcpu_vmx *vmx = to_vmx(vcpu); 1195 1196 /* Handle pending Hyper-V TLB flush requests */ 1197 kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept); 1198 1199 /* 1200 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings 1201 * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a 1202 * full TLB flush from the guest's perspective. This is required even 1203 * if VPID is disabled in the host as KVM may need to synchronize the 1204 * MMU in response to the guest TLB flush. 1205 * 1206 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. 1207 * EPT is a special snowflake, as guest-physical mappings aren't 1208 * flushed on VPID invalidations, including VM-Enter or VM-Exit with 1209 * VPID disabled. As a result, KVM _never_ needs to sync nEPT 1210 * entries on VM-Enter because L1 can't rely on VM-Enter to flush 1211 * those mappings. 1212 */ 1213 if (!nested_cpu_has_vpid(vmcs12)) { 1214 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1215 return; 1216 } 1217 1218 /* L2 should never have a VPID if VPID is disabled. */ 1219 WARN_ON(!enable_vpid); 1220 1221 /* 1222 * VPID is enabled and in use by vmcs12. 
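	 * KVM tries to give L2 a dedicated hardware VPID (vpid02) so that L1
	 * and L2 translations never share an ASID; see
	 * nested_has_guest_tlb_tag().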
If vpid12 is changing, then 1223 * emulate a guest TLB flush as KVM does not track vpid12 history nor 1224 * is the VPID incorporated into the MMU context. I.e. KVM must assume 1225 * that the new vpid12 has never been used and thus represents a new 1226 * guest ASID that cannot have entries in the TLB. 1227 */ 1228 if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) { 1229 vmx->nested.last_vpid = vmcs12->virtual_processor_id; 1230 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1231 return; 1232 } 1233 1234 /* 1235 * If VPID is enabled, used by vmc12, and vpid12 is not changing but 1236 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and 1237 * KVM was unable to allocate a VPID for L2, flush the current context 1238 * as the effective ASID is common to both L1 and L2. 1239 */ 1240 if (!nested_has_guest_tlb_tag(vcpu)) 1241 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 1242 } 1243 1244 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) 1245 { 1246 superset &= mask; 1247 subset &= mask; 1248 1249 return (superset | subset) == superset; 1250 } 1251 1252 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) 1253 { 1254 const u64 feature_and_reserved = 1255 /* feature (except bit 48; see below) */ 1256 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | 1257 /* reserved */ 1258 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); 1259 u64 vmx_basic = vmcs_config.nested.basic; 1260 1261 if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) 1262 return -EINVAL; 1263 1264 /* 1265 * KVM does not emulate a version of VMX that constrains physical 1266 * addresses of VMX structures (e.g. VMCS) to 32-bits. 1267 */ 1268 if (data & BIT_ULL(48)) 1269 return -EINVAL; 1270 1271 if (vmx_basic_vmcs_revision_id(vmx_basic) != 1272 vmx_basic_vmcs_revision_id(data)) 1273 return -EINVAL; 1274 1275 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) 1276 return -EINVAL; 1277 1278 vmx->nested.msrs.basic = data; 1279 return 0; 1280 } 1281 1282 static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index, 1283 u32 **low, u32 **high) 1284 { 1285 switch (msr_index) { 1286 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1287 *low = &msrs->pinbased_ctls_low; 1288 *high = &msrs->pinbased_ctls_high; 1289 break; 1290 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1291 *low = &msrs->procbased_ctls_low; 1292 *high = &msrs->procbased_ctls_high; 1293 break; 1294 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1295 *low = &msrs->exit_ctls_low; 1296 *high = &msrs->exit_ctls_high; 1297 break; 1298 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1299 *low = &msrs->entry_ctls_low; 1300 *high = &msrs->entry_ctls_high; 1301 break; 1302 case MSR_IA32_VMX_PROCBASED_CTLS2: 1303 *low = &msrs->secondary_ctls_low; 1304 *high = &msrs->secondary_ctls_high; 1305 break; 1306 default: 1307 BUG(); 1308 } 1309 } 1310 1311 static int 1312 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1313 { 1314 u32 *lowp, *highp; 1315 u64 supported; 1316 1317 vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp); 1318 1319 supported = vmx_control_msr(*lowp, *highp); 1320 1321 /* Check must-be-1 bits are still 1. */ 1322 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) 1323 return -EINVAL; 1324 1325 /* Check must-be-0 bits are still 0. 
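	 * (In the VMX capability MSR encoding, bits 31:0 advertise the allowed
	 * 0-settings, i.e. bits that must be 1 in the control, and bits 63:32
	 * the allowed 1-settings. For example, a hypothetical supported value
	 * of 0x000000ff00000016 would mean bits 1, 2 and 4 must stay set and
	 * only bits 0-7 may be set at all.)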
*/ 1326 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) 1327 return -EINVAL; 1328 1329 vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp); 1330 *lowp = data; 1331 *highp = data >> 32; 1332 return 0; 1333 } 1334 1335 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) 1336 { 1337 const u64 feature_and_reserved_bits = 1338 /* feature */ 1339 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | 1340 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | 1341 /* reserved */ 1342 GENMASK_ULL(13, 9) | BIT_ULL(31); 1343 u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, 1344 vmcs_config.nested.misc_high); 1345 1346 if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) 1347 return -EINVAL; 1348 1349 if ((vmx->nested.msrs.pinbased_ctls_high & 1350 PIN_BASED_VMX_PREEMPTION_TIMER) && 1351 vmx_misc_preemption_timer_rate(data) != 1352 vmx_misc_preemption_timer_rate(vmx_misc)) 1353 return -EINVAL; 1354 1355 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) 1356 return -EINVAL; 1357 1358 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) 1359 return -EINVAL; 1360 1361 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) 1362 return -EINVAL; 1363 1364 vmx->nested.msrs.misc_low = data; 1365 vmx->nested.msrs.misc_high = data >> 32; 1366 1367 return 0; 1368 } 1369 1370 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) 1371 { 1372 u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps, 1373 vmcs_config.nested.vpid_caps); 1374 1375 /* Every bit is either reserved or a feature bit. */ 1376 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) 1377 return -EINVAL; 1378 1379 vmx->nested.msrs.ept_caps = data; 1380 vmx->nested.msrs.vpid_caps = data >> 32; 1381 return 0; 1382 } 1383 1384 static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index) 1385 { 1386 switch (msr_index) { 1387 case MSR_IA32_VMX_CR0_FIXED0: 1388 return &msrs->cr0_fixed0; 1389 case MSR_IA32_VMX_CR4_FIXED0: 1390 return &msrs->cr4_fixed0; 1391 default: 1392 BUG(); 1393 } 1394 } 1395 1396 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1397 { 1398 const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index); 1399 1400 /* 1401 * 1 bits (which indicates bits which "must-be-1" during VMX operation) 1402 * must be 1 in the restored value. 1403 */ 1404 if (!is_bitwise_subset(data, *msr, -1ULL)) 1405 return -EINVAL; 1406 1407 *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data; 1408 return 0; 1409 } 1410 1411 /* 1412 * Called when userspace is restoring VMX MSRs. 1413 * 1414 * Returns 0 on success, non-0 otherwise. 1415 */ 1416 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1417 { 1418 struct vcpu_vmx *vmx = to_vmx(vcpu); 1419 1420 /* 1421 * Don't allow changes to the VMX capability MSRs while the vCPU 1422 * is in VMX operation. 1423 */ 1424 if (vmx->nested.vmxon) 1425 return -EBUSY; 1426 1427 switch (msr_index) { 1428 case MSR_IA32_VMX_BASIC: 1429 return vmx_restore_vmx_basic(vmx, data); 1430 case MSR_IA32_VMX_PINBASED_CTLS: 1431 case MSR_IA32_VMX_PROCBASED_CTLS: 1432 case MSR_IA32_VMX_EXIT_CTLS: 1433 case MSR_IA32_VMX_ENTRY_CTLS: 1434 /* 1435 * The "non-true" VMX capability MSRs are generated from the 1436 * "true" MSRs, so we do not support restoring them directly. 1437 * 1438 * If userspace wants to emulate VMX_BASIC[55]=0, userspace 1439 * should restore the "true" MSRs with the must-be-1 bits 1440 * set according to the SDM Vol 3. 
A.2 "RESERVED CONTROLS AND 1441 * DEFAULT SETTINGS". 1442 */ 1443 return -EINVAL; 1444 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1445 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1446 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1447 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1448 case MSR_IA32_VMX_PROCBASED_CTLS2: 1449 return vmx_restore_control_msr(vmx, msr_index, data); 1450 case MSR_IA32_VMX_MISC: 1451 return vmx_restore_vmx_misc(vmx, data); 1452 case MSR_IA32_VMX_CR0_FIXED0: 1453 case MSR_IA32_VMX_CR4_FIXED0: 1454 return vmx_restore_fixed0_msr(vmx, msr_index, data); 1455 case MSR_IA32_VMX_CR0_FIXED1: 1456 case MSR_IA32_VMX_CR4_FIXED1: 1457 /* 1458 * These MSRs are generated based on the vCPU's CPUID, so we 1459 * do not support restoring them directly. 1460 */ 1461 return -EINVAL; 1462 case MSR_IA32_VMX_EPT_VPID_CAP: 1463 return vmx_restore_vmx_ept_vpid_cap(vmx, data); 1464 case MSR_IA32_VMX_VMCS_ENUM: 1465 vmx->nested.msrs.vmcs_enum = data; 1466 return 0; 1467 case MSR_IA32_VMX_VMFUNC: 1468 if (data & ~vmcs_config.nested.vmfunc_controls) 1469 return -EINVAL; 1470 vmx->nested.msrs.vmfunc_controls = data; 1471 return 0; 1472 default: 1473 /* 1474 * The rest of the VMX capability MSRs do not support restore. 1475 */ 1476 return -EINVAL; 1477 } 1478 } 1479 1480 /* Returns 0 on success, non-0 otherwise. */ 1481 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) 1482 { 1483 switch (msr_index) { 1484 case MSR_IA32_VMX_BASIC: 1485 *pdata = msrs->basic; 1486 break; 1487 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1488 case MSR_IA32_VMX_PINBASED_CTLS: 1489 *pdata = vmx_control_msr( 1490 msrs->pinbased_ctls_low, 1491 msrs->pinbased_ctls_high); 1492 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) 1493 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1494 break; 1495 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1496 case MSR_IA32_VMX_PROCBASED_CTLS: 1497 *pdata = vmx_control_msr( 1498 msrs->procbased_ctls_low, 1499 msrs->procbased_ctls_high); 1500 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) 1501 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1502 break; 1503 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1504 case MSR_IA32_VMX_EXIT_CTLS: 1505 *pdata = vmx_control_msr( 1506 msrs->exit_ctls_low, 1507 msrs->exit_ctls_high); 1508 if (msr_index == MSR_IA32_VMX_EXIT_CTLS) 1509 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 1510 break; 1511 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1512 case MSR_IA32_VMX_ENTRY_CTLS: 1513 *pdata = vmx_control_msr( 1514 msrs->entry_ctls_low, 1515 msrs->entry_ctls_high); 1516 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) 1517 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 1518 break; 1519 case MSR_IA32_VMX_MISC: 1520 *pdata = vmx_control_msr( 1521 msrs->misc_low, 1522 msrs->misc_high); 1523 break; 1524 case MSR_IA32_VMX_CR0_FIXED0: 1525 *pdata = msrs->cr0_fixed0; 1526 break; 1527 case MSR_IA32_VMX_CR0_FIXED1: 1528 *pdata = msrs->cr0_fixed1; 1529 break; 1530 case MSR_IA32_VMX_CR4_FIXED0: 1531 *pdata = msrs->cr4_fixed0; 1532 break; 1533 case MSR_IA32_VMX_CR4_FIXED1: 1534 *pdata = msrs->cr4_fixed1; 1535 break; 1536 case MSR_IA32_VMX_VMCS_ENUM: 1537 *pdata = msrs->vmcs_enum; 1538 break; 1539 case MSR_IA32_VMX_PROCBASED_CTLS2: 1540 *pdata = vmx_control_msr( 1541 msrs->secondary_ctls_low, 1542 msrs->secondary_ctls_high); 1543 break; 1544 case MSR_IA32_VMX_EPT_VPID_CAP: 1545 *pdata = msrs->ept_caps | 1546 ((u64)msrs->vpid_caps << 32); 1547 break; 1548 case MSR_IA32_VMX_VMFUNC: 1549 *pdata = msrs->vmfunc_controls; 1550 break; 1551 default: 1552 return 1; 1553 } 1554 1555 return 0; 1556 } 1557 1558 /* 1559 * Copy the writable 
VMCS shadow fields back to the VMCS12, in case they have 1560 * been modified by the L1 guest. Note, "writable" in this context means 1561 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of 1562 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" 1563 * VM-exit information fields (which are actually writable if the vCPU is 1564 * configured to support "VMWRITE to any supported field in the VMCS"). 1565 */ 1566 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 1567 { 1568 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1569 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1570 struct shadow_vmcs_field field; 1571 unsigned long val; 1572 int i; 1573 1574 if (WARN_ON(!shadow_vmcs)) 1575 return; 1576 1577 preempt_disable(); 1578 1579 vmcs_load(shadow_vmcs); 1580 1581 for (i = 0; i < max_shadow_read_write_fields; i++) { 1582 field = shadow_read_write_fields[i]; 1583 val = __vmcs_readl(field.encoding); 1584 vmcs12_write_any(vmcs12, field.encoding, field.offset, val); 1585 } 1586 1587 vmcs_clear(shadow_vmcs); 1588 vmcs_load(vmx->loaded_vmcs->vmcs); 1589 1590 preempt_enable(); 1591 } 1592 1593 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 1594 { 1595 const struct shadow_vmcs_field *fields[] = { 1596 shadow_read_write_fields, 1597 shadow_read_only_fields 1598 }; 1599 const int max_fields[] = { 1600 max_shadow_read_write_fields, 1601 max_shadow_read_only_fields 1602 }; 1603 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1604 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1605 struct shadow_vmcs_field field; 1606 unsigned long val; 1607 int i, q; 1608 1609 if (WARN_ON(!shadow_vmcs)) 1610 return; 1611 1612 vmcs_load(shadow_vmcs); 1613 1614 for (q = 0; q < ARRAY_SIZE(fields); q++) { 1615 for (i = 0; i < max_fields[q]; i++) { 1616 field = fields[q][i]; 1617 val = vmcs12_read_any(vmcs12, field.encoding, 1618 field.offset); 1619 __vmcs_writel(field.encoding, val); 1620 } 1621 } 1622 1623 vmcs_clear(shadow_vmcs); 1624 vmcs_load(vmx->loaded_vmcs->vmcs); 1625 } 1626 1627 static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) 1628 { 1629 #ifdef CONFIG_KVM_HYPERV 1630 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1631 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 1632 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu); 1633 1634 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ 1635 vmcs12->tpr_threshold = evmcs->tpr_threshold; 1636 vmcs12->guest_rip = evmcs->guest_rip; 1637 1638 if (unlikely(!(hv_clean_fields & 1639 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) { 1640 hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page; 1641 hv_vcpu->nested.vm_id = evmcs->hv_vm_id; 1642 hv_vcpu->nested.vp_id = evmcs->hv_vp_id; 1643 } 1644 1645 if (unlikely(!(hv_clean_fields & 1646 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { 1647 vmcs12->guest_rsp = evmcs->guest_rsp; 1648 vmcs12->guest_rflags = evmcs->guest_rflags; 1649 vmcs12->guest_interruptibility_info = 1650 evmcs->guest_interruptibility_info; 1651 /* 1652 * Not present in struct vmcs12: 1653 * vmcs12->guest_ssp = evmcs->guest_ssp; 1654 */ 1655 } 1656 1657 if (unlikely(!(hv_clean_fields & 1658 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { 1659 vmcs12->cpu_based_vm_exec_control = 1660 evmcs->cpu_based_vm_exec_control; 1661 } 1662 1663 if (unlikely(!(hv_clean_fields & 1664 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { 1665 vmcs12->exception_bitmap = evmcs->exception_bitmap; 1666 } 1667 1668 if (unlikely(!(hv_clean_fields & 1669 
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { 1670 vmcs12->vm_entry_controls = evmcs->vm_entry_controls; 1671 } 1672 1673 if (unlikely(!(hv_clean_fields & 1674 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { 1675 vmcs12->vm_entry_intr_info_field = 1676 evmcs->vm_entry_intr_info_field; 1677 vmcs12->vm_entry_exception_error_code = 1678 evmcs->vm_entry_exception_error_code; 1679 vmcs12->vm_entry_instruction_len = 1680 evmcs->vm_entry_instruction_len; 1681 } 1682 1683 if (unlikely(!(hv_clean_fields & 1684 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { 1685 vmcs12->host_ia32_pat = evmcs->host_ia32_pat; 1686 vmcs12->host_ia32_efer = evmcs->host_ia32_efer; 1687 vmcs12->host_cr0 = evmcs->host_cr0; 1688 vmcs12->host_cr3 = evmcs->host_cr3; 1689 vmcs12->host_cr4 = evmcs->host_cr4; 1690 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; 1691 vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; 1692 vmcs12->host_rip = evmcs->host_rip; 1693 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; 1694 vmcs12->host_es_selector = evmcs->host_es_selector; 1695 vmcs12->host_cs_selector = evmcs->host_cs_selector; 1696 vmcs12->host_ss_selector = evmcs->host_ss_selector; 1697 vmcs12->host_ds_selector = evmcs->host_ds_selector; 1698 vmcs12->host_fs_selector = evmcs->host_fs_selector; 1699 vmcs12->host_gs_selector = evmcs->host_gs_selector; 1700 vmcs12->host_tr_selector = evmcs->host_tr_selector; 1701 vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl; 1702 /* 1703 * Not present in struct vmcs12: 1704 * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet; 1705 * vmcs12->host_ssp = evmcs->host_ssp; 1706 * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr; 1707 */ 1708 } 1709 1710 if (unlikely(!(hv_clean_fields & 1711 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { 1712 vmcs12->pin_based_vm_exec_control = 1713 evmcs->pin_based_vm_exec_control; 1714 vmcs12->vm_exit_controls = evmcs->vm_exit_controls; 1715 vmcs12->secondary_vm_exec_control = 1716 evmcs->secondary_vm_exec_control; 1717 } 1718 1719 if (unlikely(!(hv_clean_fields & 1720 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { 1721 vmcs12->io_bitmap_a = evmcs->io_bitmap_a; 1722 vmcs12->io_bitmap_b = evmcs->io_bitmap_b; 1723 } 1724 1725 if (unlikely(!(hv_clean_fields & 1726 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { 1727 vmcs12->msr_bitmap = evmcs->msr_bitmap; 1728 } 1729 1730 if (unlikely(!(hv_clean_fields & 1731 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { 1732 vmcs12->guest_es_base = evmcs->guest_es_base; 1733 vmcs12->guest_cs_base = evmcs->guest_cs_base; 1734 vmcs12->guest_ss_base = evmcs->guest_ss_base; 1735 vmcs12->guest_ds_base = evmcs->guest_ds_base; 1736 vmcs12->guest_fs_base = evmcs->guest_fs_base; 1737 vmcs12->guest_gs_base = evmcs->guest_gs_base; 1738 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; 1739 vmcs12->guest_tr_base = evmcs->guest_tr_base; 1740 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; 1741 vmcs12->guest_idtr_base = evmcs->guest_idtr_base; 1742 vmcs12->guest_es_limit = evmcs->guest_es_limit; 1743 vmcs12->guest_cs_limit = evmcs->guest_cs_limit; 1744 vmcs12->guest_ss_limit = evmcs->guest_ss_limit; 1745 vmcs12->guest_ds_limit = evmcs->guest_ds_limit; 1746 vmcs12->guest_fs_limit = evmcs->guest_fs_limit; 1747 vmcs12->guest_gs_limit = evmcs->guest_gs_limit; 1748 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; 1749 vmcs12->guest_tr_limit = evmcs->guest_tr_limit; 1750 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; 1751 vmcs12->guest_idtr_limit = 
evmcs->guest_idtr_limit; 1752 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; 1753 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; 1754 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; 1755 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; 1756 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; 1757 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; 1758 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; 1759 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; 1760 vmcs12->guest_es_selector = evmcs->guest_es_selector; 1761 vmcs12->guest_cs_selector = evmcs->guest_cs_selector; 1762 vmcs12->guest_ss_selector = evmcs->guest_ss_selector; 1763 vmcs12->guest_ds_selector = evmcs->guest_ds_selector; 1764 vmcs12->guest_fs_selector = evmcs->guest_fs_selector; 1765 vmcs12->guest_gs_selector = evmcs->guest_gs_selector; 1766 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; 1767 vmcs12->guest_tr_selector = evmcs->guest_tr_selector; 1768 } 1769 1770 if (unlikely(!(hv_clean_fields & 1771 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { 1772 vmcs12->tsc_offset = evmcs->tsc_offset; 1773 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; 1774 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; 1775 vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap; 1776 vmcs12->tsc_multiplier = evmcs->tsc_multiplier; 1777 } 1778 1779 if (unlikely(!(hv_clean_fields & 1780 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { 1781 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; 1782 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; 1783 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; 1784 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; 1785 vmcs12->guest_cr0 = evmcs->guest_cr0; 1786 vmcs12->guest_cr3 = evmcs->guest_cr3; 1787 vmcs12->guest_cr4 = evmcs->guest_cr4; 1788 vmcs12->guest_dr7 = evmcs->guest_dr7; 1789 } 1790 1791 if (unlikely(!(hv_clean_fields & 1792 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { 1793 vmcs12->host_fs_base = evmcs->host_fs_base; 1794 vmcs12->host_gs_base = evmcs->host_gs_base; 1795 vmcs12->host_tr_base = evmcs->host_tr_base; 1796 vmcs12->host_gdtr_base = evmcs->host_gdtr_base; 1797 vmcs12->host_idtr_base = evmcs->host_idtr_base; 1798 vmcs12->host_rsp = evmcs->host_rsp; 1799 } 1800 1801 if (unlikely(!(hv_clean_fields & 1802 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { 1803 vmcs12->ept_pointer = evmcs->ept_pointer; 1804 vmcs12->virtual_processor_id = evmcs->virtual_processor_id; 1805 } 1806 1807 if (unlikely(!(hv_clean_fields & 1808 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { 1809 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; 1810 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; 1811 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; 1812 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; 1813 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; 1814 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; 1815 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; 1816 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; 1817 vmcs12->guest_pending_dbg_exceptions = 1818 evmcs->guest_pending_dbg_exceptions; 1819 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; 1820 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; 1821 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; 1822 vmcs12->guest_activity_state = evmcs->guest_activity_state; 1823 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; 1824 vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl; 1825 /* 1826 * Not present in struct vmcs12: 1827 * vmcs12->guest_ia32_s_cet = 
evmcs->guest_ia32_s_cet; 1828 * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; 1829 * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; 1830 */ 1831 } 1832 1833 /* 1834 * Not used? 1835 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1836 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1837 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1838 * vmcs12->page_fault_error_code_mask = 1839 * evmcs->page_fault_error_code_mask; 1840 * vmcs12->page_fault_error_code_match = 1841 * evmcs->page_fault_error_code_match; 1842 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1843 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1844 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1845 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1846 */ 1847 1848 /* 1849 * Read only fields: 1850 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1851 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1852 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1853 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1854 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1855 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1856 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1857 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1858 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1859 * vmcs12->exit_qualification = evmcs->exit_qualification; 1860 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1861 * 1862 * Not present in struct vmcs12: 1863 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; 1864 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; 1865 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; 1866 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; 1867 */ 1868 1869 return; 1870 #else /* CONFIG_KVM_HYPERV */ 1871 KVM_BUG_ON(1, vmx->vcpu.kvm); 1872 #endif /* CONFIG_KVM_HYPERV */ 1873 } 1874 1875 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) 1876 { 1877 #ifdef CONFIG_KVM_HYPERV 1878 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1879 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 1880 1881 /* 1882 * Should not be changed by KVM: 1883 * 1884 * evmcs->host_es_selector = vmcs12->host_es_selector; 1885 * evmcs->host_cs_selector = vmcs12->host_cs_selector; 1886 * evmcs->host_ss_selector = vmcs12->host_ss_selector; 1887 * evmcs->host_ds_selector = vmcs12->host_ds_selector; 1888 * evmcs->host_fs_selector = vmcs12->host_fs_selector; 1889 * evmcs->host_gs_selector = vmcs12->host_gs_selector; 1890 * evmcs->host_tr_selector = vmcs12->host_tr_selector; 1891 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; 1892 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; 1893 * evmcs->host_cr0 = vmcs12->host_cr0; 1894 * evmcs->host_cr3 = vmcs12->host_cr3; 1895 * evmcs->host_cr4 = vmcs12->host_cr4; 1896 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; 1897 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; 1898 * evmcs->host_rip = vmcs12->host_rip; 1899 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; 1900 * evmcs->host_fs_base = vmcs12->host_fs_base; 1901 * evmcs->host_gs_base = vmcs12->host_gs_base; 1902 * evmcs->host_tr_base = vmcs12->host_tr_base; 1903 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; 1904 * 
evmcs->host_idtr_base = vmcs12->host_idtr_base; 1905 * evmcs->host_rsp = vmcs12->host_rsp; 1906 * sync_vmcs02_to_vmcs12() doesn't read these: 1907 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; 1908 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; 1909 * evmcs->msr_bitmap = vmcs12->msr_bitmap; 1910 * evmcs->ept_pointer = vmcs12->ept_pointer; 1911 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1912 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1913 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1914 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1915 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1916 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1917 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1918 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1919 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1920 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1921 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1922 * evmcs->page_fault_error_code_mask = 1923 * vmcs12->page_fault_error_code_mask; 1924 * evmcs->page_fault_error_code_match = 1925 * vmcs12->page_fault_error_code_match; 1926 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1927 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1928 * evmcs->tsc_offset = vmcs12->tsc_offset; 1929 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1930 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1931 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1932 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1933 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1934 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1935 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1936 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1937 * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl; 1938 * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl; 1939 * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap; 1940 * evmcs->tsc_multiplier = vmcs12->tsc_multiplier; 1941 * 1942 * Not present in struct vmcs12: 1943 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1944 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1945 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1946 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1947 * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet; 1948 * evmcs->host_ssp = vmcs12->host_ssp; 1949 * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr; 1950 * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet; 1951 * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl; 1952 * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr; 1953 * evmcs->guest_ssp = vmcs12->guest_ssp; 1954 */ 1955 1956 evmcs->guest_es_selector = vmcs12->guest_es_selector; 1957 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 1958 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 1959 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 1960 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 1961 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 1962 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 1963 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 1964 1965 evmcs->guest_es_limit = vmcs12->guest_es_limit; 1966 
evmcs->guest_cs_limit = vmcs12->guest_cs_limit; 1967 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 1968 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 1969 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 1970 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 1971 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 1972 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 1973 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 1974 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 1975 1976 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 1977 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 1978 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 1979 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 1980 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 1981 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 1982 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 1983 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 1984 1985 evmcs->guest_es_base = vmcs12->guest_es_base; 1986 evmcs->guest_cs_base = vmcs12->guest_cs_base; 1987 evmcs->guest_ss_base = vmcs12->guest_ss_base; 1988 evmcs->guest_ds_base = vmcs12->guest_ds_base; 1989 evmcs->guest_fs_base = vmcs12->guest_fs_base; 1990 evmcs->guest_gs_base = vmcs12->guest_gs_base; 1991 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 1992 evmcs->guest_tr_base = vmcs12->guest_tr_base; 1993 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 1994 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 1995 1996 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 1997 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 1998 1999 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 2000 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 2001 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 2002 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 2003 2004 evmcs->guest_pending_dbg_exceptions = 2005 vmcs12->guest_pending_dbg_exceptions; 2006 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 2007 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 2008 2009 evmcs->guest_activity_state = vmcs12->guest_activity_state; 2010 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 2011 2012 evmcs->guest_cr0 = vmcs12->guest_cr0; 2013 evmcs->guest_cr3 = vmcs12->guest_cr3; 2014 evmcs->guest_cr4 = vmcs12->guest_cr4; 2015 evmcs->guest_dr7 = vmcs12->guest_dr7; 2016 2017 evmcs->guest_physical_address = vmcs12->guest_physical_address; 2018 2019 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 2020 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 2021 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 2022 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 2023 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 2024 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 2025 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 2026 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 2027 2028 evmcs->exit_qualification = vmcs12->exit_qualification; 2029 2030 evmcs->guest_linear_address = vmcs12->guest_linear_address; 2031 evmcs->guest_rsp = vmcs12->guest_rsp; 2032 evmcs->guest_rflags = vmcs12->guest_rflags; 2033 2034 evmcs->guest_interruptibility_info = 2035 vmcs12->guest_interruptibility_info; 2036 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 2037 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 2038 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 2039 evmcs->vm_entry_exception_error_code = 2040 vmcs12->vm_entry_exception_error_code; 2041 
	evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;

	evmcs->guest_rip = vmcs12->guest_rip;

	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;

	return;
#else /* CONFIG_KVM_HYPERV */
	KVM_BUG_ON(1, vmx->vcpu.kvm);
#endif /* CONFIG_KVM_HYPERV */
}

/*
 * This is the equivalent of the nested hypervisor executing the vmptrld
 * instruction.
 */
static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
	struct kvm_vcpu *vcpu, bool from_launch)
{
#ifdef CONFIG_KVM_HYPERV
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool evmcs_gpa_changed = false;
	u64 evmcs_gpa;

	if (likely(!guest_cpuid_has_evmcs(vcpu)))
		return EVMPTRLD_DISABLED;

	evmcs_gpa = nested_get_evmptr(vcpu);
	if (!evmptr_is_valid(evmcs_gpa)) {
		nested_release_evmcs(vcpu);
		return EVMPTRLD_DISABLED;
	}

	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
		vmx->nested.current_vmptr = INVALID_GPA;

		nested_release_evmcs(vcpu);

		if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
				 &vmx->nested.hv_evmcs_map))
			return EVMPTRLD_ERROR;

		vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;

		/*
		 * Currently, KVM only supports eVMCS version 1
		 * (== KVM_EVMCS_VERSION) and thus expects the guest to set the
		 * first u32 field of the eVMCS to this value; that field holds
		 * the eVMCS VersionNumber.
		 *
		 * The guest should learn the supported eVMCS versions from the
		 * host by examining CPUID.0x4000000A.EAX[0:15]. The host
		 * userspace VMM is expected to set this CPUID leaf according
		 * to the value returned in vmcs_version from
		 * nested_enable_evmcs().
		 *
		 * However, it turns out that Microsoft Hyper-V fails to comply
		 * with its own invented interface: when Hyper-V uses eVMCS, it
		 * sets the first u32 field of the eVMCS to the revision_id
		 * specified in MSR_IA32_VMX_BASIC instead of an eVMCS version
		 * number, i.e. one of the supported versions advertised in
		 * CPUID.0x4000000A.EAX[0:15].
		 *
		 * To work around the Hyper-V bug, accept either a supported
		 * eVMCS version or the VMCS12 revision_id as valid values for
		 * the first u32 field of the eVMCS.
		 */
		if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
		    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
			nested_release_evmcs(vcpu);
			return EVMPTRLD_VMFAIL;
		}

		vmx->nested.hv_evmcs_vmptr = evmcs_gpa;

		evmcs_gpa_changed = true;
		/*
		 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
		 * reloaded from the guest's memory (read-only fields, fields
		 * not present in struct hv_enlightened_vmcs, ...). Make sure
		 * there are no leftovers.
		 */
		if (from_launch) {
			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
			memset(vmcs12, 0, sizeof(*vmcs12));
			vmcs12->hdr.revision_id = VMCS12_REVISION;
		}

	}

	/*
	 * Clean fields data can't be used on VMLAUNCH and when we switch
	 * between different L2 guests as KVM keeps a single VMCS12 per L1.
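	 *
	 * A minimal sketch of how a clean-fields bit gates a copy group (the
	 * helper below is illustrative only, not an existing function):
	 *
	 *	static bool evmcs_group_dirty(const struct hv_enlightened_vmcs *e,
	 *				      u32 group_bit)
	 *	{
	 *		return !(e->hv_clean_fields & group_bit);
	 *	}
	 *
	 * Clearing HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL below therefore makes
	 * every field group look dirty on the next
	 * copy_enlightened_to_vmcs12().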
2133 */ 2134 if (from_launch || evmcs_gpa_changed) { 2135 vmx->nested.hv_evmcs->hv_clean_fields &= 2136 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2137 2138 vmx->nested.force_msr_bitmap_recalc = true; 2139 } 2140 2141 return EVMPTRLD_SUCCEEDED; 2142 #else 2143 return EVMPTRLD_DISABLED; 2144 #endif 2145 } 2146 2147 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2148 { 2149 struct vcpu_vmx *vmx = to_vmx(vcpu); 2150 2151 if (nested_vmx_is_evmptr12_valid(vmx)) 2152 copy_vmcs12_to_enlightened(vmx); 2153 else 2154 copy_vmcs12_to_shadow(vmx); 2155 2156 vmx->nested.need_vmcs12_to_shadow_sync = false; 2157 } 2158 2159 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2160 { 2161 struct vcpu_vmx *vmx = 2162 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2163 2164 vmx->nested.preemption_timer_expired = true; 2165 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2166 kvm_vcpu_kick(&vmx->vcpu); 2167 2168 return HRTIMER_NORESTART; 2169 } 2170 2171 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2172 { 2173 struct vcpu_vmx *vmx = to_vmx(vcpu); 2174 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2175 2176 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2177 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2178 2179 if (!vmx->nested.has_preemption_timer_deadline) { 2180 vmx->nested.preemption_timer_deadline = 2181 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2182 vmx->nested.has_preemption_timer_deadline = true; 2183 } 2184 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2185 } 2186 2187 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2188 u64 preemption_timeout) 2189 { 2190 struct vcpu_vmx *vmx = to_vmx(vcpu); 2191 2192 /* 2193 * A timer value of zero is architecturally guaranteed to cause 2194 * a VMExit prior to executing any instructions in the guest. 2195 */ 2196 if (preemption_timeout == 0) { 2197 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2198 return; 2199 } 2200 2201 if (vcpu->arch.virtual_tsc_khz == 0) 2202 return; 2203 2204 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2205 preemption_timeout *= 1000000; 2206 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2207 hrtimer_start(&vmx->nested.preemption_timer, 2208 ktime_add_ns(ktime_get(), preemption_timeout), 2209 HRTIMER_MODE_ABS_PINNED); 2210 } 2211 2212 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2213 { 2214 if (vmx->nested.nested_run_pending && 2215 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2216 return vmcs12->guest_ia32_efer; 2217 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2218 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2219 else 2220 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2221 } 2222 2223 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2224 { 2225 struct kvm *kvm = vmx->vcpu.kvm; 2226 2227 /* 2228 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2229 * according to L0's settings (vmcs12 is irrelevant here). Host 2230 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2231 * will be set as needed prior to VMLAUNCH/VMRESUME. 2232 */ 2233 if (vmx->nested.vmcs02_initialized) 2234 return; 2235 vmx->nested.vmcs02_initialized = true; 2236 2237 /* 2238 * We don't care what the EPTP value is we just need to guarantee 2239 * it's valid so we don't get a false positive when doing early 2240 * consistency checks. 
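	 *
	 * Purely as an illustration: construct_eptp(&vmx->vcpu, 0,
	 * PT64_ROOT_4LEVEL) below yields an encoding with a legal memory type
	 * and page-walk length but a zero root HPA, which is fine because the
	 * early-check VM-Enter in nested_vmx_check_vmentry_hw() is
	 * deliberately forced to fail before any L2 code can run.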
2241 */ 2242 if (enable_ept && nested_early_check) 2243 vmcs_write64(EPT_POINTER, 2244 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); 2245 2246 if (vmx->ve_info) 2247 vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); 2248 2249 /* All VMFUNCs are currently emulated through L0 vmexits. */ 2250 if (cpu_has_vmx_vmfunc()) 2251 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2252 2253 if (cpu_has_vmx_posted_intr()) 2254 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2255 2256 if (cpu_has_vmx_msr_bitmap()) 2257 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2258 2259 /* 2260 * PML is emulated for L2, but never enabled in hardware as the MMU 2261 * handles A/D emulation. Disabling PML for L2 also avoids having to 2262 * deal with filtering out L2 GPAs from the buffer. 2263 */ 2264 if (enable_pml) { 2265 vmcs_write64(PML_ADDRESS, 0); 2266 vmcs_write16(GUEST_PML_INDEX, -1); 2267 } 2268 2269 if (cpu_has_vmx_encls_vmexit()) 2270 vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); 2271 2272 if (kvm_notify_vmexit_enabled(kvm)) 2273 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 2274 2275 /* 2276 * Set the MSR load/store lists to match L0's settings. Only the 2277 * addresses are constant (for vmcs02), the counts can change based 2278 * on L2's behavior, e.g. switching to/from long mode. 2279 */ 2280 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2281 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2282 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2283 2284 vmx_set_constant_host_state(vmx); 2285 } 2286 2287 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2288 struct vmcs12 *vmcs12) 2289 { 2290 prepare_vmcs02_constant_state(vmx); 2291 2292 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 2293 2294 if (enable_vpid) { 2295 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2296 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2297 else 2298 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2299 } 2300 } 2301 2302 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, 2303 struct vmcs12 *vmcs12) 2304 { 2305 u32 exec_control; 2306 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2307 2308 if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) 2309 prepare_vmcs02_early_rare(vmx, vmcs12); 2310 2311 /* 2312 * PIN CONTROLS 2313 */ 2314 exec_control = __pin_controls_get(vmcs01); 2315 exec_control |= (vmcs12->pin_based_vm_exec_control & 2316 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2317 2318 /* Posted interrupts setting is only taken from vmcs12. 
*/ 2319 vmx->nested.pi_pending = false; 2320 if (nested_cpu_has_posted_intr(vmcs12)) 2321 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2322 else 2323 exec_control &= ~PIN_BASED_POSTED_INTR; 2324 pin_controls_set(vmx, exec_control); 2325 2326 /* 2327 * EXEC CONTROLS 2328 */ 2329 exec_control = __exec_controls_get(vmcs01); /* L0's desires */ 2330 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2331 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2332 exec_control &= ~CPU_BASED_TPR_SHADOW; 2333 exec_control |= vmcs12->cpu_based_vm_exec_control; 2334 2335 vmx->nested.l1_tpr_threshold = -1; 2336 if (exec_control & CPU_BASED_TPR_SHADOW) 2337 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2338 #ifdef CONFIG_X86_64 2339 else 2340 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2341 CPU_BASED_CR8_STORE_EXITING; 2342 #endif 2343 2344 /* 2345 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2346 * for I/O port accesses. 2347 */ 2348 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2349 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2350 2351 /* 2352 * This bit will be computed in nested_get_vmcs12_pages, because 2353 * we do not have access to L1's MSR bitmap yet. For now, keep 2354 * the same bit as before, hoping to avoid multiple VMWRITEs that 2355 * only set/clear this bit. 2356 */ 2357 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2358 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2359 2360 exec_controls_set(vmx, exec_control); 2361 2362 /* 2363 * SECONDARY EXEC CONTROLS 2364 */ 2365 if (cpu_has_secondary_exec_ctrls()) { 2366 exec_control = __secondary_exec_controls_get(vmcs01); 2367 2368 /* Take the following fields only from vmcs12 */ 2369 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2370 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2371 SECONDARY_EXEC_ENABLE_INVPCID | 2372 SECONDARY_EXEC_ENABLE_RDTSCP | 2373 SECONDARY_EXEC_ENABLE_XSAVES | 2374 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2375 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2376 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2377 SECONDARY_EXEC_ENABLE_VMFUNC | 2378 SECONDARY_EXEC_DESC); 2379 2380 if (nested_cpu_has(vmcs12, 2381 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2382 exec_control |= vmcs12->secondary_vm_exec_control; 2383 2384 /* PML is emulated and never enabled in hardware for L2. */ 2385 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2386 2387 /* VMCS shadowing for L2 is emulated for now */ 2388 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2389 2390 /* 2391 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2392 * will not have to rewrite the controls just for this bit. 2393 */ 2394 if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2395 exec_control |= SECONDARY_EXEC_DESC; 2396 2397 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2398 vmcs_write16(GUEST_INTR_STATUS, 2399 vmcs12->guest_intr_status); 2400 2401 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2402 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2403 2404 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2405 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2406 2407 secondary_exec_controls_set(vmx, exec_control); 2408 } 2409 2410 /* 2411 * ENTRY CONTROLS 2412 * 2413 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2414 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2415 * on the related bits (if supported by the CPU) in the hope that 2416 * we can avoid VMWrites during vmx_set_efer(). 
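	 *
	 * For example (values purely illustrative): if guest_efer has EFER_LMA
	 * set and differs from kvm_host.efer, the code below sets both
	 * VM_ENTRY_IA32E_MODE and VM_ENTRY_LOAD_IA32_EFER up front, so
	 * vmx_set_efer() later finds the entry controls already correct and
	 * can skip the corresponding VMWRITE.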
2417 * 2418 * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is 2419 * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to 2420 * do the same for L2. 2421 */ 2422 exec_control = __vm_entry_controls_get(vmcs01); 2423 exec_control |= (vmcs12->vm_entry_controls & 2424 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 2425 exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); 2426 if (cpu_has_load_ia32_efer()) { 2427 if (guest_efer & EFER_LMA) 2428 exec_control |= VM_ENTRY_IA32E_MODE; 2429 if (guest_efer != kvm_host.efer) 2430 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2431 } 2432 vm_entry_controls_set(vmx, exec_control); 2433 2434 /* 2435 * EXIT CONTROLS 2436 * 2437 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2438 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2439 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 2440 */ 2441 exec_control = __vm_exit_controls_get(vmcs01); 2442 if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer) 2443 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2444 else 2445 exec_control &= ~VM_EXIT_LOAD_IA32_EFER; 2446 vm_exit_controls_set(vmx, exec_control); 2447 2448 /* 2449 * Interrupt/Exception Fields 2450 */ 2451 if (vmx->nested.nested_run_pending) { 2452 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2453 vmcs12->vm_entry_intr_info_field); 2454 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2455 vmcs12->vm_entry_exception_error_code); 2456 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2457 vmcs12->vm_entry_instruction_len); 2458 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2459 vmcs12->guest_interruptibility_info); 2460 vmx->loaded_vmcs->nmi_known_unmasked = 2461 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2462 } else { 2463 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2464 } 2465 } 2466 2467 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2468 { 2469 struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); 2470 2471 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2472 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2473 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2474 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2475 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2476 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2477 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2478 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2479 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2480 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2481 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2482 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2483 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2484 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2485 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2486 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2487 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2488 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2489 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2490 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2491 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2492 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2493 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2494 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2495 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2496 
vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2497 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2498 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2499 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2500 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2501 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2502 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2503 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2504 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2505 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2506 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2507 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2508 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2509 2510 vmx->segment_cache.bitmask = 0; 2511 } 2512 2513 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2514 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 2515 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2516 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 2517 vmcs12->guest_pending_dbg_exceptions); 2518 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2519 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2520 2521 /* 2522 * L1 may access the L2's PDPTR, so save them to construct 2523 * vmcs12 2524 */ 2525 if (enable_ept) { 2526 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2527 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2528 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2529 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2530 } 2531 2532 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2533 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2534 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2535 } 2536 2537 if (nested_cpu_has_xsaves(vmcs12)) 2538 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2539 2540 /* 2541 * Whether page-faults are trapped is determined by a combination of 2542 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 2543 * doesn't care about page faults then we should set all of these to 2544 * L1's desires. However, if L0 does care about (some) page faults, it 2545 * is not easy (if at all possible?) to merge L0 and L1's desires, we 2546 * simply ask to exit on each and every L2 page fault. This is done by 2547 * setting MASK=MATCH=0 and (see below) EB.PF=1. 2548 * Note that below we don't need special code to set EB.PF beyond the 2549 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2550 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2551 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2552 */ 2553 if (vmx_need_pf_intercept(&vmx->vcpu)) { 2554 /* 2555 * TODO: if both L0 and L1 need the same MASK and MATCH, 2556 * go ahead and use it? 2557 */ 2558 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 2559 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 2560 } else { 2561 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); 2562 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); 2563 } 2564 2565 if (cpu_has_vmx_apicv()) { 2566 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2567 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2568 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2569 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2570 } 2571 2572 /* 2573 * Make sure the msr_autostore list is up to date before we set the 2574 * count in the vmcs02. 
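	 * The corresponding list addresses were written once in
	 * prepare_vmcs02_constant_state(); only the counts below change from
	 * entry to entry.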
	 */
	prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);

	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);

	set_cr4_guest_host_mask(vmx);
}

/*
 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
 * guest in a way that is appropriate both to L1's requests and to our needs.
 * In addition to modifying the active vmcs (which is vmcs02), this function
 * also has necessary side effects, like setting various vcpu->arch fields.
 *
 * Returns 0 on success, -EINVAL on failure; on failure, the invalid-state
 * exit qualification code is assigned to *entry_failure_code.
 */
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			  bool from_vmentry,
			  enum vm_entry_failure_code *entry_failure_code)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
	bool load_guest_pdptrs_vmcs12 = false;

	if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) {
		prepare_vmcs02_rare(vmx, vmcs12);
		vmx->nested.dirty_vmcs12 = false;

		load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) ||
			!(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
	}

	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
	} else {
		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
	}
	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
		vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
	vmx_set_rflags(vcpu, vmcs12->guest_rflags);

	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
	 * bitwise-or of what L1 wants to trap for L2, and what we want to
	 * trap. Note that CR0.TS also needs updating; we do this later.
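	 *
	 * A sketch of the CR0 mask merge performed just below, using the same
	 * names as the code:
	 *
	 *	owned  = vcpu->arch.cr0_guest_owned_bits;
	 *	owned &= ~vmcs12->cr0_guest_host_mask;
	 *	vmcs02.CR0_GUEST_HOST_MASK = ~owned;
	 *
	 * i.e. a CR0 bit is guest-owned in vmcs02 only if both L0 and L1 agree
	 * to let L2 own it.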
2628 */ 2629 vmx_update_exception_bitmap(vcpu); 2630 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 2631 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2632 2633 if (vmx->nested.nested_run_pending && 2634 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { 2635 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 2636 vcpu->arch.pat = vmcs12->guest_ia32_pat; 2637 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2638 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 2639 } 2640 2641 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( 2642 vcpu->arch.l1_tsc_offset, 2643 vmx_get_l2_tsc_offset(vcpu), 2644 vmx_get_l2_tsc_multiplier(vcpu)); 2645 2646 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( 2647 vcpu->arch.l1_tsc_scaling_ratio, 2648 vmx_get_l2_tsc_multiplier(vcpu)); 2649 2650 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2651 if (kvm_caps.has_tsc_control) 2652 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 2653 2654 nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); 2655 2656 if (nested_cpu_has_ept(vmcs12)) 2657 nested_ept_init_mmu_context(vcpu); 2658 2659 /* 2660 * Override the CR0/CR4 read shadows after setting the effective guest 2661 * CR0/CR4. The common helpers also set the shadows, but they don't 2662 * account for vmcs12's cr0/4_guest_host_mask. 2663 */ 2664 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 2665 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2666 2667 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 2668 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 2669 2670 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); 2671 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 2672 vmx_set_efer(vcpu, vcpu->arch.efer); 2673 2674 /* 2675 * Guest state is invalid and unrestricted guest is disabled, 2676 * which means L1 attempted VMEntry to L2 with invalid state. 2677 * Fail the VMEntry. 2678 * 2679 * However when force loading the guest state (SMM exit or 2680 * loading nested state after migration, it is possible to 2681 * have invalid guest state now, which will be later fixed by 2682 * restoring L2 register state 2683 */ 2684 if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) { 2685 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2686 return -EINVAL; 2687 } 2688 2689 /* Shadow page tables on either EPT or shadow page tables. */ 2690 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), 2691 from_vmentry, entry_failure_code)) 2692 return -EINVAL; 2693 2694 /* 2695 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 2696 * on nested VM-Exit, which can occur without actually running L2 and 2697 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with 2698 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the 2699 * transition to HLT instead of running L2. 2700 */ 2701 if (enable_ept) 2702 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); 2703 2704 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. 
*/ 2705 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2706 is_pae_paging(vcpu)) { 2707 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2708 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2709 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2710 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2711 } 2712 2713 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2714 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && 2715 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2716 vmcs12->guest_ia32_perf_global_ctrl))) { 2717 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2718 return -EINVAL; 2719 } 2720 2721 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2722 kvm_rip_write(vcpu, vmcs12->guest_rip); 2723 2724 /* 2725 * It was observed that genuine Hyper-V running in L1 doesn't reset 2726 * 'hv_clean_fields' by itself, it only sets the corresponding dirty 2727 * bits when it changes a field in eVMCS. Mark all fields as clean 2728 * here. 2729 */ 2730 if (nested_vmx_is_evmptr12_valid(vmx)) 2731 evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2732 2733 return 0; 2734 } 2735 2736 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2737 { 2738 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2739 nested_cpu_has_virtual_nmis(vmcs12))) 2740 return -EINVAL; 2741 2742 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2743 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2744 return -EINVAL; 2745 2746 return 0; 2747 } 2748 2749 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) 2750 { 2751 struct vcpu_vmx *vmx = to_vmx(vcpu); 2752 2753 /* Check for memory type validity */ 2754 switch (new_eptp & VMX_EPTP_MT_MASK) { 2755 case VMX_EPTP_MT_UC: 2756 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2757 return false; 2758 break; 2759 case VMX_EPTP_MT_WB: 2760 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2761 return false; 2762 break; 2763 default: 2764 return false; 2765 } 2766 2767 /* Page-walk levels validity. 
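	 *
	 * For reference, the EPTP encoding being validated here: bits 2:0 are
	 * the memory type, bits 5:3 are the page-walk length minus one (so a
	 * four-level walk is encoded as 3), bit 6 enables accessed/dirty
	 * flags, bits 11:7 are reserved (checked further down via
	 * (new_eptp >> 7) & 0x1f), and the upper bits hold the root table
	 * address.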
*/ 2768 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2769 case VMX_EPTP_PWL_5: 2770 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2771 return false; 2772 break; 2773 case VMX_EPTP_PWL_4: 2774 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2775 return false; 2776 break; 2777 default: 2778 return false; 2779 } 2780 2781 /* Reserved bits should not be set */ 2782 if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2783 return false; 2784 2785 /* AD, if set, should be supported */ 2786 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2787 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2788 return false; 2789 } 2790 2791 return true; 2792 } 2793 2794 /* 2795 * Checks related to VM-Execution Control Fields 2796 */ 2797 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2798 struct vmcs12 *vmcs12) 2799 { 2800 struct vcpu_vmx *vmx = to_vmx(vcpu); 2801 2802 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2803 vmx->nested.msrs.pinbased_ctls_low, 2804 vmx->nested.msrs.pinbased_ctls_high)) || 2805 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2806 vmx->nested.msrs.procbased_ctls_low, 2807 vmx->nested.msrs.procbased_ctls_high))) 2808 return -EINVAL; 2809 2810 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2811 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2812 vmx->nested.msrs.secondary_ctls_low, 2813 vmx->nested.msrs.secondary_ctls_high))) 2814 return -EINVAL; 2815 2816 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2817 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2818 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2819 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2820 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2821 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2822 nested_vmx_check_nmi_controls(vmcs12) || 2823 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2824 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2825 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2826 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2827 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2828 return -EINVAL; 2829 2830 if (!nested_cpu_has_preemption_timer(vmcs12) && 2831 nested_cpu_has_save_preemption_timer(vmcs12)) 2832 return -EINVAL; 2833 2834 if (nested_cpu_has_ept(vmcs12) && 2835 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2836 return -EINVAL; 2837 2838 if (nested_cpu_has_vmfunc(vmcs12)) { 2839 if (CC(vmcs12->vm_function_control & 2840 ~vmx->nested.msrs.vmfunc_controls)) 2841 return -EINVAL; 2842 2843 if (nested_cpu_has_eptp_switching(vmcs12)) { 2844 if (CC(!nested_cpu_has_ept(vmcs12)) || 2845 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2846 return -EINVAL; 2847 } 2848 } 2849 2850 return 0; 2851 } 2852 2853 /* 2854 * Checks related to VM-Exit Control Fields 2855 */ 2856 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2857 struct vmcs12 *vmcs12) 2858 { 2859 struct vcpu_vmx *vmx = to_vmx(vcpu); 2860 2861 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2862 vmx->nested.msrs.exit_ctls_low, 2863 vmx->nested.msrs.exit_ctls_high)) || 2864 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2865 return -EINVAL; 2866 2867 return 0; 2868 } 2869 2870 /* 2871 * Checks related to VM-Entry Control Fields 2872 */ 2873 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2874 struct vmcs12 *vmcs12) 2875 { 2876 struct 
vcpu_vmx *vmx = to_vmx(vcpu); 2877 2878 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2879 vmx->nested.msrs.entry_ctls_low, 2880 vmx->nested.msrs.entry_ctls_high))) 2881 return -EINVAL; 2882 2883 /* 2884 * From the Intel SDM, volume 3: 2885 * Fields relevant to VM-entry event injection must be set properly. 2886 * These fields are the VM-entry interruption-information field, the 2887 * VM-entry exception error code, and the VM-entry instruction length. 2888 */ 2889 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2890 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2891 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2892 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2893 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2894 bool should_have_error_code; 2895 bool urg = nested_cpu_has2(vmcs12, 2896 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2897 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2898 2899 /* VM-entry interruption-info field: interruption type */ 2900 if (CC(intr_type == INTR_TYPE_RESERVED) || 2901 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2902 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2903 return -EINVAL; 2904 2905 /* VM-entry interruption-info field: vector */ 2906 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2907 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2908 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2909 return -EINVAL; 2910 2911 /* VM-entry interruption-info field: deliver error code */ 2912 should_have_error_code = 2913 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2914 x86_exception_has_error_code(vector); 2915 if (CC(has_error_code != should_have_error_code)) 2916 return -EINVAL; 2917 2918 /* VM-entry exception error code */ 2919 if (CC(has_error_code && 2920 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2921 return -EINVAL; 2922 2923 /* VM-entry interruption-info field: reserved bits */ 2924 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2925 return -EINVAL; 2926 2927 /* VM-entry instruction length */ 2928 switch (intr_type) { 2929 case INTR_TYPE_SOFT_EXCEPTION: 2930 case INTR_TYPE_SOFT_INTR: 2931 case INTR_TYPE_PRIV_SW_EXCEPTION: 2932 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2933 CC(vmcs12->vm_entry_instruction_len == 0 && 2934 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2935 return -EINVAL; 2936 } 2937 } 2938 2939 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2940 return -EINVAL; 2941 2942 return 0; 2943 } 2944 2945 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2946 struct vmcs12 *vmcs12) 2947 { 2948 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2949 nested_check_vm_exit_controls(vcpu, vmcs12) || 2950 nested_check_vm_entry_controls(vcpu, vmcs12)) 2951 return -EINVAL; 2952 2953 #ifdef CONFIG_KVM_HYPERV 2954 if (guest_cpuid_has_evmcs(vcpu)) 2955 return nested_evmcs_check_controls(vmcs12); 2956 #endif 2957 2958 return 0; 2959 } 2960 2961 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, 2962 struct vmcs12 *vmcs12) 2963 { 2964 #ifdef CONFIG_X86_64 2965 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != 2966 !!(vcpu->arch.efer & EFER_LMA))) 2967 return -EINVAL; 2968 #endif 2969 return 0; 2970 } 2971 2972 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2973 struct vmcs12 *vmcs12) 2974 { 2975 bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); 2976 2977 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2978 
CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 2979 CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) 2980 return -EINVAL; 2981 2982 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2983 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2984 return -EINVAL; 2985 2986 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2987 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2988 return -EINVAL; 2989 2990 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 2991 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2992 vmcs12->host_ia32_perf_global_ctrl))) 2993 return -EINVAL; 2994 2995 if (ia32e) { 2996 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 2997 return -EINVAL; 2998 } else { 2999 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 3000 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 3001 CC((vmcs12->host_rip) >> 32)) 3002 return -EINVAL; 3003 } 3004 3005 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3006 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3007 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3008 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3009 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3010 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3011 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3012 CC(vmcs12->host_cs_selector == 0) || 3013 CC(vmcs12->host_tr_selector == 0) || 3014 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 3015 return -EINVAL; 3016 3017 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 3018 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 3019 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 3020 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 3021 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 3022 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 3023 return -EINVAL; 3024 3025 /* 3026 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 3027 * IA32_EFER MSR must be 0 in the field for that register. In addition, 3028 * the values of the LMA and LME bits in the field must each be that of 3029 * the host address-space size VM-exit control. 
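	 *
	 * In terms of the ia32e local used below, that is:
	 *
	 *	ia32e == !!(vmcs12->host_ia32_efer & EFER_LMA) &&
	 *	ia32e == !!(vmcs12->host_ia32_efer & EFER_LME)
	 *
	 * which is exactly what the CC() checks verify.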
3030 */ 3031 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 3032 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 3033 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 3034 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 3035 return -EINVAL; 3036 } 3037 3038 return 0; 3039 } 3040 3041 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 3042 struct vmcs12 *vmcs12) 3043 { 3044 struct vcpu_vmx *vmx = to_vmx(vcpu); 3045 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 3046 struct vmcs_hdr hdr; 3047 3048 if (vmcs12->vmcs_link_pointer == INVALID_GPA) 3049 return 0; 3050 3051 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 3052 return -EINVAL; 3053 3054 if (ghc->gpa != vmcs12->vmcs_link_pointer && 3055 CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 3056 vmcs12->vmcs_link_pointer, VMCS12_SIZE))) 3057 return -EINVAL; 3058 3059 if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 3060 offsetof(struct vmcs12, hdr), 3061 sizeof(hdr)))) 3062 return -EINVAL; 3063 3064 if (CC(hdr.revision_id != VMCS12_REVISION) || 3065 CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 3066 return -EINVAL; 3067 3068 return 0; 3069 } 3070 3071 /* 3072 * Checks related to Guest Non-register State 3073 */ 3074 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 3075 { 3076 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 3077 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 3078 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 3079 return -EINVAL; 3080 3081 return 0; 3082 } 3083 3084 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 3085 struct vmcs12 *vmcs12, 3086 enum vm_entry_failure_code *entry_failure_code) 3087 { 3088 bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); 3089 3090 *entry_failure_code = ENTRY_FAIL_DEFAULT; 3091 3092 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 3093 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 3094 return -EINVAL; 3095 3096 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 3097 CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 3098 return -EINVAL; 3099 3100 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 3101 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 3102 return -EINVAL; 3103 3104 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 3105 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 3106 return -EINVAL; 3107 } 3108 3109 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 3110 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3111 vmcs12->guest_ia32_perf_global_ctrl))) 3112 return -EINVAL; 3113 3114 if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) 3115 return -EINVAL; 3116 3117 if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || 3118 CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) 3119 return -EINVAL; 3120 3121 /* 3122 * If the load IA32_EFER VM-entry control is 1, the following checks 3123 * are performed on the field for the IA32_EFER MSR: 3124 * - Bits reserved in the IA32_EFER MSR must be 0. 3125 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 3126 * the IA-32e mode guest VM-exit control. It must also be identical 3127 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 3128 * CR0.PG) is 1. 
3129 */ 3130 if (to_vmx(vcpu)->nested.nested_run_pending && 3131 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 3132 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 3133 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 3134 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 3135 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 3136 return -EINVAL; 3137 } 3138 3139 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 3140 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3141 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3142 return -EINVAL; 3143 3144 if (nested_check_guest_non_reg_state(vmcs12)) 3145 return -EINVAL; 3146 3147 return 0; 3148 } 3149 3150 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 3151 { 3152 struct vcpu_vmx *vmx = to_vmx(vcpu); 3153 unsigned long cr3, cr4; 3154 bool vm_fail; 3155 3156 if (!nested_early_check) 3157 return 0; 3158 3159 if (vmx->msr_autoload.host.nr) 3160 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3161 if (vmx->msr_autoload.guest.nr) 3162 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3163 3164 preempt_disable(); 3165 3166 vmx_prepare_switch_to_guest(vcpu); 3167 3168 /* 3169 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3170 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3171 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3172 * there is no need to preserve other bits or save/restore the field. 3173 */ 3174 vmcs_writel(GUEST_RFLAGS, 0); 3175 3176 cr3 = __get_current_cr3_fast(); 3177 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3178 vmcs_writel(HOST_CR3, cr3); 3179 vmx->loaded_vmcs->host_state.cr3 = cr3; 3180 } 3181 3182 cr4 = cr4_read_shadow(); 3183 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3184 vmcs_writel(HOST_CR4, cr4); 3185 vmx->loaded_vmcs->host_state.cr4 = cr4; 3186 } 3187 3188 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 3189 __vmx_vcpu_run_flags(vmx)); 3190 3191 if (vmx->msr_autoload.host.nr) 3192 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3193 if (vmx->msr_autoload.guest.nr) 3194 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3195 3196 if (vm_fail) { 3197 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3198 3199 preempt_enable(); 3200 3201 trace_kvm_nested_vmenter_failed( 3202 "early hardware check VM-instruction error: ", error); 3203 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3204 return 1; 3205 } 3206 3207 /* 3208 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3209 */ 3210 if (hw_breakpoint_active()) 3211 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3212 local_irq_enable(); 3213 preempt_enable(); 3214 3215 /* 3216 * A non-failing VMEntry means we somehow entered guest mode with 3217 * an illegal RIP, and that's just the tip of the iceberg. There 3218 * is no telling what memory has been modified or what state has 3219 * been exposed to unknown code. Hitting this all but guarantees 3220 * a (very critical) hardware issue. 3221 */ 3222 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3223 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3224 3225 return 0; 3226 } 3227 3228 #ifdef CONFIG_KVM_HYPERV 3229 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3230 { 3231 struct vcpu_vmx *vmx = to_vmx(vcpu); 3232 3233 /* 3234 * hv_evmcs may end up being not mapped after migration (when 3235 * L2 was running), map it here to make sure vmcs12 changes are 3236 * properly reflected. 
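	 *
	 * EVMPTR_MAP_PENDING is the sentinel left by nested state restore when
	 * L1 was using an eVMCS: the mapping couldn't be recreated at restore
	 * time, so the "vmptrld" is redone here via
	 * nested_vmx_handle_enlightened_vmptrld().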
3237 */ 3238 if (guest_cpuid_has_evmcs(vcpu) && 3239 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { 3240 enum nested_evmptrld_status evmptrld_status = 3241 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3242 3243 if (evmptrld_status == EVMPTRLD_VMFAIL || 3244 evmptrld_status == EVMPTRLD_ERROR) 3245 return false; 3246 3247 /* 3248 * Post migration VMCS12 always provides the most actual 3249 * information, copy it to eVMCS upon entry. 3250 */ 3251 vmx->nested.need_vmcs12_to_shadow_sync = true; 3252 } 3253 3254 return true; 3255 } 3256 #endif 3257 3258 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3259 { 3260 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3261 struct vcpu_vmx *vmx = to_vmx(vcpu); 3262 struct kvm_host_map *map; 3263 3264 if (!vcpu->arch.pdptrs_from_userspace && 3265 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3266 /* 3267 * Reload the guest's PDPTRs since after a migration 3268 * the guest CR3 might be restored prior to setting the nested 3269 * state which can lead to a load of wrong PDPTRs. 3270 */ 3271 if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) 3272 return false; 3273 } 3274 3275 3276 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3277 map = &vmx->nested.apic_access_page_map; 3278 3279 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { 3280 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); 3281 } else { 3282 pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", 3283 __func__); 3284 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3285 vcpu->run->internal.suberror = 3286 KVM_INTERNAL_ERROR_EMULATION; 3287 vcpu->run->internal.ndata = 0; 3288 return false; 3289 } 3290 } 3291 3292 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3293 map = &vmx->nested.virtual_apic_map; 3294 3295 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3296 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3297 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3298 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3299 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3300 /* 3301 * The processor will never use the TPR shadow, simply 3302 * clear the bit from the execution control. Such a 3303 * configuration is useless, but it happens in tests. 3304 * For any other configuration, failing the vm entry is 3305 * _not_ what the processor does but it's basically the 3306 * only possibility we have. 3307 */ 3308 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3309 } else { 3310 /* 3311 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3312 * force VM-Entry to fail. 3313 */ 3314 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); 3315 } 3316 } 3317 3318 if (nested_cpu_has_posted_intr(vmcs12)) { 3319 map = &vmx->nested.pi_desc_map; 3320 3321 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3322 vmx->nested.pi_desc = 3323 (struct pi_desc *)(((void *)map->hva) + 3324 offset_in_page(vmcs12->posted_intr_desc_addr)); 3325 vmcs_write64(POSTED_INTR_DESC_ADDR, 3326 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3327 } else { 3328 /* 3329 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3330 * access the contents of the VMCS12 posted interrupt 3331 * descriptor. (Note that KVM may do this when it 3332 * should not, per the architectural specification.) 
3333 */ 3334 vmx->nested.pi_desc = NULL; 3335 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3336 } 3337 } 3338 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3339 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3340 else 3341 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3342 3343 return true; 3344 } 3345 3346 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3347 { 3348 #ifdef CONFIG_KVM_HYPERV 3349 /* 3350 * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy 3351 * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory 3352 * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post 3353 * migration. 3354 */ 3355 if (!nested_get_evmcs_page(vcpu)) { 3356 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3357 __func__); 3358 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3359 vcpu->run->internal.suberror = 3360 KVM_INTERNAL_ERROR_EMULATION; 3361 vcpu->run->internal.ndata = 0; 3362 3363 return false; 3364 } 3365 #endif 3366 3367 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3368 return false; 3369 3370 return true; 3371 } 3372 3373 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3374 { 3375 struct vmcs12 *vmcs12; 3376 struct vcpu_vmx *vmx = to_vmx(vcpu); 3377 gpa_t dst; 3378 3379 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3380 return 0; 3381 3382 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3383 return 1; 3384 3385 /* 3386 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3387 * set is already checked as part of A/D emulation. 3388 */ 3389 vmcs12 = get_vmcs12(vcpu); 3390 if (!nested_cpu_has_pml(vmcs12)) 3391 return 0; 3392 3393 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { 3394 vmx->nested.pml_full = true; 3395 return 1; 3396 } 3397 3398 gpa &= ~0xFFFull; 3399 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3400 3401 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3402 offset_in_page(dst), sizeof(gpa))) 3403 return 0; 3404 3405 vmcs12->guest_pml_index--; 3406 3407 return 0; 3408 } 3409 3410 /* 3411 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3412 * for running VMX instructions (except VMXON, whose prerequisites are 3413 * slightly different). It also specifies what exception to inject otherwise. 3414 * Note that many of these exceptions have priority over VM exits, so they 3415 * don't have to be checked again here. 3416 */ 3417 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3418 { 3419 if (!to_vmx(vcpu)->nested.vmxon) { 3420 kvm_queue_exception(vcpu, UD_VECTOR); 3421 return 0; 3422 } 3423 3424 if (vmx_get_cpl(vcpu)) { 3425 kvm_inject_gp(vcpu, 0); 3426 return 0; 3427 } 3428 3429 return 1; 3430 } 3431 3432 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 3433 { 3434 u8 rvi = vmx_get_rvi(); 3435 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 3436 3437 return ((rvi & 0xf0) > (vppr & 0xf0)); 3438 } 3439 3440 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3441 struct vmcs12 *vmcs12); 3442 3443 /* 3444 * If from_vmentry is false, this is being called from state restore (either RSM 3445 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 
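 *
 * In the state-restore case, guest memory may not be accessible yet, so
 * anything that requires reading guest pages (e.g. mapping the vmcs12
 * pages) is deferred via KVM_REQ_GET_NESTED_STATE_PAGES.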
3446 * 3447 * Returns: 3448 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3449 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3450 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3451 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3452 */ 3453 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3454 bool from_vmentry) 3455 { 3456 struct vcpu_vmx *vmx = to_vmx(vcpu); 3457 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3458 enum vm_entry_failure_code entry_failure_code; 3459 bool evaluate_pending_interrupts; 3460 union vmx_exit_reason exit_reason = { 3461 .basic = EXIT_REASON_INVALID_STATE, 3462 .failed_vmentry = 1, 3463 }; 3464 u32 failed_index; 3465 3466 trace_kvm_nested_vmenter(kvm_rip_read(vcpu), 3467 vmx->nested.current_vmptr, 3468 vmcs12->guest_rip, 3469 vmcs12->guest_intr_status, 3470 vmcs12->vm_entry_intr_info_field, 3471 vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, 3472 vmcs12->ept_pointer, 3473 vmcs12->guest_cr3, 3474 KVM_ISA_VMX); 3475 3476 kvm_service_local_tlb_flush_requests(vcpu); 3477 3478 evaluate_pending_interrupts = exec_controls_get(vmx) & 3479 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); 3480 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3481 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3482 if (!evaluate_pending_interrupts) 3483 evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu); 3484 3485 if (!vmx->nested.nested_run_pending || 3486 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3487 vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3488 if (kvm_mpx_supported() && 3489 (!vmx->nested.nested_run_pending || 3490 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 3491 vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3492 3493 /* 3494 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3495 * nested early checks are disabled. In the event of a "late" VM-Fail, 3496 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3497 * software model to the pre-VMEntry host state. When EPT is disabled, 3498 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3499 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3500 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3501 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3502 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3503 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3504 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3505 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3506 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3507 * path would need to manually save/restore vmcs01.GUEST_CR3. 
3508 */ 3509 if (!enable_ept && !nested_early_check) 3510 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3511 3512 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3513 3514 prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); 3515 3516 if (from_vmentry) { 3517 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3518 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3519 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3520 } 3521 3522 if (nested_vmx_check_vmentry_hw(vcpu)) { 3523 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3524 return NVMX_VMENTRY_VMFAIL; 3525 } 3526 3527 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3528 &entry_failure_code)) { 3529 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3530 vmcs12->exit_qualification = entry_failure_code; 3531 goto vmentry_fail_vmexit; 3532 } 3533 } 3534 3535 enter_guest_mode(vcpu); 3536 3537 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { 3538 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3539 vmcs12->exit_qualification = entry_failure_code; 3540 goto vmentry_fail_vmexit_guest_mode; 3541 } 3542 3543 if (from_vmentry) { 3544 failed_index = nested_vmx_load_msr(vcpu, 3545 vmcs12->vm_entry_msr_load_addr, 3546 vmcs12->vm_entry_msr_load_count); 3547 if (failed_index) { 3548 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3549 vmcs12->exit_qualification = failed_index; 3550 goto vmentry_fail_vmexit_guest_mode; 3551 } 3552 } else { 3553 /* 3554 * The MMU is not initialized to point at the right entities yet and 3555 * "get pages" would need to read data from the guest (i.e. we will 3556 * need to perform gpa to hpa translation). Request a call 3557 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3558 * have already been set at vmentry time and should not be reset. 3559 */ 3560 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3561 } 3562 3563 /* 3564 * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI 3565 * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can 3566 * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit 3567 * unconditionally. 3568 */ 3569 if (unlikely(evaluate_pending_interrupts)) 3570 kvm_make_request(KVM_REQ_EVENT, vcpu); 3571 3572 /* 3573 * Do not start the preemption timer hrtimer until after we know 3574 * we are successful, so that only nested_vmx_vmexit needs to cancel 3575 * the timer. 3576 */ 3577 vmx->nested.preemption_timer_expired = false; 3578 if (nested_cpu_has_preemption_timer(vmcs12)) { 3579 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3580 vmx_start_preemption_timer(vcpu, timer_value); 3581 } 3582 3583 /* 3584 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3585 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3586 * returned as far as L1 is concerned. It will only return (and set 3587 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3588 */ 3589 return NVMX_VMENTRY_SUCCESS; 3590 3591 /* 3592 * A failed consistency check that leads to a VMExit during L1's 3593 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3594 * 26.7 "VM-entry failures during or after loading guest state". 
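 *
 * The "guest_mode" variant of the label below is used for failures that
 * occur after enter_guest_mode() and prepare_vmcs02() have started to
 * take effect, so the L2 TSC offset and the guest-mode flag must be
 * unwound; the plain label handles failures detected before guest mode
 * is entered.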
3595 */ 3596 vmentry_fail_vmexit_guest_mode: 3597 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3598 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3599 leave_guest_mode(vcpu); 3600 3601 vmentry_fail_vmexit: 3602 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3603 3604 if (!from_vmentry) 3605 return NVMX_VMENTRY_VMEXIT; 3606 3607 load_vmcs12_host_state(vcpu, vmcs12); 3608 vmcs12->vm_exit_reason = exit_reason.full; 3609 if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)) 3610 vmx->nested.need_vmcs12_to_shadow_sync = true; 3611 return NVMX_VMENTRY_VMEXIT; 3612 } 3613 3614 /* 3615 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3616 * for running an L2 nested guest. 3617 */ 3618 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3619 { 3620 struct vmcs12 *vmcs12; 3621 enum nvmx_vmentry_status status; 3622 struct vcpu_vmx *vmx = to_vmx(vcpu); 3623 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3624 enum nested_evmptrld_status evmptrld_status; 3625 3626 if (!nested_vmx_check_permission(vcpu)) 3627 return 1; 3628 3629 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3630 if (evmptrld_status == EVMPTRLD_ERROR) { 3631 kvm_queue_exception(vcpu, UD_VECTOR); 3632 return 1; 3633 } 3634 3635 kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); 3636 3637 if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) 3638 return nested_vmx_failInvalid(vcpu); 3639 3640 if (CC(!nested_vmx_is_evmptr12_valid(vmx) && 3641 vmx->nested.current_vmptr == INVALID_GPA)) 3642 return nested_vmx_failInvalid(vcpu); 3643 3644 vmcs12 = get_vmcs12(vcpu); 3645 3646 /* 3647 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3648 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3649 * rather than RFLAGS.ZF, and no error number is stored to the 3650 * VM-instruction error field. 3651 */ 3652 if (CC(vmcs12->hdr.shadow_vmcs)) 3653 return nested_vmx_failInvalid(vcpu); 3654 3655 if (nested_vmx_is_evmptr12_valid(vmx)) { 3656 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 3657 3658 copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields); 3659 /* Enlightened VMCS doesn't have launch state */ 3660 vmcs12->launch_state = !launch; 3661 } else if (enable_shadow_vmcs) { 3662 copy_shadow_to_vmcs12(vmx); 3663 } 3664 3665 /* 3666 * The nested entry process starts with enforcing various prerequisites 3667 * on vmcs12 as required by the Intel SDM, and act appropriately when 3668 * they fail: As the SDM explains, some conditions should cause the 3669 * instruction to fail, while others will cause the instruction to seem 3670 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3671 * To speed up the normal (success) code path, we should avoid checking 3672 * for misconfigurations which will anyway be caught by the processor 3673 * when using the merged vmcs02. 3674 */ 3675 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) 3676 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3677 3678 if (CC(vmcs12->launch_state == launch)) 3679 return nested_vmx_fail(vcpu, 3680 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS 3681 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3682 3683 if (nested_vmx_check_controls(vcpu, vmcs12)) 3684 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3685 3686 if (nested_vmx_check_address_space_size(vcpu, vmcs12)) 3687 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3688 3689 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3690 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3691 3692 /* 3693 * We're finally done with prerequisite checking, and can start with 3694 * the nested entry. 3695 */ 3696 vmx->nested.nested_run_pending = 1; 3697 vmx->nested.has_preemption_timer_deadline = false; 3698 status = nested_vmx_enter_non_root_mode(vcpu, true); 3699 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3700 goto vmentry_failed; 3701 3702 /* Emulate processing of posted interrupts on VM-Enter. */ 3703 if (nested_cpu_has_posted_intr(vmcs12) && 3704 kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { 3705 vmx->nested.pi_pending = true; 3706 kvm_make_request(KVM_REQ_EVENT, vcpu); 3707 kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); 3708 } 3709 3710 /* Hide L1D cache contents from the nested guest. */ 3711 vmx->vcpu.arch.l1tf_flush_l1d = true; 3712 3713 /* 3714 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3715 * also be used as part of restoring nVMX state for 3716 * snapshot restore (migration). 3717 * 3718 * In this flow, it is assumed that vmcs12 cache was 3719 * transferred as part of captured nVMX state and should 3720 * therefore not be read from guest memory (which may not 3721 * exist on destination host yet). 3722 */ 3723 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3724 3725 switch (vmcs12->guest_activity_state) { 3726 case GUEST_ACTIVITY_HLT: 3727 /* 3728 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3729 * awakened by event injection or by an NMI-window VM-exit or 3730 * by an interrupt-window VM-exit, halt the vcpu. 3731 */ 3732 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3733 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3734 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3735 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3736 vmx->nested.nested_run_pending = 0; 3737 return kvm_emulate_halt_noskip(vcpu); 3738 } 3739 break; 3740 case GUEST_ACTIVITY_WAIT_SIPI: 3741 vmx->nested.nested_run_pending = 0; 3742 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 3743 break; 3744 default: 3745 break; 3746 } 3747 3748 return 1; 3749 3750 vmentry_failed: 3751 vmx->nested.nested_run_pending = 0; 3752 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3753 return 0; 3754 if (status == NVMX_VMENTRY_VMEXIT) 3755 return 1; 3756 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3757 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3758 } 3759 3760 /* 3761 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3762 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3763 * This function returns the new value we should put in vmcs12.guest_cr0. 3764 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3765 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3766 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3767 * didn't trap the bit, because if L1 did, so would L0). 3768 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3769 * been modified by L2, and L1 knows it. 
So just leave the old value of 3770 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 3771 * isn't relevant, because if L0 traps this bit it can set it to anything. 3772 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 3773 * changed these bits, and therefore they need to be updated, but L0 3774 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3775 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3776 */ 3777 static inline unsigned long 3778 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3779 { 3780 return 3781 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3782 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3783 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3784 vcpu->arch.cr0_guest_owned_bits)); 3785 } 3786 3787 static inline unsigned long 3788 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3789 { 3790 return 3791 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3792 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3793 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3794 vcpu->arch.cr4_guest_owned_bits)); 3795 } 3796 3797 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3798 struct vmcs12 *vmcs12, 3799 u32 vm_exit_reason, u32 exit_intr_info) 3800 { 3801 u32 idt_vectoring; 3802 unsigned int nr; 3803 3804 /* 3805 * Per the SDM, VM-Exits due to double and triple faults are never 3806 * considered to occur during event delivery, even if the double/triple 3807 * fault is the result of an escalating vectoring issue. 3808 * 3809 * Note, the SDM qualifies the double fault behavior with "The original 3810 * event results in a double-fault exception". It's unclear why the 3811 * qualification exists since exits due to double fault can occur only 3812 * while vectoring a different exception (injected events are never 3813 * subject to interception), i.e. there's _always_ an original event. 3814 * 3815 * The SDM also uses NMI as a confusing example for the "original event 3816 * causes the VM exit directly" clause. NMI isn't special in any way, 3817 * the same rule applies to all events that cause an exit directly. 3818 * NMI is an odd choice for the example because NMIs can only occur on 3819 * instruction boundaries, i.e. they _can't_ occur during vectoring. 
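 *
 * Note that the vectoring info written below is reconstructed from KVM's
 * software model of the injected event (vcpu->arch exception/NMI/IRQ
 * bookkeeping) rather than read back from vmcs02.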
3820 */ 3821 if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || 3822 ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && 3823 is_double_fault(exit_intr_info))) { 3824 vmcs12->idt_vectoring_info_field = 0; 3825 } else if (vcpu->arch.exception.injected) { 3826 nr = vcpu->arch.exception.vector; 3827 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3828 3829 if (kvm_exception_is_soft(nr)) { 3830 vmcs12->vm_exit_instruction_len = 3831 vcpu->arch.event_exit_inst_len; 3832 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3833 } else 3834 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3835 3836 if (vcpu->arch.exception.has_error_code) { 3837 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3838 vmcs12->idt_vectoring_error_code = 3839 vcpu->arch.exception.error_code; 3840 } 3841 3842 vmcs12->idt_vectoring_info_field = idt_vectoring; 3843 } else if (vcpu->arch.nmi_injected) { 3844 vmcs12->idt_vectoring_info_field = 3845 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3846 } else if (vcpu->arch.interrupt.injected) { 3847 nr = vcpu->arch.interrupt.nr; 3848 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3849 3850 if (vcpu->arch.interrupt.soft) { 3851 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3852 vmcs12->vm_entry_instruction_len = 3853 vcpu->arch.event_exit_inst_len; 3854 } else 3855 idt_vectoring |= INTR_TYPE_EXT_INTR; 3856 3857 vmcs12->idt_vectoring_info_field = idt_vectoring; 3858 } else { 3859 vmcs12->idt_vectoring_info_field = 0; 3860 } 3861 } 3862 3863 3864 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3865 { 3866 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3867 gfn_t gfn; 3868 3869 /* 3870 * Don't need to mark the APIC access page dirty; it is never 3871 * written to by the CPU during APIC virtualization. 3872 */ 3873 3874 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3875 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3876 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3877 } 3878 3879 if (nested_cpu_has_posted_intr(vmcs12)) { 3880 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3881 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3882 } 3883 } 3884 3885 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3886 { 3887 struct vcpu_vmx *vmx = to_vmx(vcpu); 3888 int max_irr; 3889 void *vapic_page; 3890 u16 status; 3891 3892 if (!vmx->nested.pi_pending) 3893 return 0; 3894 3895 if (!vmx->nested.pi_desc) 3896 goto mmio_needed; 3897 3898 vmx->nested.pi_pending = false; 3899 3900 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3901 return 0; 3902 3903 max_irr = pi_find_highest_vector(vmx->nested.pi_desc); 3904 if (max_irr > 0) { 3905 vapic_page = vmx->nested.virtual_apic_map.hva; 3906 if (!vapic_page) 3907 goto mmio_needed; 3908 3909 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3910 vapic_page, &max_irr); 3911 status = vmcs_read16(GUEST_INTR_STATUS); 3912 if ((u8)max_irr > ((u8)status & 0xff)) { 3913 status &= ~0xff; 3914 status |= (u8)max_irr; 3915 vmcs_write16(GUEST_INTR_STATUS, status); 3916 } 3917 } 3918 3919 nested_mark_vmcs12_pages_dirty(vcpu); 3920 return 0; 3921 3922 mmio_needed: 3923 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 3924 return -ENXIO; 3925 } 3926 3927 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) 3928 { 3929 struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; 3930 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 3931 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3932 unsigned long exit_qual; 3933 3934 if (ex->has_payload) { 3935 exit_qual = ex->payload; 3936 } else if (ex->vector == PF_VECTOR) { 3937 
exit_qual = vcpu->arch.cr2; 3938 } else if (ex->vector == DB_VECTOR) { 3939 exit_qual = vcpu->arch.dr6; 3940 exit_qual &= ~DR6_BT; 3941 exit_qual ^= DR6_ACTIVE_LOW; 3942 } else { 3943 exit_qual = 0; 3944 } 3945 3946 /* 3947 * Unlike AMD's Paged Real Mode, which reports an error code on #PF 3948 * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the 3949 * "has error code" flags on VM-Exit if the CPU is in Real Mode. 3950 */ 3951 if (ex->has_error_code && is_protmode(vcpu)) { 3952 /* 3953 * Intel CPUs do not generate error codes with bits 31:16 set, 3954 * and more importantly VMX disallows setting bits 31:16 in the 3955 * injected error code for VM-Entry. Drop the bits to mimic 3956 * hardware and avoid inducing failure on nested VM-Entry if L1 3957 * chooses to inject the exception back to L2. AMD CPUs _do_ 3958 * generate "full" 32-bit error codes, so KVM allows userspace 3959 * to inject exception error codes with bits 31:16 set. 3960 */ 3961 vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; 3962 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3963 } 3964 3965 if (kvm_exception_is_soft(ex->vector)) 3966 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3967 else 3968 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3969 3970 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3971 vmx_get_nmi_mask(vcpu)) 3972 intr_info |= INTR_INFO_UNBLOCK_NMI; 3973 3974 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3975 } 3976 3977 /* 3978 * Returns true if a debug trap is (likely) pending delivery. Infer the class 3979 * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). 3980 * Using the payload is flawed because code breakpoints (fault-like) and data 3981 * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. 3982 * this will return false positives if a to-be-injected code breakpoint #DB is 3983 * pending (from KVM's perspective, but not "pending" across an instruction 3984 * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it 3985 * too is trap-like. 3986 * 3987 * KVM "works" despite these flaws as ICEBP isn't currently supported by the 3988 * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the 3989 * #DB has already happened), and MTF isn't marked pending on code breakpoints 3990 * from the emulator (because such #DBs are fault-like and thus don't trigger 3991 * actions that fire on instruction retire). 3992 */ 3993 static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) 3994 { 3995 if (!ex->pending || ex->vector != DB_VECTOR) 3996 return 0; 3997 3998 /* General Detect #DBs are always fault-like. */ 3999 return ex->payload & ~DR6_BD; 4000 } 4001 4002 /* 4003 * Returns true if there's a pending #DB exception that is lower priority than 4004 * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by 4005 * KVM, but could theoretically be injected by userspace. Note, this code is 4006 * imperfect, see above. 4007 */ 4008 static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) 4009 { 4010 return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; 4011 } 4012 4013 /* 4014 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 4015 * recognized #DB (data or single-step) that has yet to be delivered. 
Since KVM 4016 * represents these debug traps with a payload that is said to be compatible 4017 * with the 'pending debug exceptions' field, write the payload to the VMCS 4018 * field if a VM-exit is delivered before the debug trap. 4019 */ 4020 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 4021 { 4022 unsigned long pending_dbg; 4023 4024 pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); 4025 if (pending_dbg) 4026 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); 4027 } 4028 4029 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 4030 { 4031 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 4032 to_vmx(vcpu)->nested.preemption_timer_expired; 4033 } 4034 4035 static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection) 4036 { 4037 struct vcpu_vmx *vmx = to_vmx(vcpu); 4038 void *vapic = vmx->nested.virtual_apic_map.hva; 4039 int max_irr, vppr; 4040 4041 if (nested_vmx_preemption_timer_pending(vcpu) || 4042 vmx->nested.mtf_pending) 4043 return true; 4044 4045 /* 4046 * Virtual Interrupt Delivery doesn't require manual injection. Either 4047 * the interrupt is already in GUEST_RVI and will be recognized by CPU 4048 * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move 4049 * the interrupt from the PIR to RVI prior to entering the guest. 4050 */ 4051 if (for_injection) 4052 return false; 4053 4054 if (!nested_cpu_has_vid(get_vmcs12(vcpu)) || 4055 __vmx_interrupt_blocked(vcpu)) 4056 return false; 4057 4058 if (!vapic) 4059 return false; 4060 4061 vppr = *((u32 *)(vapic + APIC_PROCPRI)); 4062 4063 max_irr = vmx_get_rvi(); 4064 if ((max_irr & 0xf0) > (vppr & 0xf0)) 4065 return true; 4066 4067 if (vmx->nested.pi_pending && vmx->nested.pi_desc && 4068 pi_test_on(vmx->nested.pi_desc)) { 4069 max_irr = pi_find_highest_vector(vmx->nested.pi_desc); 4070 if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0)) 4071 return true; 4072 } 4073 4074 return false; 4075 } 4076 4077 /* 4078 * Per the Intel SDM's table "Priority Among Concurrent Events", with minor 4079 * edits to fill in missing examples, e.g. #DB due to split-lock accesses, 4080 * and less minor edits to splice in the priority of VMX Non-Root specific 4081 * events, e.g. MTF and NMI/INTR-window exiting. 
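 *
 * The fractional entries (3.5, 4.3, 4.6, 5.5) are the spliced-in VMX
 * Non-Root events; footnotes [1]-[4] summarize the SDM language that
 * pins down where each of them slots into the architectural ordering.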
4082 * 4083 * 1 Hardware Reset and Machine Checks 4084 * - RESET 4085 * - Machine Check 4086 * 4087 * 2 Trap on Task Switch 4088 * - T flag in TSS is set (on task switch) 4089 * 4090 * 3 External Hardware Interventions 4091 * - FLUSH 4092 * - STOPCLK 4093 * - SMI 4094 * - INIT 4095 * 4096 * 3.5 Monitor Trap Flag (MTF) VM-exit[1] 4097 * 4098 * 4 Traps on Previous Instruction 4099 * - Breakpoints 4100 * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O 4101 * breakpoint, or #DB due to a split-lock access) 4102 * 4103 * 4.3 VMX-preemption timer expired VM-exit[2] 4104 * 4105 * 4.6 NMI-window exiting VM-exit[3] 4106 * 4107 * 5 Nonmaskable Interrupts (NMI) 4108 * 4109 * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery[4] 4110 * 4111 * 6 Maskable Hardware Interrupts 4112 * 4113 * 7 Code Breakpoint Fault 4114 * 4115 * 8 Faults from Fetching Next Instruction 4116 * - Code-Segment Limit Violation 4117 * - Code Page Fault 4118 * - Control protection exception (missing ENDBRANCH at target of indirect 4119 * call or jump) 4120 * 4121 * 9 Faults from Decoding Next Instruction 4122 * - Instruction length > 15 bytes 4123 * - Invalid Opcode 4124 * - Coprocessor Not Available 4125 * 4126 *10 Faults on Executing Instruction 4127 * - Overflow 4128 * - Bound error 4129 * - Invalid TSS 4130 * - Segment Not Present 4131 * - Stack fault 4132 * - General Protection 4133 * - Data Page Fault 4134 * - Alignment Check 4135 * - x86 FPU Floating-point exception 4136 * - SIMD floating-point exception 4137 * - Virtualization exception 4138 * - Control protection exception 4139 * 4140 * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs), 4141 * INIT signals, and higher priority events take priority over MTF VM exits. 4142 * MTF VM exits take priority over debug-trap exceptions and lower priority 4143 * events. 4144 * 4145 * [2] Debug-trap exceptions and higher priority events take priority over VM exits 4146 * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption 4147 * timer take priority over VM exits caused by the "NMI-window exiting" 4148 * VM-execution control and lower priority events. 4149 * 4150 * [3] Debug-trap exceptions and higher priority events take priority over VM exits 4151 * caused by "NMI-window exiting". VM exits caused by this control take 4152 * priority over non-maskable interrupts (NMIs) and lower priority events. 4153 * 4154 * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to 4155 * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus, 4156 * non-maskable interrupts (NMIs) and higher priority events take priority over 4157 * delivery of a virtual interrupt; delivery of a virtual interrupt takes 4158 * priority over external interrupts and lower priority events. 4159 */ 4160 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 4161 { 4162 struct kvm_lapic *apic = vcpu->arch.apic; 4163 struct vcpu_vmx *vmx = to_vmx(vcpu); 4164 /* 4165 * Only a pending nested run blocks a pending exception. If there is a 4166 * previously injected event, the pending exception occurred while said 4167 * event was being delivered and thus needs to be handled. 4168 */ 4169 bool block_nested_exceptions = vmx->nested.nested_run_pending; 4170 /* 4171 * New events (not exceptions) are only recognized at instruction 4172 * boundaries.
If an event needs reinjection, then KVM is handling a 4173 * VM-Exit that occurred _during_ instruction execution; new events are 4174 * blocked until the instruction completes. 4175 */ 4176 bool block_nested_events = block_nested_exceptions || 4177 kvm_event_needs_reinjection(vcpu); 4178 4179 if (lapic_in_kernel(vcpu) && 4180 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 4181 if (block_nested_events) 4182 return -EBUSY; 4183 nested_vmx_update_pending_dbg(vcpu); 4184 clear_bit(KVM_APIC_INIT, &apic->pending_events); 4185 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 4186 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 4187 4188 /* MTF is discarded if the vCPU is in WFS. */ 4189 vmx->nested.mtf_pending = false; 4190 return 0; 4191 } 4192 4193 if (lapic_in_kernel(vcpu) && 4194 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 4195 if (block_nested_events) 4196 return -EBUSY; 4197 4198 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 4199 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 4200 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 4201 apic->sipi_vector & 0xFFUL); 4202 return 0; 4203 } 4204 /* Fallthrough, the SIPI is completely ignored. */ 4205 } 4206 4207 /* 4208 * Process exceptions that are higher priority than Monitor Trap Flag: 4209 * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but 4210 * could theoretically come in from userspace), and ICEBP (INT1). 4211 * 4212 * TODO: SMIs have higher priority than MTF and trap-like #DBs (except 4213 * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF 4214 * across SMI/RSM as it should; that needs to be addressed in order to 4215 * prioritize SMI over MTF and trap-like #DBs. 4216 */ 4217 if (vcpu->arch.exception_vmexit.pending && 4218 !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { 4219 if (block_nested_exceptions) 4220 return -EBUSY; 4221 4222 nested_vmx_inject_exception_vmexit(vcpu); 4223 return 0; 4224 } 4225 4226 if (vcpu->arch.exception.pending && 4227 !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) { 4228 if (block_nested_exceptions) 4229 return -EBUSY; 4230 goto no_vmexit; 4231 } 4232 4233 if (vmx->nested.mtf_pending) { 4234 if (block_nested_events) 4235 return -EBUSY; 4236 nested_vmx_update_pending_dbg(vcpu); 4237 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 4238 return 0; 4239 } 4240 4241 if (vcpu->arch.exception_vmexit.pending) { 4242 if (block_nested_exceptions) 4243 return -EBUSY; 4244 4245 nested_vmx_inject_exception_vmexit(vcpu); 4246 return 0; 4247 } 4248 4249 if (vcpu->arch.exception.pending) { 4250 if (block_nested_exceptions) 4251 return -EBUSY; 4252 goto no_vmexit; 4253 } 4254 4255 if (nested_vmx_preemption_timer_pending(vcpu)) { 4256 if (block_nested_events) 4257 return -EBUSY; 4258 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 4259 return 0; 4260 } 4261 4262 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 4263 if (block_nested_events) 4264 return -EBUSY; 4265 goto no_vmexit; 4266 } 4267 4268 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 4269 if (block_nested_events) 4270 return -EBUSY; 4271 if (!nested_exit_on_nmi(vcpu)) 4272 goto no_vmexit; 4273 4274 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 4275 NMI_VECTOR | INTR_TYPE_NMI_INTR | 4276 INTR_INFO_VALID_MASK, 0); 4277 /* 4278 * The NMI-triggered VM exit counts as injection: 4279 * clear this one and block further NMIs. 
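 *
 * Architecturally, a VM-Exit taken in direct response to an NMI leaves
 * NMIs blocked once the exit completes, which is what the explicit
 * masking below emulates for L1.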
4280 */ 4281 vcpu->arch.nmi_pending = 0; 4282 vmx_set_nmi_mask(vcpu, true); 4283 return 0; 4284 } 4285 4286 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 4287 if (block_nested_events) 4288 return -EBUSY; 4289 if (!nested_exit_on_intr(vcpu)) 4290 goto no_vmexit; 4291 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 4292 return 0; 4293 } 4294 4295 no_vmexit: 4296 return vmx_complete_nested_posted_interrupt(vcpu); 4297 } 4298 4299 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 4300 { 4301 ktime_t remaining = 4302 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 4303 u64 value; 4304 4305 if (ktime_to_ns(remaining) <= 0) 4306 return 0; 4307 4308 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 4309 do_div(value, 1000000); 4310 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 4311 } 4312 4313 static bool is_vmcs12_ext_field(unsigned long field) 4314 { 4315 switch (field) { 4316 case GUEST_ES_SELECTOR: 4317 case GUEST_CS_SELECTOR: 4318 case GUEST_SS_SELECTOR: 4319 case GUEST_DS_SELECTOR: 4320 case GUEST_FS_SELECTOR: 4321 case GUEST_GS_SELECTOR: 4322 case GUEST_LDTR_SELECTOR: 4323 case GUEST_TR_SELECTOR: 4324 case GUEST_ES_LIMIT: 4325 case GUEST_CS_LIMIT: 4326 case GUEST_SS_LIMIT: 4327 case GUEST_DS_LIMIT: 4328 case GUEST_FS_LIMIT: 4329 case GUEST_GS_LIMIT: 4330 case GUEST_LDTR_LIMIT: 4331 case GUEST_TR_LIMIT: 4332 case GUEST_GDTR_LIMIT: 4333 case GUEST_IDTR_LIMIT: 4334 case GUEST_ES_AR_BYTES: 4335 case GUEST_DS_AR_BYTES: 4336 case GUEST_FS_AR_BYTES: 4337 case GUEST_GS_AR_BYTES: 4338 case GUEST_LDTR_AR_BYTES: 4339 case GUEST_TR_AR_BYTES: 4340 case GUEST_ES_BASE: 4341 case GUEST_CS_BASE: 4342 case GUEST_SS_BASE: 4343 case GUEST_DS_BASE: 4344 case GUEST_FS_BASE: 4345 case GUEST_GS_BASE: 4346 case GUEST_LDTR_BASE: 4347 case GUEST_TR_BASE: 4348 case GUEST_GDTR_BASE: 4349 case GUEST_IDTR_BASE: 4350 case GUEST_PENDING_DBG_EXCEPTIONS: 4351 case GUEST_BNDCFGS: 4352 return true; 4353 default: 4354 break; 4355 } 4356 4357 return false; 4358 } 4359 4360 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4361 struct vmcs12 *vmcs12) 4362 { 4363 struct vcpu_vmx *vmx = to_vmx(vcpu); 4364 4365 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 4366 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 4367 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 4368 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 4369 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 4370 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 4371 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 4372 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 4373 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 4374 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 4375 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 4376 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 4377 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 4378 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 4379 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 4380 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 4381 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 4382 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 4383 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 4384 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 4385 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 4386 vmcs12->guest_gs_ar_bytes = 
vmcs_read32(GUEST_GS_AR_BYTES); 4387 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 4388 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 4389 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 4390 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 4391 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 4392 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 4393 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 4394 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 4395 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 4396 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 4397 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 4398 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 4399 vmcs12->guest_pending_dbg_exceptions = 4400 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 4401 4402 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4403 } 4404 4405 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4406 struct vmcs12 *vmcs12) 4407 { 4408 struct vcpu_vmx *vmx = to_vmx(vcpu); 4409 int cpu; 4410 4411 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4412 return; 4413 4414 4415 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4416 4417 cpu = get_cpu(); 4418 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4419 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); 4420 4421 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4422 4423 vmx->loaded_vmcs = &vmx->vmcs01; 4424 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); 4425 put_cpu(); 4426 } 4427 4428 /* 4429 * Update the guest state fields of vmcs12 to reflect changes that 4430 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4431 * VM-entry controls is also updated, since this is really a guest 4432 * state bit.) 4433 */ 4434 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4435 { 4436 struct vcpu_vmx *vmx = to_vmx(vcpu); 4437 4438 if (nested_vmx_is_evmptr12_valid(vmx)) 4439 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4440 4441 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4442 !nested_vmx_is_evmptr12_valid(vmx); 4443 4444 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4445 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4446 4447 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4448 vmcs12->guest_rip = kvm_rip_read(vcpu); 4449 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4450 4451 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4452 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4453 4454 vmcs12->guest_interruptibility_info = 4455 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4456 4457 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4458 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4459 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4460 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4461 else 4462 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4463 4464 if (nested_cpu_has_preemption_timer(vmcs12) && 4465 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4466 !vmx->nested.nested_run_pending) 4467 vmcs12->vmx_preemption_timer_value = 4468 vmx_get_preemption_timer_value(vcpu); 4469 4470 /* 4471 * In some cases (usually, nested EPT), L2 is allowed to change its 4472 * own CR3 without exiting. If it has changed it, we must keep it. 4473 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4474 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4475 * 4476 * Additionally, restore L2's PDPTR to vmcs12. 
4477 */ 4478 if (enable_ept) { 4479 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4480 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4481 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4482 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4483 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4484 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4485 } 4486 } 4487 4488 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4489 4490 if (nested_cpu_has_vid(vmcs12)) 4491 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4492 4493 vmcs12->vm_entry_controls = 4494 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4495 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4496 4497 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4498 vmcs12->guest_dr7 = vcpu->arch.dr7; 4499 4500 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4501 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4502 } 4503 4504 /* 4505 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4506 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4507 * and this function updates it to reflect the changes to the guest state while 4508 * L2 was running (and perhaps made some exits which were handled directly by L0 4509 * without going back to L1), and to reflect the exit reason. 4510 * Note that we do not have to copy here all VMCS fields, just those that 4511 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4512 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4513 * which already writes to vmcs12 directly. 4514 */ 4515 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4516 u32 vm_exit_reason, u32 exit_intr_info, 4517 unsigned long exit_qualification) 4518 { 4519 /* update exit information fields: */ 4520 vmcs12->vm_exit_reason = vm_exit_reason; 4521 if (to_vmx(vcpu)->exit_reason.enclave_mode) 4522 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4523 vmcs12->exit_qualification = exit_qualification; 4524 4525 /* 4526 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched 4527 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other 4528 * exit info fields are unmodified. 4529 */ 4530 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4531 vmcs12->launch_state = 1; 4532 4533 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4534 * instead of reading the real value. */ 4535 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4536 4537 /* 4538 * Transfer the event that L0 or L1 may wanted to inject into 4539 * L2 to IDT_VECTORING_INFO_FIELD. 4540 */ 4541 vmcs12_save_pending_event(vcpu, vmcs12, 4542 vm_exit_reason, exit_intr_info); 4543 4544 vmcs12->vm_exit_intr_info = exit_intr_info; 4545 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4546 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4547 4548 /* 4549 * According to spec, there's no need to store the guest's 4550 * MSRs if the exit is due to a VM-entry failure that occurs 4551 * during or after loading the guest state. Since this exit 4552 * does not fall in that category, we need to save the MSRs. 
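 *
 * ("That category" is precisely the VMX_EXIT_REASONS_FAILED_VMENTRY case
 * filtered out by the enclosing check above.)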
4553 */ 4554 if (nested_vmx_store_msr(vcpu, 4555 vmcs12->vm_exit_msr_store_addr, 4556 vmcs12->vm_exit_msr_store_count)) 4557 nested_vmx_abort(vcpu, 4558 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4559 } 4560 } 4561 4562 /* 4563 * A part of what we need to when the nested L2 guest exits and we want to 4564 * run its L1 parent, is to reset L1's guest state to the host state specified 4565 * in vmcs12. 4566 * This function is to be called not only on normal nested exit, but also on 4567 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4568 * Failures During or After Loading Guest State"). 4569 * This function should be called when the active VMCS is L1's (vmcs01). 4570 */ 4571 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4572 struct vmcs12 *vmcs12) 4573 { 4574 enum vm_entry_failure_code ignored; 4575 struct kvm_segment seg; 4576 4577 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4578 vcpu->arch.efer = vmcs12->host_ia32_efer; 4579 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4580 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4581 else 4582 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4583 vmx_set_efer(vcpu, vcpu->arch.efer); 4584 4585 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4586 kvm_rip_write(vcpu, vmcs12->host_rip); 4587 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4588 vmx_set_interrupt_shadow(vcpu, 0); 4589 4590 /* 4591 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4592 * actually changed, because vmx_set_cr0 refers to efer set above. 4593 * 4594 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4595 * (KVM doesn't change it); 4596 */ 4597 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4598 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4599 4600 /* Same as above - no reason to call set_cr4_guest_host_mask(). */ 4601 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4602 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4603 4604 nested_ept_uninit_mmu_context(vcpu); 4605 4606 /* 4607 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4608 * couldn't have changed. 4609 */ 4610 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4611 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4612 4613 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4614 4615 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4616 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4617 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4618 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4619 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4620 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4621 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4622 4623 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. 
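 * I.e. the MSR is left untouched here; architecturally BNDCFGS is only
 * cleared on VM-exit when the "clear BNDCFGS" exit control is set.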
*/ 4624 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4625 vmcs_write64(GUEST_BNDCFGS, 0); 4626 4627 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4628 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4629 vcpu->arch.pat = vmcs12->host_ia32_pat; 4630 } 4631 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 4632 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) 4633 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4634 vmcs12->host_ia32_perf_global_ctrl)); 4635 4636 /* Set L1 segment info according to Intel SDM 4637 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4638 seg = (struct kvm_segment) { 4639 .base = 0, 4640 .limit = 0xFFFFFFFF, 4641 .selector = vmcs12->host_cs_selector, 4642 .type = 11, 4643 .present = 1, 4644 .s = 1, 4645 .g = 1 4646 }; 4647 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4648 seg.l = 1; 4649 else 4650 seg.db = 1; 4651 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4652 seg = (struct kvm_segment) { 4653 .base = 0, 4654 .limit = 0xFFFFFFFF, 4655 .type = 3, 4656 .present = 1, 4657 .s = 1, 4658 .db = 1, 4659 .g = 1 4660 }; 4661 seg.selector = vmcs12->host_ds_selector; 4662 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4663 seg.selector = vmcs12->host_es_selector; 4664 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4665 seg.selector = vmcs12->host_ss_selector; 4666 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4667 seg.selector = vmcs12->host_fs_selector; 4668 seg.base = vmcs12->host_fs_base; 4669 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4670 seg.selector = vmcs12->host_gs_selector; 4671 seg.base = vmcs12->host_gs_base; 4672 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4673 seg = (struct kvm_segment) { 4674 .base = vmcs12->host_tr_base, 4675 .limit = 0x67, 4676 .selector = vmcs12->host_tr_selector, 4677 .type = 11, 4678 .present = 1 4679 }; 4680 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4681 4682 memset(&seg, 0, sizeof(seg)); 4683 seg.unusable = 1; 4684 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4685 4686 kvm_set_dr(vcpu, 7, 0x400); 4687 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4688 4689 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4690 vmcs12->vm_exit_msr_load_count)) 4691 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4692 4693 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4694 } 4695 4696 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4697 { 4698 struct vmx_uret_msr *efer_msr; 4699 unsigned int i; 4700 4701 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4702 return vmcs_read64(GUEST_IA32_EFER); 4703 4704 if (cpu_has_load_ia32_efer()) 4705 return kvm_host.efer; 4706 4707 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4708 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4709 return vmx->msr_autoload.guest.val[i].value; 4710 } 4711 4712 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4713 if (efer_msr) 4714 return efer_msr->data; 4715 4716 return kvm_host.efer; 4717 } 4718 4719 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4720 { 4721 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4722 struct vcpu_vmx *vmx = to_vmx(vcpu); 4723 struct vmx_msr_entry g, h; 4724 gpa_t gpa; 4725 u32 i, j; 4726 4727 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4728 4729 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4730 /* 4731 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4732 * as vmcs01.GUEST_DR7 contains a userspace defined value 4733 * and vcpu->arch.dr7 is not squirreled away 
before the 4734 * nested VMENTER (not worth adding a variable in nested_vmx). 4735 */ 4736 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4737 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4738 else 4739 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4740 } 4741 4742 /* 4743 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4744 * handle a variety of side effects to KVM's software model. 4745 */ 4746 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4747 4748 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4749 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4750 4751 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4752 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4753 4754 nested_ept_uninit_mmu_context(vcpu); 4755 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4756 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4757 4758 /* 4759 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4760 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4761 * VMFail, like everything else we just need to ensure our 4762 * software model is up-to-date. 4763 */ 4764 if (enable_ept && is_pae_paging(vcpu)) 4765 ept_save_pdptrs(vcpu); 4766 4767 kvm_mmu_reset_context(vcpu); 4768 4769 /* 4770 * This nasty bit of open coding is a compromise between blindly 4771 * loading L1's MSRs using the exit load lists (incorrect emulation 4772 * of VMFail), leaving the nested VM's MSRs in the software model 4773 * (incorrect behavior) and snapshotting the modified MSRs (too 4774 * expensive since the lists are unbound by hardware). For each 4775 * MSR that was (prematurely) loaded from the nested VMEntry load 4776 * list, reload it from the exit load list if it exists and differs 4777 * from the guest value. The intent is to stuff host state as 4778 * silently as possible, not to fully process the exit load list. 4779 */ 4780 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4781 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4782 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4783 pr_debug_ratelimited( 4784 "%s read MSR index failed (%u, 0x%08llx)\n", 4785 __func__, i, gpa); 4786 goto vmabort; 4787 } 4788 4789 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4790 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4791 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4792 pr_debug_ratelimited( 4793 "%s read MSR failed (%u, 0x%08llx)\n", 4794 __func__, j, gpa); 4795 goto vmabort; 4796 } 4797 if (h.index != g.index) 4798 continue; 4799 if (h.value == g.value) 4800 break; 4801 4802 if (nested_vmx_load_msr_check(vcpu, &h)) { 4803 pr_debug_ratelimited( 4804 "%s check failed (%u, 0x%x, 0x%x)\n", 4805 __func__, j, h.index, h.reserved); 4806 goto vmabort; 4807 } 4808 4809 if (kvm_set_msr(vcpu, h.index, h.value)) { 4810 pr_debug_ratelimited( 4811 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4812 __func__, j, h.index, h.value); 4813 goto vmabort; 4814 } 4815 } 4816 } 4817 4818 return; 4819 4820 vmabort: 4821 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4822 } 4823 4824 /* 4825 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4826 * and modify vmcs12 to make it see what it would expect to see there if 4827 * L2 was its real guest. 
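 *
 * Note: a vm_exit_reason of -1 is a KVM-internal convention used when
 * nested state is being torn down without reflecting an exit into
 * vmcs12; no exit information is recorded in that case (see the
 * vm_exit_reason != -1 checks below).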
Must only be called when in L2 (is_guest_mode()) 4828 */ 4829 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 4830 u32 exit_intr_info, unsigned long exit_qualification) 4831 { 4832 struct vcpu_vmx *vmx = to_vmx(vcpu); 4833 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4834 4835 /* Pending MTF traps are discarded on VM-Exit. */ 4836 vmx->nested.mtf_pending = false; 4837 4838 /* trying to cancel vmlaunch/vmresume is a bug */ 4839 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4840 4841 #ifdef CONFIG_KVM_HYPERV 4842 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 4843 /* 4844 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 4845 * Enlightened VMCS after migration and we still need to 4846 * do that when something is forcing L2->L1 exit prior to 4847 * the first L2 run. 4848 */ 4849 (void)nested_get_evmcs_page(vcpu); 4850 } 4851 #endif 4852 4853 /* Service pending TLB flush requests for L2 before switching to L1. */ 4854 kvm_service_local_tlb_flush_requests(vcpu); 4855 4856 /* 4857 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 4858 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 4859 * up-to-date before switching to L1. 4860 */ 4861 if (enable_ept && is_pae_paging(vcpu)) 4862 vmx_ept_load_pdptrs(vcpu); 4863 4864 leave_guest_mode(vcpu); 4865 4866 if (nested_cpu_has_preemption_timer(vmcs12)) 4867 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4868 4869 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 4870 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 4871 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 4872 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 4873 } 4874 4875 if (likely(!vmx->fail)) { 4876 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4877 4878 if (vm_exit_reason != -1) 4879 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 4880 exit_intr_info, exit_qualification); 4881 4882 /* 4883 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4884 * also be used to capture vmcs12 cache as part of 4885 * capturing nVMX state for snapshot (migration). 4886 * 4887 * Otherwise, this flush will dirty guest memory at a 4888 * point it is already assumed by user-space to be 4889 * immutable. 4890 */ 4891 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4892 } else { 4893 /* 4894 * The only expected VM-instruction error is "VM entry with 4895 * invalid control field(s)." Anything else indicates a 4896 * problem with L0. And we should never get here with a 4897 * VMFail of any type if early consistency checks are enabled. 4898 */ 4899 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4900 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4901 WARN_ON_ONCE(nested_early_check); 4902 } 4903 4904 /* 4905 * Drop events/exceptions that were queued for re-injection to L2 4906 * (picked up via vmx_complete_interrupts()), as well as exceptions 4907 * that were pending for L2. Note, this must NOT be hoisted above 4908 * prepare_vmcs12(), events/exceptions queued for re-injection need to 4909 * be captured in vmcs12 (see vmcs12_save_pending_event()). 4910 */ 4911 vcpu->arch.nmi_injected = false; 4912 kvm_clear_exception_queue(vcpu); 4913 kvm_clear_interrupt_queue(vcpu); 4914 4915 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4916 4917 /* 4918 * If IBRS is advertised to the vCPU, KVM must flush the indirect 4919 * branch predictors when transitioning from L2 to L1, as L1 expects 4920 * hardware (KVM in this case) to provide separate predictor modes. 
4921 * Bare metal isolates VMX root (host) from VMX non-root (guest), but 4922 * doesn't isolate different VMCSs, i.e. in this case, doesn't provide 4923 * separate modes for L2 vs L1. 4924 */ 4925 if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) 4926 indirect_branch_prediction_barrier(); 4927 4928 /* Update any VMCS fields that might have changed while L2 ran */ 4929 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4930 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4931 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4932 if (kvm_caps.has_tsc_control) 4933 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 4934 4935 if (vmx->nested.l1_tpr_threshold != -1) 4936 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 4937 4938 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4939 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4940 vmx_set_virtual_apic_mode(vcpu); 4941 } 4942 4943 if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 4944 vmx->nested.update_vmcs01_cpu_dirty_logging = false; 4945 vmx_update_cpu_dirty_logging(vcpu); 4946 } 4947 4948 /* Unpin physical memory we referred to in vmcs02 */ 4949 kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); 4950 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4951 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4952 vmx->nested.pi_desc = NULL; 4953 4954 if (vmx->nested.reload_vmcs01_apic_access_page) { 4955 vmx->nested.reload_vmcs01_apic_access_page = false; 4956 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4957 } 4958 4959 if (vmx->nested.update_vmcs01_apicv_status) { 4960 vmx->nested.update_vmcs01_apicv_status = false; 4961 kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); 4962 } 4963 4964 if ((vm_exit_reason != -1) && 4965 (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) 4966 vmx->nested.need_vmcs12_to_shadow_sync = true; 4967 4968 /* in case we halted in L2 */ 4969 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4970 4971 if (likely(!vmx->fail)) { 4972 if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4973 nested_exit_intr_ack_set(vcpu)) { 4974 int irq = kvm_cpu_get_interrupt(vcpu); 4975 WARN_ON(irq < 0); 4976 vmcs12->vm_exit_intr_info = irq | 4977 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4978 } 4979 4980 if (vm_exit_reason != -1) 4981 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4982 vmcs12->exit_qualification, 4983 vmcs12->idt_vectoring_info_field, 4984 vmcs12->vm_exit_intr_info, 4985 vmcs12->vm_exit_intr_error_code, 4986 KVM_ISA_VMX); 4987 4988 load_vmcs12_host_state(vcpu, vmcs12); 4989 4990 return; 4991 } 4992 4993 /* 4994 * After an early L2 VM-entry failure, we're now back 4995 * in L1 which thinks it just finished a VMLAUNCH or 4996 * VMRESUME instruction, so we need to set the failure 4997 * flag and the VM-instruction error field of the VMCS 4998 * accordingly, and skip the emulated instruction. 4999 */ 5000 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 5001 5002 /* 5003 * Restore L1's host state to KVM's software model. We're here 5004 * because a consistency check was caught by hardware, which 5005 * means some amount of guest state has been propagated to KVM's 5006 * model and needs to be unwound to the host's state. 
5007 */ 5008 nested_vmx_restore_host_state(vcpu); 5009 5010 vmx->fail = 0; 5011 } 5012 5013 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 5014 { 5015 kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); 5016 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 5017 } 5018 5019 /* 5020 * Decode the memory-address operand of a vmx instruction, as recorded on an 5021 * exit caused by such an instruction (run by a guest hypervisor). 5022 * On success, returns 0. When the operand is invalid, returns 1 and throws 5023 * #UD, #GP, or #SS. 5024 */ 5025 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 5026 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 5027 { 5028 gva_t off; 5029 bool exn; 5030 struct kvm_segment s; 5031 5032 /* 5033 * According to Vol. 3B, "Information for VM Exits Due to Instruction 5034 * Execution", on an exit, vmx_instruction_info holds most of the 5035 * addressing components of the operand. Only the displacement part 5036 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 5037 * For how an actual address is calculated from all these components, 5038 * refer to Vol. 1, "Operand Addressing". 5039 */ 5040 int scaling = vmx_instruction_info & 3; 5041 int addr_size = (vmx_instruction_info >> 7) & 7; 5042 bool is_reg = vmx_instruction_info & (1u << 10); 5043 int seg_reg = (vmx_instruction_info >> 15) & 7; 5044 int index_reg = (vmx_instruction_info >> 18) & 0xf; 5045 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 5046 int base_reg = (vmx_instruction_info >> 23) & 0xf; 5047 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 5048 5049 if (is_reg) { 5050 kvm_queue_exception(vcpu, UD_VECTOR); 5051 return 1; 5052 } 5053 5054 /* Addr = segment_base + offset */ 5055 /* offset = base + [index * scale] + displacement */ 5056 off = exit_qualification; /* holds the displacement */ 5057 if (addr_size == 1) 5058 off = (gva_t)sign_extend64(off, 31); 5059 else if (addr_size == 0) 5060 off = (gva_t)sign_extend64(off, 15); 5061 if (base_is_valid) 5062 off += kvm_register_read(vcpu, base_reg); 5063 if (index_is_valid) 5064 off += kvm_register_read(vcpu, index_reg) << scaling; 5065 vmx_get_segment(vcpu, &s, seg_reg); 5066 5067 /* 5068 * The effective address, i.e. @off, of a memory operand is truncated 5069 * based on the address size of the instruction. Note that this is 5070 * the *effective address*, i.e. the address prior to accounting for 5071 * the segment's base. 5072 */ 5073 if (addr_size == 1) /* 32 bit */ 5074 off &= 0xffffffff; 5075 else if (addr_size == 0) /* 16 bit */ 5076 off &= 0xffff; 5077 5078 /* Checks for #GP/#SS exceptions. */ 5079 exn = false; 5080 if (is_long_mode(vcpu)) { 5081 /* 5082 * The virtual/linear address is never truncated in 64-bit 5083 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 5084 * address when using FS/GS with a non-zero base. 5085 */ 5086 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 5087 *ret = s.base + off; 5088 else 5089 *ret = off; 5090 5091 *ret = vmx_get_untagged_addr(vcpu, *ret, 0); 5092 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 5093 * non-canonical form. This is the only check on the memory 5094 * destination for long mode! 5095 */ 5096 exn = is_noncanonical_address(*ret, vcpu); 5097 } else { 5098 /* 5099 * When not in long mode, the virtual/linear address is 5100 * unconditionally truncated to 32 bits regardless of the 5101 * address size. 
5102 */ 5103 *ret = (s.base + off) & 0xffffffff; 5104 5105 /* Protected mode: apply checks for segment validity in the 5106 * following order: 5107 * - segment type check (#GP(0) may be thrown) 5108 * - usability check (#GP(0)/#SS(0)) 5109 * - limit check (#GP(0)/#SS(0)) 5110 */ 5111 if (wr) 5112 /* #GP(0) if the destination operand is located in a 5113 * read-only data segment or any code segment. 5114 */ 5115 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 5116 else 5117 /* #GP(0) if the source operand is located in an 5118 * execute-only code segment 5119 */ 5120 exn = ((s.type & 0xa) == 8); 5121 if (exn) { 5122 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 5123 return 1; 5124 } 5125 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 5126 */ 5127 exn = (s.unusable != 0); 5128 5129 /* 5130 * Protected mode: #GP(0)/#SS(0) if the memory operand is 5131 * outside the segment limit. All CPUs that support VMX ignore 5132 * limit checks for flat segments, i.e. segments with base==0, 5133 * limit==0xffffffff and of type expand-up data or code. 5134 */ 5135 if (!(s.base == 0 && s.limit == 0xffffffff && 5136 ((s.type & 8) || !(s.type & 4)))) 5137 exn = exn || ((u64)off + len - 1 > s.limit); 5138 } 5139 if (exn) { 5140 kvm_queue_exception_e(vcpu, 5141 seg_reg == VCPU_SREG_SS ? 5142 SS_VECTOR : GP_VECTOR, 5143 0); 5144 return 1; 5145 } 5146 5147 return 0; 5148 } 5149 5150 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 5151 int *ret) 5152 { 5153 gva_t gva; 5154 struct x86_exception e; 5155 int r; 5156 5157 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5158 vmcs_read32(VMX_INSTRUCTION_INFO), false, 5159 sizeof(*vmpointer), &gva)) { 5160 *ret = 1; 5161 return -EINVAL; 5162 } 5163 5164 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 5165 if (r != X86EMUL_CONTINUE) { 5166 *ret = kvm_handle_memory_failure(vcpu, r, &e); 5167 return -EINVAL; 5168 } 5169 5170 return 0; 5171 } 5172 5173 /* 5174 * Allocate a shadow VMCS and associate it with the currently loaded 5175 * VMCS, unless such a shadow VMCS already exists. The newly allocated 5176 * VMCS is also VMCLEARed, so that it is ready for use. 5177 */ 5178 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 5179 { 5180 struct vcpu_vmx *vmx = to_vmx(vcpu); 5181 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 5182 5183 /* 5184 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it 5185 * when L1 executes VMXOFF or the vCPU is forced out of nested 5186 * operation. VMXON faults if the CPU is already post-VMXON, so it 5187 * should be impossible to already have an allocated shadow VMCS. KVM 5188 * doesn't support virtualization of VMCS shadowing, so vmcs01 should 5189 * always be the loaded VMCS. 
5190 */ 5191 if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) 5192 return loaded_vmcs->shadow_vmcs; 5193 5194 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 5195 if (loaded_vmcs->shadow_vmcs) 5196 vmcs_clear(loaded_vmcs->shadow_vmcs); 5197 5198 return loaded_vmcs->shadow_vmcs; 5199 } 5200 5201 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 5202 { 5203 struct vcpu_vmx *vmx = to_vmx(vcpu); 5204 int r; 5205 5206 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 5207 if (r < 0) 5208 goto out_vmcs02; 5209 5210 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5211 if (!vmx->nested.cached_vmcs12) 5212 goto out_cached_vmcs12; 5213 5214 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; 5215 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5216 if (!vmx->nested.cached_shadow_vmcs12) 5217 goto out_cached_shadow_vmcs12; 5218 5219 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 5220 goto out_shadow_vmcs; 5221 5222 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 5223 HRTIMER_MODE_ABS_PINNED); 5224 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 5225 5226 vmx->nested.vpid02 = allocate_vpid(); 5227 5228 vmx->nested.vmcs02_initialized = false; 5229 vmx->nested.vmxon = true; 5230 5231 if (vmx_pt_mode_is_host_guest()) { 5232 vmx->pt_desc.guest.ctl = 0; 5233 pt_update_intercept_for_msr(vcpu); 5234 } 5235 5236 return 0; 5237 5238 out_shadow_vmcs: 5239 kfree(vmx->nested.cached_shadow_vmcs12); 5240 5241 out_cached_shadow_vmcs12: 5242 kfree(vmx->nested.cached_vmcs12); 5243 5244 out_cached_vmcs12: 5245 free_loaded_vmcs(&vmx->nested.vmcs02); 5246 5247 out_vmcs02: 5248 return -ENOMEM; 5249 } 5250 5251 /* Emulate the VMXON instruction. */ 5252 static int handle_vmxon(struct kvm_vcpu *vcpu) 5253 { 5254 int ret; 5255 gpa_t vmptr; 5256 uint32_t revision; 5257 struct vcpu_vmx *vmx = to_vmx(vcpu); 5258 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 5259 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 5260 5261 /* 5262 * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter 5263 * the guest and so cannot rely on hardware to perform the check, 5264 * which has higher priority than VM-Exit (see Intel SDM's pseudocode 5265 * for VMXON). 5266 * 5267 * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 5268 * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't 5269 * force any of the relevant guest state. For a restricted guest, KVM 5270 * does force CR0.PE=1, but only to also force VM86 in order to emulate 5271 * Real Mode, and so there's no need to check CR0.PE manually. 5272 */ 5273 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { 5274 kvm_queue_exception(vcpu, UD_VECTOR); 5275 return 1; 5276 } 5277 5278 /* 5279 * The CPL is checked for "not in VMX operation" and for "in VMX root", 5280 * and has higher priority than the VM-Fail due to being post-VMXON, 5281 * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, 5282 * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits 5283 * from L2 to L1, i.e. there's no need to check for the vCPU being in 5284 * VMX non-root. 5285 * 5286 * Forwarding the VM-Exit unconditionally, i.e. without performing the 5287 * #UD checks (see above), is functionally ok because KVM doesn't allow 5288 * L1 to run L2 without CR4.VMXE=1, and because KVM never modifies L2's 5289 * CR0 or CR4, i.e. it's L1's responsibility to emulate #UDs that are 5290 * missed by hardware due to shadowing CR0 and/or CR4.
5291 */ 5292 if (vmx_get_cpl(vcpu)) { 5293 kvm_inject_gp(vcpu, 0); 5294 return 1; 5295 } 5296 5297 if (vmx->nested.vmxon) 5298 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 5299 5300 /* 5301 * Invalid CR0/CR4 generates #GP. These checks are performed if and 5302 * only if the vCPU isn't already in VMX operation, i.e. effectively 5303 * have lower priority than the VM-Fail above. 5304 */ 5305 if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || 5306 !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { 5307 kvm_inject_gp(vcpu, 0); 5308 return 1; 5309 } 5310 5311 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 5312 != VMXON_NEEDED_FEATURES) { 5313 kvm_inject_gp(vcpu, 0); 5314 return 1; 5315 } 5316 5317 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 5318 return ret; 5319 5320 /* 5321 * SDM 3: 24.11.5 5322 * The first 4 bytes of VMXON region contain the supported 5323 * VMCS revision identifier 5324 * 5325 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 5326 * which replaces physical address width with 32 5327 */ 5328 if (!page_address_valid(vcpu, vmptr)) 5329 return nested_vmx_failInvalid(vcpu); 5330 5331 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 5332 revision != VMCS12_REVISION) 5333 return nested_vmx_failInvalid(vcpu); 5334 5335 vmx->nested.vmxon_ptr = vmptr; 5336 ret = enter_vmx_operation(vcpu); 5337 if (ret) 5338 return ret; 5339 5340 return nested_vmx_succeed(vcpu); 5341 } 5342 5343 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 5344 { 5345 struct vcpu_vmx *vmx = to_vmx(vcpu); 5346 5347 if (vmx->nested.current_vmptr == INVALID_GPA) 5348 return; 5349 5350 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 5351 5352 if (enable_shadow_vmcs) { 5353 /* copy to memory all shadowed fields in case 5354 they were modified */ 5355 copy_shadow_to_vmcs12(vmx); 5356 vmx_disable_shadow_vmcs(vmx); 5357 } 5358 vmx->nested.posted_intr_nv = -1; 5359 5360 /* Flush VMCS12 to guest memory */ 5361 kvm_vcpu_write_guest_page(vcpu, 5362 vmx->nested.current_vmptr >> PAGE_SHIFT, 5363 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 5364 5365 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 5366 5367 vmx->nested.current_vmptr = INVALID_GPA; 5368 } 5369 5370 /* Emulate the VMXOFF instruction */ 5371 static int handle_vmxoff(struct kvm_vcpu *vcpu) 5372 { 5373 if (!nested_vmx_check_permission(vcpu)) 5374 return 1; 5375 5376 free_nested(vcpu); 5377 5378 if (kvm_apic_has_pending_init_or_sipi(vcpu)) 5379 kvm_make_request(KVM_REQ_EVENT, vcpu); 5380 5381 return nested_vmx_succeed(vcpu); 5382 } 5383 5384 /* Emulate the VMCLEAR instruction */ 5385 static int handle_vmclear(struct kvm_vcpu *vcpu) 5386 { 5387 struct vcpu_vmx *vmx = to_vmx(vcpu); 5388 u32 zero = 0; 5389 gpa_t vmptr; 5390 int r; 5391 5392 if (!nested_vmx_check_permission(vcpu)) 5393 return 1; 5394 5395 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5396 return r; 5397 5398 if (!page_address_valid(vcpu, vmptr)) 5399 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 5400 5401 if (vmptr == vmx->nested.vmxon_ptr) 5402 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 5403 5404 if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { 5405 if (vmptr == vmx->nested.current_vmptr) 5406 nested_release_vmcs12(vcpu); 5407 5408 /* 5409 * Silently ignore memory errors on VMCLEAR, Intel's pseudocode 5410 * for VMCLEAR includes a "ensure that data for VMCS referenced 5411 * by the operand is in memory" clause that guards writes to 5412 * memory, 
i.e. doing nothing for I/O is architecturally valid. 5413 * 5414 * FIXME: Suppress failures if and only if no memslot is found, 5415 * i.e. exit to userspace if __copy_to_user() fails. 5416 */ 5417 (void)kvm_vcpu_write_guest(vcpu, 5418 vmptr + offsetof(struct vmcs12, 5419 launch_state), 5420 &zero, sizeof(zero)); 5421 } 5422 5423 return nested_vmx_succeed(vcpu); 5424 } 5425 5426 /* Emulate the VMLAUNCH instruction */ 5427 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 5428 { 5429 return nested_vmx_run(vcpu, true); 5430 } 5431 5432 /* Emulate the VMRESUME instruction */ 5433 static int handle_vmresume(struct kvm_vcpu *vcpu) 5434 { 5435 5436 return nested_vmx_run(vcpu, false); 5437 } 5438 5439 static int handle_vmread(struct kvm_vcpu *vcpu) 5440 { 5441 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5442 : get_vmcs12(vcpu); 5443 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5444 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5445 struct vcpu_vmx *vmx = to_vmx(vcpu); 5446 struct x86_exception e; 5447 unsigned long field; 5448 u64 value; 5449 gva_t gva = 0; 5450 short offset; 5451 int len, r; 5452 5453 if (!nested_vmx_check_permission(vcpu)) 5454 return 1; 5455 5456 /* Decode instruction info and find the field to read */ 5457 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5458 5459 if (!nested_vmx_is_evmptr12_valid(vmx)) { 5460 /* 5461 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5462 * any VMREAD sets the ALU flags for VMfailInvalid. 5463 */ 5464 if (vmx->nested.current_vmptr == INVALID_GPA || 5465 (is_guest_mode(vcpu) && 5466 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5467 return nested_vmx_failInvalid(vcpu); 5468 5469 offset = get_vmcs12_field_offset(field); 5470 if (offset < 0) 5471 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5472 5473 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5474 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5475 5476 /* Read the field, zero-extended to a u64 value */ 5477 value = vmcs12_read_any(vmcs12, field, offset); 5478 } else { 5479 /* 5480 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an 5481 * enlightened VMCS is active VMREAD/VMWRITE instructions are 5482 * unsupported. Unfortunately, certain versions of Windows 11 5483 * don't comply with this requirement which is not enforced in 5484 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a 5485 * workaround, as misbehaving guests will panic on VM-Fail. 5486 * Note, enlightened VMCS is incompatible with shadow VMCS so 5487 * all VMREADs from L2 should go to L1. 5488 */ 5489 if (WARN_ON_ONCE(is_guest_mode(vcpu))) 5490 return nested_vmx_failInvalid(vcpu); 5491 5492 offset = evmcs_field_offset(field, NULL); 5493 if (offset < 0) 5494 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5495 5496 /* Read the field, zero-extended to a u64 value */ 5497 value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset); 5498 } 5499 5500 /* 5501 * Now copy part of this value to register or memory, as requested. 5502 * Note that the number of bits actually copied is 32 or 64 depending 5503 * on the guest's mode (32 or 64 bit), not on the given field's length. 5504 */ 5505 if (instr_info & BIT(10)) { 5506 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5507 } else { 5508 len = is_64_bit_mode(vcpu) ? 
8 : 4; 5509 if (get_vmx_mem_address(vcpu, exit_qualification, 5510 instr_info, true, len, &gva)) 5511 return 1; 5512 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5513 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5514 if (r != X86EMUL_CONTINUE) 5515 return kvm_handle_memory_failure(vcpu, r, &e); 5516 } 5517 5518 return nested_vmx_succeed(vcpu); 5519 } 5520 5521 static bool is_shadow_field_rw(unsigned long field) 5522 { 5523 switch (field) { 5524 #define SHADOW_FIELD_RW(x, y) case x: 5525 #include "vmcs_shadow_fields.h" 5526 return true; 5527 default: 5528 break; 5529 } 5530 return false; 5531 } 5532 5533 static bool is_shadow_field_ro(unsigned long field) 5534 { 5535 switch (field) { 5536 #define SHADOW_FIELD_RO(x, y) case x: 5537 #include "vmcs_shadow_fields.h" 5538 return true; 5539 default: 5540 break; 5541 } 5542 return false; 5543 } 5544 5545 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5546 { 5547 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5548 : get_vmcs12(vcpu); 5549 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5550 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5551 struct vcpu_vmx *vmx = to_vmx(vcpu); 5552 struct x86_exception e; 5553 unsigned long field; 5554 short offset; 5555 gva_t gva; 5556 int len, r; 5557 5558 /* 5559 * The value to write might be 32 or 64 bits, depending on L1's long 5560 * mode, and eventually we need to write that into a field of several 5561 * possible lengths. The code below first zero-extends the value to 64 5562 * bit (value), and then copies only the appropriate number of 5563 * bits into the vmcs12 field. 5564 */ 5565 u64 value = 0; 5566 5567 if (!nested_vmx_check_permission(vcpu)) 5568 return 1; 5569 5570 /* 5571 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5572 * any VMWRITE sets the ALU flags for VMfailInvalid. 5573 */ 5574 if (vmx->nested.current_vmptr == INVALID_GPA || 5575 (is_guest_mode(vcpu) && 5576 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5577 return nested_vmx_failInvalid(vcpu); 5578 5579 if (instr_info & BIT(10)) 5580 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5581 else { 5582 len = is_64_bit_mode(vcpu) ? 8 : 4; 5583 if (get_vmx_mem_address(vcpu, exit_qualification, 5584 instr_info, false, len, &gva)) 5585 return 1; 5586 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5587 if (r != X86EMUL_CONTINUE) 5588 return kvm_handle_memory_failure(vcpu, r, &e); 5589 } 5590 5591 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5592 5593 offset = get_vmcs12_field_offset(field); 5594 if (offset < 0) 5595 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5596 5597 /* 5598 * If the vCPU supports "VMWRITE to any supported field in the 5599 * VMCS," then the "read-only" fields are actually read/write. 5600 */ 5601 if (vmcs_field_readonly(field) && 5602 !nested_cpu_has_vmwrite_any_field(vcpu)) 5603 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5604 5605 /* 5606 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5607 * vmcs12, else we may crush a field or consume a stale value. 5608 */ 5609 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5610 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5611 5612 /* 5613 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5614 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5615 * behavior regardless of the underlying hardware, e.g. 
if an AR_BYTE 5616 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5617 * from L1 will return a different value than VMREAD from L2 (L1 sees 5618 * the stripped down value, L2 sees the full value as stored by KVM). 5619 */ 5620 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5621 value &= 0x1f0ff; 5622 5623 vmcs12_write_any(vmcs12, field, offset, value); 5624 5625 /* 5626 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5627 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5628 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5629 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5630 */ 5631 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5632 /* 5633 * L1 can read these fields without exiting, ensure the 5634 * shadow VMCS is up-to-date. 5635 */ 5636 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5637 preempt_disable(); 5638 vmcs_load(vmx->vmcs01.shadow_vmcs); 5639 5640 __vmcs_writel(field, value); 5641 5642 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5643 vmcs_load(vmx->loaded_vmcs->vmcs); 5644 preempt_enable(); 5645 } 5646 vmx->nested.dirty_vmcs12 = true; 5647 } 5648 5649 return nested_vmx_succeed(vcpu); 5650 } 5651 5652 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5653 { 5654 vmx->nested.current_vmptr = vmptr; 5655 if (enable_shadow_vmcs) { 5656 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5657 vmcs_write64(VMCS_LINK_POINTER, 5658 __pa(vmx->vmcs01.shadow_vmcs)); 5659 vmx->nested.need_vmcs12_to_shadow_sync = true; 5660 } 5661 vmx->nested.dirty_vmcs12 = true; 5662 vmx->nested.force_msr_bitmap_recalc = true; 5663 } 5664 5665 /* Emulate the VMPTRLD instruction */ 5666 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5667 { 5668 struct vcpu_vmx *vmx = to_vmx(vcpu); 5669 gpa_t vmptr; 5670 int r; 5671 5672 if (!nested_vmx_check_permission(vcpu)) 5673 return 1; 5674 5675 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5676 return r; 5677 5678 if (!page_address_valid(vcpu, vmptr)) 5679 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5680 5681 if (vmptr == vmx->nested.vmxon_ptr) 5682 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); 5683 5684 /* Forbid normal VMPTRLD if Enlightened version was used */ 5685 if (nested_vmx_is_evmptr12_valid(vmx)) 5686 return 1; 5687 5688 if (vmx->nested.current_vmptr != vmptr) { 5689 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; 5690 struct vmcs_hdr hdr; 5691 5692 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { 5693 /* 5694 * Reads from an unbacked page return all 1s, 5695 * which means that the 32 bits located at the 5696 * given physical address won't match the required 5697 * VMCS12_REVISION identifier. 5698 */ 5699 return nested_vmx_fail(vcpu, 5700 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5701 } 5702 5703 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 5704 offsetof(struct vmcs12, hdr), 5705 sizeof(hdr))) { 5706 return nested_vmx_fail(vcpu, 5707 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5708 } 5709 5710 if (hdr.revision_id != VMCS12_REVISION || 5711 (hdr.shadow_vmcs && 5712 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5713 return nested_vmx_fail(vcpu, 5714 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5715 } 5716 5717 nested_release_vmcs12(vcpu); 5718 5719 /* 5720 * Load VMCS12 from guest memory since it is not already 5721 * cached. 
5722 */ 5723 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5724 VMCS12_SIZE)) { 5725 return nested_vmx_fail(vcpu, 5726 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5727 } 5728 5729 set_current_vmptr(vmx, vmptr); 5730 } 5731 5732 return nested_vmx_succeed(vcpu); 5733 } 5734 5735 /* Emulate the VMPTRST instruction */ 5736 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5737 { 5738 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5739 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5740 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5741 struct x86_exception e; 5742 gva_t gva; 5743 int r; 5744 5745 if (!nested_vmx_check_permission(vcpu)) 5746 return 1; 5747 5748 if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) 5749 return 1; 5750 5751 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5752 true, sizeof(gpa_t), &gva)) 5753 return 1; 5754 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5755 r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr, 5756 sizeof(gpa_t), &e); 5757 if (r != X86EMUL_CONTINUE) 5758 return kvm_handle_memory_failure(vcpu, r, &e); 5759 5760 return nested_vmx_succeed(vcpu); 5761 } 5762 5763 /* Emulate the INVEPT instruction */ 5764 static int handle_invept(struct kvm_vcpu *vcpu) 5765 { 5766 struct vcpu_vmx *vmx = to_vmx(vcpu); 5767 u32 vmx_instruction_info, types; 5768 unsigned long type, roots_to_free; 5769 struct kvm_mmu *mmu; 5770 gva_t gva; 5771 struct x86_exception e; 5772 struct { 5773 u64 eptp, gpa; 5774 } operand; 5775 int i, r, gpr_index; 5776 5777 if (!(vmx->nested.msrs.secondary_ctls_high & 5778 SECONDARY_EXEC_ENABLE_EPT) || 5779 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5780 kvm_queue_exception(vcpu, UD_VECTOR); 5781 return 1; 5782 } 5783 5784 if (!nested_vmx_check_permission(vcpu)) 5785 return 1; 5786 5787 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5788 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5789 type = kvm_register_read(vcpu, gpr_index); 5790 5791 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5792 5793 if (type >= 32 || !(types & (1 << type))) 5794 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5795 5796 /* According to the Intel VMX instruction reference, the memory 5797 * operand is read even if it isn't needed (e.g., for type==global) 5798 */ 5799 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5800 vmx_instruction_info, false, sizeof(operand), &gva)) 5801 return 1; 5802 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5803 if (r != X86EMUL_CONTINUE) 5804 return kvm_handle_memory_failure(vcpu, r, &e); 5805 5806 /* 5807 * Nested EPT roots are always held through guest_mmu, 5808 * not root_mmu.
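 * INVVPID, by contrast, frees guest-mode roots in root_mmu when EPT is disabled; see handle_invvpid() below.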
5809 */ 5810 mmu = &vcpu->arch.guest_mmu; 5811 5812 switch (type) { 5813 case VMX_EPT_EXTENT_CONTEXT: 5814 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5815 return nested_vmx_fail(vcpu, 5816 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5817 5818 roots_to_free = 0; 5819 if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, 5820 operand.eptp)) 5821 roots_to_free |= KVM_MMU_ROOT_CURRENT; 5822 5823 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5824 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 5825 mmu->prev_roots[i].pgd, 5826 operand.eptp)) 5827 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5828 } 5829 break; 5830 case VMX_EPT_EXTENT_GLOBAL: 5831 roots_to_free = KVM_MMU_ROOTS_ALL; 5832 break; 5833 default: 5834 BUG(); 5835 break; 5836 } 5837 5838 if (roots_to_free) 5839 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); 5840 5841 return nested_vmx_succeed(vcpu); 5842 } 5843 5844 static int handle_invvpid(struct kvm_vcpu *vcpu) 5845 { 5846 struct vcpu_vmx *vmx = to_vmx(vcpu); 5847 u32 vmx_instruction_info; 5848 unsigned long type, types; 5849 gva_t gva; 5850 struct x86_exception e; 5851 struct { 5852 u64 vpid; 5853 u64 gla; 5854 } operand; 5855 u16 vpid02; 5856 int r, gpr_index; 5857 5858 if (!(vmx->nested.msrs.secondary_ctls_high & 5859 SECONDARY_EXEC_ENABLE_VPID) || 5860 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 5861 kvm_queue_exception(vcpu, UD_VECTOR); 5862 return 1; 5863 } 5864 5865 if (!nested_vmx_check_permission(vcpu)) 5866 return 1; 5867 5868 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5869 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5870 type = kvm_register_read(vcpu, gpr_index); 5871 5872 types = (vmx->nested.msrs.vpid_caps & 5873 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 5874 5875 if (type >= 32 || !(types & (1 << type))) 5876 return nested_vmx_fail(vcpu, 5877 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5878 5879 /* according to the intel vmx instruction reference, the memory 5880 * operand is read even if it isn't needed (e.g., for type==global) 5881 */ 5882 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5883 vmx_instruction_info, false, sizeof(operand), &gva)) 5884 return 1; 5885 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5886 if (r != X86EMUL_CONTINUE) 5887 return kvm_handle_memory_failure(vcpu, r, &e); 5888 5889 if (operand.vpid >> 16) 5890 return nested_vmx_fail(vcpu, 5891 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5892 5893 vpid02 = nested_get_vpid02(vcpu); 5894 switch (type) { 5895 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 5896 /* 5897 * LAM doesn't apply to addresses that are inputs to TLB 5898 * invalidation. 5899 */ 5900 if (!operand.vpid || 5901 is_noncanonical_address(operand.gla, vcpu)) 5902 return nested_vmx_fail(vcpu, 5903 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5904 vpid_sync_vcpu_addr(vpid02, operand.gla); 5905 break; 5906 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 5907 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 5908 if (!operand.vpid) 5909 return nested_vmx_fail(vcpu, 5910 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5911 vpid_sync_context(vpid02); 5912 break; 5913 case VMX_VPID_EXTENT_ALL_CONTEXT: 5914 vpid_sync_context(vpid02); 5915 break; 5916 default: 5917 WARN_ON_ONCE(1); 5918 return kvm_skip_emulated_instruction(vcpu); 5919 } 5920 5921 /* 5922 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 5923 * linear mappings for L2 (tagged with L2's VPID). Free all guest 5924 * roots as VPIDs are not tracked in the MMU role. 
5925 * 5926 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 5927 * an MMU when EPT is disabled. 5928 * 5929 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 5930 */ 5931 if (!enable_ept) 5932 kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); 5933 5934 return nested_vmx_succeed(vcpu); 5935 } 5936 5937 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 5938 struct vmcs12 *vmcs12) 5939 { 5940 u32 index = kvm_rcx_read(vcpu); 5941 u64 new_eptp; 5942 5943 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 5944 return 1; 5945 if (index >= VMFUNC_EPTP_ENTRIES) 5946 return 1; 5947 5948 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 5949 &new_eptp, index * 8, 8)) 5950 return 1; 5951 5952 /* 5953 * If the (L2) guest does a vmfunc to the currently 5954 * active ept pointer, we don't have to do anything else 5955 */ 5956 if (vmcs12->ept_pointer != new_eptp) { 5957 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 5958 return 1; 5959 5960 vmcs12->ept_pointer = new_eptp; 5961 nested_ept_new_eptp(vcpu); 5962 5963 if (!nested_cpu_has_vpid(vmcs12)) 5964 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 5965 } 5966 5967 return 0; 5968 } 5969 5970 static int handle_vmfunc(struct kvm_vcpu *vcpu) 5971 { 5972 struct vcpu_vmx *vmx = to_vmx(vcpu); 5973 struct vmcs12 *vmcs12; 5974 u32 function = kvm_rax_read(vcpu); 5975 5976 /* 5977 * VMFUNC should never execute cleanly while L1 is active; KVM supports 5978 * VMFUNC for nested VMs, but not for L1. 5979 */ 5980 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { 5981 kvm_queue_exception(vcpu, UD_VECTOR); 5982 return 1; 5983 } 5984 5985 vmcs12 = get_vmcs12(vcpu); 5986 5987 /* 5988 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 5989 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 5990 */ 5991 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 5992 kvm_queue_exception(vcpu, UD_VECTOR); 5993 return 1; 5994 } 5995 5996 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 5997 goto fail; 5998 5999 switch (function) { 6000 case 0: 6001 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 6002 goto fail; 6003 break; 6004 default: 6005 goto fail; 6006 } 6007 return kvm_skip_emulated_instruction(vcpu); 6008 6009 fail: 6010 /* 6011 * This is effectively a reflected VM-Exit, as opposed to a synthesized 6012 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 6013 * EXIT_REASON_VMFUNC as the exit reason. 6014 */ 6015 nested_vmx_vmexit(vcpu, vmx->exit_reason.full, 6016 vmx_get_intr_info(vcpu), 6017 vmx_get_exit_qual(vcpu)); 6018 return 1; 6019 } 6020 6021 /* 6022 * Return true if an IO instruction with the specified port and size should cause 6023 * a VM-exit into L1. 
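 * Bitmap A covers ports 0x0000-0x7fff and bitmap B covers ports 0x8000-0xffff, one bit per port; a set bit means the access must exit to L1, and an access that runs past port 0xffff exits unconditionally.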
6024 */ 6025 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 6026 int size) 6027 { 6028 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6029 gpa_t bitmap, last_bitmap; 6030 u8 b; 6031 6032 last_bitmap = INVALID_GPA; 6033 b = -1; 6034 6035 while (size > 0) { 6036 if (port < 0x8000) 6037 bitmap = vmcs12->io_bitmap_a; 6038 else if (port < 0x10000) 6039 bitmap = vmcs12->io_bitmap_b; 6040 else 6041 return true; 6042 bitmap += (port & 0x7fff) / 8; 6043 6044 if (last_bitmap != bitmap) 6045 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 6046 return true; 6047 if (b & (1 << (port & 7))) 6048 return true; 6049 6050 port++; 6051 size--; 6052 last_bitmap = bitmap; 6053 } 6054 6055 return false; 6056 } 6057 6058 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 6059 struct vmcs12 *vmcs12) 6060 { 6061 unsigned long exit_qualification; 6062 unsigned short port; 6063 int size; 6064 6065 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 6066 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 6067 6068 exit_qualification = vmx_get_exit_qual(vcpu); 6069 6070 port = exit_qualification >> 16; 6071 size = (exit_qualification & 7) + 1; 6072 6073 return nested_vmx_check_io_bitmaps(vcpu, port, size); 6074 } 6075 6076 /* 6077 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 6078 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 6079 * disinterest in the current event (read or write a specific MSR) by using an 6080 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 6081 */ 6082 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 6083 struct vmcs12 *vmcs12, 6084 union vmx_exit_reason exit_reason) 6085 { 6086 u32 msr_index = kvm_rcx_read(vcpu); 6087 gpa_t bitmap; 6088 6089 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 6090 return true; 6091 6092 /* 6093 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 6094 * for the four combinations of read/write and low/high MSR numbers. 6095 * First we need to figure out which of the four to use: 6096 */ 6097 bitmap = vmcs12->msr_bitmap; 6098 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 6099 bitmap += 2048; 6100 if (msr_index >= 0xc0000000) { 6101 msr_index -= 0xc0000000; 6102 bitmap += 1024; 6103 } 6104 6105 /* Then read the msr_index'th bit from this bitmap: */ 6106 if (msr_index < 1024*8) { 6107 unsigned char b; 6108 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 6109 return true; 6110 return 1 & (b >> (msr_index & 7)); 6111 } else 6112 return true; /* let L1 handle the wrong parameter */ 6113 } 6114 6115 /* 6116 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 6117 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 6118 * intercept (via guest_host_mask etc.) the current event. 
6119 */ 6120 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 6121 struct vmcs12 *vmcs12) 6122 { 6123 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 6124 int cr = exit_qualification & 15; 6125 int reg; 6126 unsigned long val; 6127 6128 switch ((exit_qualification >> 4) & 3) { 6129 case 0: /* mov to cr */ 6130 reg = (exit_qualification >> 8) & 15; 6131 val = kvm_register_read(vcpu, reg); 6132 switch (cr) { 6133 case 0: 6134 if (vmcs12->cr0_guest_host_mask & 6135 (val ^ vmcs12->cr0_read_shadow)) 6136 return true; 6137 break; 6138 case 3: 6139 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 6140 return true; 6141 break; 6142 case 4: 6143 if (vmcs12->cr4_guest_host_mask & 6144 (vmcs12->cr4_read_shadow ^ val)) 6145 return true; 6146 break; 6147 case 8: 6148 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 6149 return true; 6150 break; 6151 } 6152 break; 6153 case 2: /* clts */ 6154 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 6155 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 6156 return true; 6157 break; 6158 case 1: /* mov from cr */ 6159 switch (cr) { 6160 case 3: 6161 if (vmcs12->cpu_based_vm_exec_control & 6162 CPU_BASED_CR3_STORE_EXITING) 6163 return true; 6164 break; 6165 case 8: 6166 if (vmcs12->cpu_based_vm_exec_control & 6167 CPU_BASED_CR8_STORE_EXITING) 6168 return true; 6169 break; 6170 } 6171 break; 6172 case 3: /* lmsw */ 6173 /* 6174 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 6175 * cr0. Other attempted changes are ignored, with no exit. 6176 */ 6177 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 6178 if (vmcs12->cr0_guest_host_mask & 0xe & 6179 (val ^ vmcs12->cr0_read_shadow)) 6180 return true; 6181 if ((vmcs12->cr0_guest_host_mask & 0x1) && 6182 !(vmcs12->cr0_read_shadow & 0x1) && 6183 (val & 0x1)) 6184 return true; 6185 break; 6186 } 6187 return false; 6188 } 6189 6190 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 6191 struct vmcs12 *vmcs12) 6192 { 6193 u32 encls_leaf; 6194 6195 if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || 6196 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 6197 return false; 6198 6199 encls_leaf = kvm_rax_read(vcpu); 6200 if (encls_leaf > 62) 6201 encls_leaf = 63; 6202 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 6203 } 6204 6205 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 6206 struct vmcs12 *vmcs12, gpa_t bitmap) 6207 { 6208 u32 vmx_instruction_info; 6209 unsigned long field; 6210 u8 b; 6211 6212 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 6213 return true; 6214 6215 /* Decode instruction info and find the field to access */ 6216 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6217 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 6218 6219 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 6220 if (field >> 15) 6221 return true; 6222 6223 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 6224 return true; 6225 6226 return 1 & (b >> (field & 7)); 6227 } 6228 6229 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 6230 { 6231 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 6232 6233 if (nested_cpu_has_mtf(vmcs12)) 6234 return true; 6235 6236 /* 6237 * An MTF VM-exit may be injected into the guest by setting the 6238 * interruption-type to 7 (other event) and the vector field to 0. Such 6239 * is the case regardless of the 'monitor trap flag' VM-execution 6240 * control. 
6241 */ 6242 return entry_intr_info == (INTR_INFO_VALID_MASK 6243 | INTR_TYPE_OTHER_EVENT); 6244 } 6245 6246 /* 6247 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 6248 * L1 wants the exit. Only call this when in is_guest_mode (L2). 6249 */ 6250 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 6251 union vmx_exit_reason exit_reason) 6252 { 6253 u32 intr_info; 6254 6255 switch ((u16)exit_reason.basic) { 6256 case EXIT_REASON_EXCEPTION_NMI: 6257 intr_info = vmx_get_intr_info(vcpu); 6258 if (is_nmi(intr_info)) 6259 return true; 6260 else if (is_page_fault(intr_info)) 6261 return vcpu->arch.apf.host_apf_flags || 6262 vmx_need_pf_intercept(vcpu); 6263 else if (is_debug(intr_info) && 6264 vcpu->guest_debug & 6265 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 6266 return true; 6267 else if (is_breakpoint(intr_info) && 6268 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 6269 return true; 6270 else if (is_alignment_check(intr_info) && 6271 !vmx_guest_inject_ac(vcpu)) 6272 return true; 6273 else if (is_ve_fault(intr_info)) 6274 return true; 6275 return false; 6276 case EXIT_REASON_EXTERNAL_INTERRUPT: 6277 return true; 6278 case EXIT_REASON_MCE_DURING_VMENTRY: 6279 return true; 6280 case EXIT_REASON_EPT_VIOLATION: 6281 /* 6282 * L0 always deals with the EPT violation. If nested EPT is 6283 * used, and the nested mmu code discovers that the address is 6284 * missing in the guest EPT table (EPT12), the EPT violation 6285 * will be injected with nested_ept_inject_page_fault() 6286 */ 6287 return true; 6288 case EXIT_REASON_EPT_MISCONFIG: 6289 /* 6290 * L2 never uses directly L1's EPT, but rather L0's own EPT 6291 * table (shadow on EPT) or a merged EPT table that L0 built 6292 * (EPT on EPT). So any problems with the structure of the 6293 * table is L0's fault. 6294 */ 6295 return true; 6296 case EXIT_REASON_PREEMPTION_TIMER: 6297 return true; 6298 case EXIT_REASON_PML_FULL: 6299 /* 6300 * PML is emulated for an L1 VMM and should never be enabled in 6301 * vmcs02, always "handle" PML_FULL by exiting to userspace. 6302 */ 6303 return true; 6304 case EXIT_REASON_VMFUNC: 6305 /* VM functions are emulated through L2->L0 vmexits. */ 6306 return true; 6307 case EXIT_REASON_BUS_LOCK: 6308 /* 6309 * At present, bus lock VM exit is never exposed to L1. 6310 * Handle L2's bus locks in L0 directly. 6311 */ 6312 return true; 6313 #ifdef CONFIG_KVM_HYPERV 6314 case EXIT_REASON_VMCALL: 6315 /* Hyper-V L2 TLB flush hypercall is handled by L0 */ 6316 return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && 6317 nested_evmcs_l2_tlb_flush_enabled(vcpu) && 6318 kvm_hv_is_tlb_flush_hcall(vcpu); 6319 #endif 6320 default: 6321 break; 6322 } 6323 return false; 6324 } 6325 6326 /* 6327 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 6328 * is_guest_mode (L2). 
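 * This is consulted only after nested_vmx_l0_wants_exit() has declined the exit; see nested_vmx_reflect_vmexit().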
6329 */ 6330 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 6331 union vmx_exit_reason exit_reason) 6332 { 6333 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6334 u32 intr_info; 6335 6336 switch ((u16)exit_reason.basic) { 6337 case EXIT_REASON_EXCEPTION_NMI: 6338 intr_info = vmx_get_intr_info(vcpu); 6339 if (is_nmi(intr_info)) 6340 return true; 6341 else if (is_page_fault(intr_info)) 6342 return true; 6343 return vmcs12->exception_bitmap & 6344 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 6345 case EXIT_REASON_EXTERNAL_INTERRUPT: 6346 return nested_exit_on_intr(vcpu); 6347 case EXIT_REASON_TRIPLE_FAULT: 6348 return true; 6349 case EXIT_REASON_INTERRUPT_WINDOW: 6350 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 6351 case EXIT_REASON_NMI_WINDOW: 6352 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 6353 case EXIT_REASON_TASK_SWITCH: 6354 return true; 6355 case EXIT_REASON_CPUID: 6356 return true; 6357 case EXIT_REASON_HLT: 6358 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 6359 case EXIT_REASON_INVD: 6360 return true; 6361 case EXIT_REASON_INVLPG: 6362 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6363 case EXIT_REASON_RDPMC: 6364 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 6365 case EXIT_REASON_RDRAND: 6366 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 6367 case EXIT_REASON_RDSEED: 6368 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 6369 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 6370 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 6371 case EXIT_REASON_VMREAD: 6372 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6373 vmcs12->vmread_bitmap); 6374 case EXIT_REASON_VMWRITE: 6375 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6376 vmcs12->vmwrite_bitmap); 6377 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 6378 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 6379 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 6380 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6381 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 6382 /* 6383 * VMX instructions trap unconditionally. This allows L1 to 6384 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
6385 */ 6386 return true; 6387 case EXIT_REASON_CR_ACCESS: 6388 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 6389 case EXIT_REASON_DR_ACCESS: 6390 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 6391 case EXIT_REASON_IO_INSTRUCTION: 6392 return nested_vmx_exit_handled_io(vcpu, vmcs12); 6393 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 6394 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 6395 case EXIT_REASON_MSR_READ: 6396 case EXIT_REASON_MSR_WRITE: 6397 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 6398 case EXIT_REASON_INVALID_STATE: 6399 return true; 6400 case EXIT_REASON_MWAIT_INSTRUCTION: 6401 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 6402 case EXIT_REASON_MONITOR_TRAP_FLAG: 6403 return nested_vmx_exit_handled_mtf(vmcs12); 6404 case EXIT_REASON_MONITOR_INSTRUCTION: 6405 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 6406 case EXIT_REASON_PAUSE_INSTRUCTION: 6407 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 6408 nested_cpu_has2(vmcs12, 6409 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 6410 case EXIT_REASON_MCE_DURING_VMENTRY: 6411 return true; 6412 case EXIT_REASON_TPR_BELOW_THRESHOLD: 6413 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 6414 case EXIT_REASON_APIC_ACCESS: 6415 case EXIT_REASON_APIC_WRITE: 6416 case EXIT_REASON_EOI_INDUCED: 6417 /* 6418 * The controls for "virtualize APIC accesses," "APIC- 6419 * register virtualization," and "virtual-interrupt 6420 * delivery" only come from vmcs12. 6421 */ 6422 return true; 6423 case EXIT_REASON_INVPCID: 6424 return 6425 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 6426 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6427 case EXIT_REASON_WBINVD: 6428 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6429 case EXIT_REASON_XSETBV: 6430 return true; 6431 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 6432 /* 6433 * This should never happen, since it is not possible to 6434 * set XSS to a non-zero value---neither in L1 nor in L2. 6435 * If if it were, XSS would have to be checked against 6436 * the XSS exit bitmap in vmcs12. 6437 */ 6438 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); 6439 case EXIT_REASON_UMWAIT: 6440 case EXIT_REASON_TPAUSE: 6441 return nested_cpu_has2(vmcs12, 6442 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 6443 case EXIT_REASON_ENCLS: 6444 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 6445 case EXIT_REASON_NOTIFY: 6446 /* Notify VM exit is not exposed to L1 */ 6447 return false; 6448 default: 6449 return true; 6450 } 6451 } 6452 6453 /* 6454 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 6455 * reflected into L1. 6456 */ 6457 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 6458 { 6459 struct vcpu_vmx *vmx = to_vmx(vcpu); 6460 union vmx_exit_reason exit_reason = vmx->exit_reason; 6461 unsigned long exit_qual; 6462 u32 exit_intr_info; 6463 6464 WARN_ON_ONCE(vmx->nested.nested_run_pending); 6465 6466 /* 6467 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 6468 * has already loaded L2's state. 6469 */ 6470 if (unlikely(vmx->fail)) { 6471 trace_kvm_nested_vmenter_failed( 6472 "hardware VM-instruction error: ", 6473 vmcs_read32(VM_INSTRUCTION_ERROR)); 6474 exit_intr_info = 0; 6475 exit_qual = 0; 6476 goto reflect_vmexit; 6477 } 6478 6479 trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); 6480 6481 /* If L0 (KVM) wants the exit, it trumps L1's desires. 
*/ 6482 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6483 return false; 6484 6485 /* If L1 doesn't want the exit, handle it in L0. */ 6486 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6487 return false; 6488 6489 /* 6490 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6491 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6492 * need to be synthesized by querying the in-kernel LAPIC, but external 6493 * interrupts are never reflected to L1 so it's a non-issue. 6494 */ 6495 exit_intr_info = vmx_get_intr_info(vcpu); 6496 if (is_exception_with_error_code(exit_intr_info)) { 6497 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6498 6499 vmcs12->vm_exit_intr_error_code = 6500 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6501 } 6502 exit_qual = vmx_get_exit_qual(vcpu); 6503 6504 reflect_vmexit: 6505 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6506 return true; 6507 } 6508 6509 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6510 struct kvm_nested_state __user *user_kvm_nested_state, 6511 u32 user_data_size) 6512 { 6513 struct vcpu_vmx *vmx; 6514 struct vmcs12 *vmcs12; 6515 struct kvm_nested_state kvm_state = { 6516 .flags = 0, 6517 .format = KVM_STATE_NESTED_FORMAT_VMX, 6518 .size = sizeof(kvm_state), 6519 .hdr.vmx.flags = 0, 6520 .hdr.vmx.vmxon_pa = INVALID_GPA, 6521 .hdr.vmx.vmcs12_pa = INVALID_GPA, 6522 .hdr.vmx.preemption_timer_deadline = 0, 6523 }; 6524 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6525 &user_kvm_nested_state->data.vmx[0]; 6526 6527 if (!vcpu) 6528 return kvm_state.size + sizeof(*user_vmx_nested_state); 6529 6530 vmx = to_vmx(vcpu); 6531 vmcs12 = get_vmcs12(vcpu); 6532 6533 if (guest_can_use(vcpu, X86_FEATURE_VMX) && 6534 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6535 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6536 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6537 6538 if (vmx_has_valid_vmcs12(vcpu)) { 6539 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6540 6541 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ 6542 if (nested_vmx_is_evmptr12_set(vmx)) 6543 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6544 6545 if (is_guest_mode(vcpu) && 6546 nested_cpu_has_shadow_vmcs(vmcs12) && 6547 vmcs12->vmcs_link_pointer != INVALID_GPA) 6548 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6549 } 6550 6551 if (vmx->nested.smm.vmxon) 6552 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6553 6554 if (vmx->nested.smm.guest_mode) 6555 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6556 6557 if (is_guest_mode(vcpu)) { 6558 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6559 6560 if (vmx->nested.nested_run_pending) 6561 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6562 6563 if (vmx->nested.mtf_pending) 6564 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6565 6566 if (nested_cpu_has_preemption_timer(vmcs12) && 6567 vmx->nested.has_preemption_timer_deadline) { 6568 kvm_state.hdr.vmx.flags |= 6569 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6570 kvm_state.hdr.vmx.preemption_timer_deadline = 6571 vmx->nested.preemption_timer_deadline; 6572 } 6573 } 6574 } 6575 6576 if (user_data_size < kvm_state.size) 6577 goto out; 6578 6579 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6580 return -EFAULT; 6581 6582 if (!vmx_has_valid_vmcs12(vcpu)) 6583 goto out; 6584 6585 /* 6586 * When running L2, the authoritative vmcs12 state is in the 6587 * vmcs02. 
When running L1, the authoritative vmcs12 state is 6588 * in the shadow or enlightened vmcs linked to vmcs01, unless 6589 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 6590 * vmcs12 state is in the vmcs12 already. 6591 */ 6592 if (is_guest_mode(vcpu)) { 6593 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 6594 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 6595 } else { 6596 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 6597 if (!vmx->nested.need_vmcs12_to_shadow_sync) { 6598 if (nested_vmx_is_evmptr12_valid(vmx)) 6599 /* 6600 * L1 hypervisor is not obliged to keep eVMCS 6601 * clean fields data always up-to-date while 6602 * not in guest mode, 'hv_clean_fields' is only 6603 * supposed to be actual upon vmentry so we need 6604 * to ignore it here and do full copy. 6605 */ 6606 copy_enlightened_to_vmcs12(vmx, 0); 6607 else if (enable_shadow_vmcs) 6608 copy_shadow_to_vmcs12(vmx); 6609 } 6610 } 6611 6612 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6613 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6614 6615 /* 6616 * Copy over the full allocated size of vmcs12 rather than just the size 6617 * of the struct. 6618 */ 6619 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6620 return -EFAULT; 6621 6622 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6623 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6624 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6625 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6626 return -EFAULT; 6627 } 6628 out: 6629 return kvm_state.size; 6630 } 6631 6632 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6633 { 6634 if (is_guest_mode(vcpu)) { 6635 to_vmx(vcpu)->nested.nested_run_pending = 0; 6636 nested_vmx_vmexit(vcpu, -1, 0, 0); 6637 } 6638 free_nested(vcpu); 6639 } 6640 6641 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6642 struct kvm_nested_state __user *user_kvm_nested_state, 6643 struct kvm_nested_state *kvm_state) 6644 { 6645 struct vcpu_vmx *vmx = to_vmx(vcpu); 6646 struct vmcs12 *vmcs12; 6647 enum vm_entry_failure_code ignored; 6648 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6649 &user_kvm_nested_state->data.vmx[0]; 6650 int ret; 6651 6652 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6653 return -EINVAL; 6654 6655 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { 6656 if (kvm_state->hdr.vmx.smm.flags) 6657 return -EINVAL; 6658 6659 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) 6660 return -EINVAL; 6661 6662 /* 6663 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 6664 * enable eVMCS capability on vCPU. However, since then 6665 * code was changed such that flag signals vmcs12 should 6666 * be copied into eVMCS in guest memory. 6667 * 6668 * To preserve backwards compatibility, allow user 6669 * to set this flag even when there is no VMXON region. 
static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	enum vm_entry_failure_code ignored;
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];
	int ret;

	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
		return -EINVAL;

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
		if (kvm_state->hdr.vmx.smm.flags)
			return -EINVAL;

		if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
			return -EINVAL;

		/*
		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
		 * enable the eVMCS capability on the vCPU.  However, the code
		 * was since changed such that the flag signals that vmcs12
		 * should be copied into the eVMCS in guest memory.
		 *
		 * To preserve backwards compatibility, allow userspace to set
		 * this flag even when there is no VMXON region.
		 */
		if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
			return -EINVAL;
	} else {
		if (!guest_can_use(vcpu, X86_FEATURE_VMX))
			return -EINVAL;

		if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
			return -EINVAL;
	}

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	if (kvm_state->hdr.vmx.smm.flags &
	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
		return -EINVAL;

	/*
	 * SMM temporarily disables VMX, so we cannot be in guest mode,
	 * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
	 * must be zero.
	 */
	if (is_smm(vcpu) ?
		(kvm_state->flags &
		 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
		: kvm_state->hdr.vmx.smm.flags)
		return -EINVAL;

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
	    (!guest_can_use(vcpu, X86_FEATURE_VMX) ||
	     !vmx->nested.enlightened_vmcs_enabled))
		return -EINVAL;

	vmx_leave_nested(vcpu);

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
		return 0;

	vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
	ret = enter_vmx_operation(vcpu);
	if (ret)
		return ret;

	/* Empty 'VMXON' state is permitted if no VMCS is loaded. */
	if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
		/* See vmx_has_valid_vmcs12(). */
		if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
		    (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
		    (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
			return -EINVAL;
		else
			return 0;
	}
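	/*
	 * From here on, a vmcs12 must come from somewhere: either a vmptr in
	 * guest memory (the common case), or an enlightened VMCS whose
	 * mapping is deferred to nested_get_vmcs12_pages(); anything else is
	 * rejected below.
	 */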
	if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
		if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
		    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
			return -EINVAL;

		set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
#ifdef CONFIG_KVM_HYPERV
	} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
		/*
		 * nested_vmx_handle_enlightened_vmptrld() cannot be called
		 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
		 * restored yet.  The eVMCS will be mapped from
		 * nested_get_vmcs12_pages().
		 */
		vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
#endif
	} else {
		return -EINVAL;
	}

	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
		vmx->nested.smm.vmxon = true;
		vmx->nested.vmxon = false;

		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
			vmx->nested.smm.guest_mode = true;
	}

	vmcs12 = get_vmcs12(vcpu);
	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
		return -EFAULT;

	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return 0;

	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	vmx->nested.mtf_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);

	ret = -EINVAL;
	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);

		if (kvm_state->size <
		    sizeof(*kvm_state) +
		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
			goto error_guest_mode;

		if (copy_from_user(shadow_vmcs12,
				   user_vmx_nested_state->shadow_vmcs12,
				   sizeof(*shadow_vmcs12))) {
			ret = -EFAULT;
			goto error_guest_mode;
		}

		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    !shadow_vmcs12->hdr.shadow_vmcs)
			goto error_guest_mode;
	}

	vmx->nested.has_preemption_timer_deadline = false;
	if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
		vmx->nested.has_preemption_timer_deadline = true;
		vmx->nested.preemption_timer_deadline =
			kvm_state->hdr.vmx.preemption_timer_deadline;
	}

	if (nested_vmx_check_controls(vcpu, vmcs12) ||
	    nested_vmx_check_host_state(vcpu, vmcs12) ||
	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
		goto error_guest_mode;

	vmx->nested.dirty_vmcs12 = true;
	vmx->nested.force_msr_bitmap_recalc = true;
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
	if (ret)
		goto error_guest_mode;

	if (vmx->nested.mtf_pending)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	return 0;

error_guest_mode:
	vmx->nested.nested_run_pending = 0;
	return ret;
}

void nested_vmx_set_vmcs_shadowing_bitmap(void)
{
	if (enable_shadow_vmcs) {
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
	}
}

/*
 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6.  Undo
 * that madness to get the encoding for comparison.
 */
#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))

static u64 nested_vmx_calc_vmcs_enum_msr(void)
{
	/*
	 * Note these are the so-called "index" of the VMCS field encoding,
	 * not the index into vmcs12.
	 */
	unsigned int max_idx, idx;
	int i;

	/*
	 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
	 * vmcs12, regardless of whether or not the associated feature is
	 * exposed to L1.  Simply find the field with the highest index.
	 */
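	/*
	 * Worked example of the rotation: GUEST_CS_SELECTOR has encoding
	 * 0x802, so it sits at vmcs12_field_offsets[ROL16(0x802, 6)], i.e.
	 * index 0x82.  VMCS12_IDX_TO_ENC(0x82) rotates right by 6 to recover
	 * 0x802, and vmcs_field_index() then extracts bits 9:1 of the
	 * encoding, i.e. 1.  The largest such value is reported in bits 9:1
	 * of the emulated IA32_VMX_VMCS_ENUM, per the SDM.
	 */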
	max_idx = 0;
	for (i = 0; i < nr_vmcs12_fields; i++) {
		/* The vmcs12 table is very, very sparsely populated. */
		if (!vmcs12_field_offsets[i])
			continue;

		idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
		if (idx > max_idx)
			max_idx = idx;
	}

	return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
}

static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->pinbased_ctls_low =
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(enable_apicv ? PIN_BASED_POSTED_INTR : 0);
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;
}

static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
		VM_EXIT_CLEAR_BNDCFGS;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
		VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
}

static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf,
					struct nested_vmx_msrs *msrs)
{
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
		 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);

	/* We support free control of debug control loading. */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
}
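/*
 * All of the setup helpers above and below follow the same pattern: the
 * "low" word collects the must-be-1 bits and the "high" word the may-be-1
 * bits, seeded from the host's vmcs_config, masked down to what KVM knows
 * how to virtualize, and then OR'ed with bits KVM can emulate without
 * hardware support.  A vmcs12 control value 'val' then passes the nested
 * VM-Enter consistency checks only if
 *
 *	(val & low) == low && (val & ~high) == 0
 *
 * which is what vmx_control_verify() checks.
 */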
static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
	msrs->procbased_ctls_high &=
		CPU_BASED_INTR_WINDOW_EXITING |
		CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware.  For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
}

static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
					    struct vmcs_config *vmcs_conf,
					    struct nested_vmx_msrs *msrs)
{
	msrs->secondary_ctls_low = 0;

	msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_ENABLE_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_ENABLE_VMFUNC |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_ENABLE_XSAVES |
		SECONDARY_EXEC_TSC_SCALING |
		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps =
			VMX_EPT_PAGE_WALK_4_BIT |
			VMX_EPT_PAGE_WALK_5_BIT |
			VMX_EPTP_WB_BIT |
			VMX_EPT_INVEPT_BIT |
			VMX_EPT_EXECUTE_ONLY_BIT;

		msrs->ept_caps &= ept_caps;
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}

		/*
		 * Advertise EPTP switching irrespective of hardware support,
		 * KVM emulates it in software so long as VMFUNC is supported.
		 */
		if (cpu_has_vmx_vmfunc())
			msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version without
	 * checking for support, so declare that it is supported even
	 * though it is treated as global context.  The alternative is
	 * not failing the single-context invvpid, and it is worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	if (enable_sgx)
		msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
}

static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT |
		VMX_MISC_ACTIVITY_WAIT_SIPI;
	msrs->misc_high = 0;
}
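/*
 * For reference, the IA32_VMX_BASIC layout assembled below, per the SDM:
 * bits 30:0 hold the VMCS revision identifier, bits 44:32 the VMCS region
 * size, bits 53:50 the memory type used to access the VMCS (6 == write-back),
 * bit 54 indicates that VM exits due to INS/OUTS report instruction
 * information, and bit 55 advertises the "true" control MSRs.
 */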
static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
{
	/*
	 * This MSR reports some information about VMX support.  We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic =
		VMCS12_REVISION |
		VMX_BASIC_TRUE_CTLS |
		((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
		(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);

	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;
}

static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
{
	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/*
	 * These MSRs specify the bits which may be set; anything clear in
	 * them must be kept fixed off by the guest.
	 */
	rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);

	if (vmx_umip_emulated())
		msrs->cr4_fixed1 |= X86_CR4_UMIP;
}
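/*
 * The fixed0/fixed1 pairs set up above feed the CR0/CR4 checks performed on
 * nested VM-Enter: a value 'val' is legal while in VMX operation only if
 *
 *	(val & fixed0) == fixed0 && (val & ~fixed1) == 0
 *
 * i.e. every bit set in fixed0 is set and every bit clear in fixed1 is
 * clear (modulo the unrestricted-guest relaxation for CR0.PE and CR0.PG).
 */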
/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields
 * are valid during nested entry from L1 to L2.
 * Each of these control MSRs has a low and high 32-bit half: a low bit is on
 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 * bit in the high half is on if the corresponding bit in the control field
 * may be on.  See also vmx_control_verify().
 */
void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
{
	struct nested_vmx_msrs *msrs = &vmcs_conf->nested;

	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits.  The
	 * reason is that if one of these bits is necessary for L0, it will
	 * already be set in vmcs01; prepare_vmcs02(), which bitwise-ORs the
	 * control fields of vmcs01 and vmcs12, will then keep it set in
	 * vmcs02 - and nested_vmx_l1_wants_exit() will not pass related
	 * exits to L1.  These rules have exceptions below.
	 */
	nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_exit_ctls(vmcs_conf, msrs);

	nested_vmx_setup_entry_ctls(vmcs_conf, msrs);

	nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs);

	nested_vmx_setup_misc_data(vmcs_conf, msrs);

	nested_vmx_setup_basic(msrs);

	nested_vmx_setup_cr_fixed(msrs);

	msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
}

void nested_vmx_hardware_unsetup(void)
{
	int i;

	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++)
			free_page((unsigned long)vmx_bitmap[i]);
	}
}

__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
	int i;

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++) {
			/*
			 * The vmx_bitmap is not tied to a VM and so should
			 * not be charged to a memcg.
			 */
			vmx_bitmap[i] = (unsigned long *)
				__get_free_page(GFP_KERNEL);
			if (!vmx_bitmap[i]) {
				nested_vmx_hardware_unsetup();
				return -ENOMEM;
			}
		}

		init_vmcs_shadow_fields();
	}

	exit_handlers[EXIT_REASON_VMCLEAR]	= handle_vmclear;
	exit_handlers[EXIT_REASON_VMLAUNCH]	= handle_vmlaunch;
	exit_handlers[EXIT_REASON_VMPTRLD]	= handle_vmptrld;
	exit_handlers[EXIT_REASON_VMPTRST]	= handle_vmptrst;
	exit_handlers[EXIT_REASON_VMREAD]	= handle_vmread;
	exit_handlers[EXIT_REASON_VMRESUME]	= handle_vmresume;
	exit_handlers[EXIT_REASON_VMWRITE]	= handle_vmwrite;
	exit_handlers[EXIT_REASON_VMOFF]	= handle_vmxoff;
	exit_handlers[EXIT_REASON_VMON]		= handle_vmxon;
	exit_handlers[EXIT_REASON_INVEPT]	= handle_invept;
	exit_handlers[EXIT_REASON_INVVPID]	= handle_invvpid;
	exit_handlers[EXIT_REASON_VMFUNC]	= handle_vmfunc;

	return 0;
}

struct kvm_x86_nested_ops vmx_nested_ops = {
	.leave_nested = vmx_leave_nested,
	.is_exception_vmexit = nested_vmx_is_exception_vmexit,
	.check_events = vmx_check_nested_events,
	.has_events = vmx_has_nested_events,
	.triple_fault = nested_vmx_triple_fault,
	.get_state = vmx_get_nested_state,
	.set_state = vmx_set_nested_state,
	.get_nested_state_pages = vmx_get_nested_state_pages,
	.write_log_dirty = nested_vmx_write_pml_buffer,
#ifdef CONFIG_KVM_HYPERV
	.enable_evmcs = nested_enable_evmcs,
	.get_evmcs_version = nested_get_evmcs_version,
	.hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
#endif
};