1 // SPDX-License-Identifier: GPL-2.0 2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3 4 #include <linux/objtool.h> 5 #include <linux/percpu.h> 6 7 #include <asm/debugreg.h> 8 #include <asm/mmu_context.h> 9 #include <asm/msr.h> 10 11 #include "x86.h" 12 #include "cpuid.h" 13 #include "hyperv.h" 14 #include "mmu.h" 15 #include "nested.h" 16 #include "pmu.h" 17 #include "posted_intr.h" 18 #include "sgx.h" 19 #include "trace.h" 20 #include "vmx.h" 21 #include "smm.h" 22 #include "x86_ops.h" 23 24 static bool __read_mostly enable_shadow_vmcs = 1; 25 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); 26 27 static bool __ro_after_init warn_on_missed_cc; 28 module_param(warn_on_missed_cc, bool, 0444); 29 30 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK 31 32 /* 33 * Hyper-V requires all of these, so mark them as supported even though 34 * they are just treated the same as all-context. 35 */ 36 #define VMX_VPID_EXTENT_SUPPORTED_MASK \ 37 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ 38 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ 39 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ 40 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) 41 42 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 43 44 enum { 45 VMX_VMREAD_BITMAP, 46 VMX_VMWRITE_BITMAP, 47 VMX_BITMAP_NR 48 }; 49 static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; 50 51 #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) 52 #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) 53 54 struct shadow_vmcs_field { 55 u16 encoding; 56 u16 offset; 57 }; 58 static struct shadow_vmcs_field shadow_read_only_fields[] = { 59 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, 60 #include "vmcs_shadow_fields.h" 61 }; 62 static int max_shadow_read_only_fields = 63 ARRAY_SIZE(shadow_read_only_fields); 64 65 static struct shadow_vmcs_field shadow_read_write_fields[] = { 66 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, 67 #include "vmcs_shadow_fields.h" 68 }; 69 static int 
max_shadow_read_write_fields = 70 ARRAY_SIZE(shadow_read_write_fields); 71 72 static void init_vmcs_shadow_fields(void) 73 { 74 int i, j; 75 76 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 77 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 78 79 for (i = j = 0; i < max_shadow_read_only_fields; i++) { 80 struct shadow_vmcs_field entry = shadow_read_only_fields[i]; 81 u16 field = entry.encoding; 82 83 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 84 (i + 1 == max_shadow_read_only_fields || 85 shadow_read_only_fields[i + 1].encoding != field + 1)) 86 pr_err("Missing field from shadow_read_only_field %x\n", 87 field + 1); 88 89 if (get_vmcs12_field_offset(field) < 0) 90 continue; 91 92 clear_bit(field, vmx_vmread_bitmap); 93 if (field & 1) 94 #ifdef CONFIG_X86_64 95 continue; 96 #else 97 entry.offset += sizeof(u32); 98 #endif 99 shadow_read_only_fields[j++] = entry; 100 } 101 max_shadow_read_only_fields = j; 102 103 for (i = j = 0; i < max_shadow_read_write_fields; i++) { 104 struct shadow_vmcs_field entry = shadow_read_write_fields[i]; 105 u16 field = entry.encoding; 106 107 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 108 (i + 1 == max_shadow_read_write_fields || 109 shadow_read_write_fields[i + 1].encoding != field + 1)) 110 pr_err("Missing field from shadow_read_write_field %x\n", 111 field + 1); 112 113 WARN_ONCE(field >= GUEST_ES_AR_BYTES && 114 field <= GUEST_TR_AR_BYTES, 115 "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); 116 117 if (get_vmcs12_field_offset(field) < 0) 118 continue; 119 120 /* 121 * KVM emulates PML and the VMX preemption timer irrespective 122 * of hardware support, but shadowing their related VMCS fields 123 * requires hardware support as the CPU will reject VMWRITEs to 124 * fields that don't exist. 
125 */ 126 switch (field) { 127 case GUEST_PML_INDEX: 128 if (!cpu_has_vmx_pml()) 129 continue; 130 break; 131 case VMX_PREEMPTION_TIMER_VALUE: 132 if (!cpu_has_vmx_preemption_timer()) 133 continue; 134 break; 135 default: 136 break; 137 } 138 139 clear_bit(field, vmx_vmwrite_bitmap); 140 clear_bit(field, vmx_vmread_bitmap); 141 if (field & 1) 142 #ifdef CONFIG_X86_64 143 continue; 144 #else 145 entry.offset += sizeof(u32); 146 #endif 147 shadow_read_write_fields[j++] = entry; 148 } 149 max_shadow_read_write_fields = j; 150 } 151 152 /* 153 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), 154 * set the success or error code of an emulated VMX instruction (as specified 155 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated 156 * instruction. 157 */ 158 static int nested_vmx_succeed(struct kvm_vcpu *vcpu) 159 { 160 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) 161 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 162 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); 163 return kvm_skip_emulated_instruction(vcpu); 164 } 165 166 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) 167 { 168 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 169 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | 170 X86_EFLAGS_SF | X86_EFLAGS_OF)) 171 | X86_EFLAGS_CF); 172 return kvm_skip_emulated_instruction(vcpu); 173 } 174 175 static int nested_vmx_failValid(struct kvm_vcpu *vcpu, 176 u32 vm_instruction_error) 177 { 178 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 179 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 180 X86_EFLAGS_SF | X86_EFLAGS_OF)) 181 | X86_EFLAGS_ZF); 182 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; 183 /* 184 * We don't need to force sync to shadow VMCS because 185 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all 186 * fields and thus must be synced. 
187 */ 188 if (nested_vmx_is_evmptr12_set(to_vmx(vcpu))) 189 to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true; 190 191 return kvm_skip_emulated_instruction(vcpu); 192 } 193 194 static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) 195 { 196 struct vcpu_vmx *vmx = to_vmx(vcpu); 197 198 /* 199 * failValid writes the error number to the current VMCS, which 200 * can't be done if there isn't a current VMCS. 201 */ 202 if (vmx->nested.current_vmptr == INVALID_GPA && 203 !nested_vmx_is_evmptr12_valid(vmx)) 204 return nested_vmx_failInvalid(vcpu); 205 206 return nested_vmx_failValid(vcpu, vm_instruction_error); 207 } 208 209 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) 210 { 211 /* TODO: not to reset guest simply here. */ 212 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 213 pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator); 214 } 215 216 static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 217 { 218 return fixed_bits_valid(control, low, high); 219 } 220 221 static inline u64 vmx_control_msr(u32 low, u32 high) 222 { 223 return low | ((u64)high << 32); 224 } 225 226 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) 227 { 228 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 229 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 230 vmx->nested.need_vmcs12_to_shadow_sync = false; 231 } 232 233 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) 234 { 235 #ifdef CONFIG_KVM_HYPERV 236 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); 237 struct vcpu_vmx *vmx = to_vmx(vcpu); 238 239 kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map); 240 vmx->nested.hv_evmcs = NULL; 241 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 242 243 if (hv_vcpu) { 244 hv_vcpu->nested.pa_page_gpa = INVALID_GPA; 245 hv_vcpu->nested.vm_id = 0; 246 hv_vcpu->nested.vp_id = 0; 247 } 248 #endif 249 } 250 251 static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) 252 { 253 #ifdef 
CONFIG_KVM_HYPERV 254 struct vcpu_vmx *vmx = to_vmx(vcpu); 255 /* 256 * When Enlightened VMEntry is enabled on the calling CPU we treat 257 * memory area pointer by vmptr as Enlightened VMCS (as there's no good 258 * way to distinguish it from VMCS12) and we must not corrupt it by 259 * writing to the non-existent 'launch_state' field. The area doesn't 260 * have to be the currently active EVMCS on the calling CPU and there's 261 * nothing KVM has to do to transition it from 'active' to 'non-active' 262 * state. It is possible that the area will stay mapped as 263 * vmx->nested.hv_evmcs but this shouldn't be a problem. 264 */ 265 if (!guest_cpu_cap_has_evmcs(vcpu) || 266 !evmptr_is_valid(nested_get_evmptr(vcpu))) 267 return false; 268 269 if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr) 270 nested_release_evmcs(vcpu); 271 272 return true; 273 #else 274 return false; 275 #endif 276 } 277 278 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, 279 struct loaded_vmcs *prev) 280 { 281 struct vmcs_host_state *dest, *src; 282 283 if (unlikely(!vmx->vt.guest_state_loaded)) 284 return; 285 286 src = &prev->host_state; 287 dest = &vmx->loaded_vmcs->host_state; 288 289 vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); 290 dest->ldt_sel = src->ldt_sel; 291 #ifdef CONFIG_X86_64 292 dest->ds_sel = src->ds_sel; 293 dest->es_sel = src->es_sel; 294 #endif 295 } 296 297 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) 298 { 299 struct vcpu_vmx *vmx = to_vmx(vcpu); 300 struct loaded_vmcs *prev; 301 int cpu; 302 303 if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) 304 return; 305 306 cpu = get_cpu(); 307 prev = vmx->loaded_vmcs; 308 vmx->loaded_vmcs = vmcs; 309 vmx_vcpu_load_vmcs(vcpu, cpu); 310 vmx_sync_vmcs_host_state(vmx, prev); 311 put_cpu(); 312 313 kvm_clear_available_registers(vcpu, VMX_REGS_LAZY_LOAD_SET); 314 315 /* 316 * All lazily updated registers will be reloaded from VMCS12 on both 317 * 
vmentry and vmexit. 318 */ 319 kvm_reset_dirty_registers(vcpu); 320 } 321 322 static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu) 323 { 324 struct vcpu_vmx *vmx = to_vmx(vcpu); 325 326 kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map); 327 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map); 328 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map); 329 vmx->nested.pi_desc = NULL; 330 } 331 332 /* 333 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 334 * just stops using VMX. 335 */ 336 static void free_nested(struct kvm_vcpu *vcpu) 337 { 338 struct vcpu_vmx *vmx = to_vmx(vcpu); 339 340 if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) 341 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 342 343 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) 344 return; 345 346 kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 347 348 vmx->nested.vmxon = false; 349 vmx->nested.smm.vmxon = false; 350 vmx->nested.vmxon_ptr = INVALID_GPA; 351 free_vpid(vmx->nested.vpid02); 352 vmx->nested.posted_intr_nv = -1; 353 vmx->nested.current_vmptr = INVALID_GPA; 354 if (enable_shadow_vmcs) { 355 vmx_disable_shadow_vmcs(vmx); 356 vmcs_clear(vmx->vmcs01.shadow_vmcs); 357 free_vmcs(vmx->vmcs01.shadow_vmcs); 358 vmx->vmcs01.shadow_vmcs = NULL; 359 } 360 kfree(vmx->nested.cached_vmcs12); 361 vmx->nested.cached_vmcs12 = NULL; 362 kfree(vmx->nested.cached_shadow_vmcs12); 363 vmx->nested.cached_shadow_vmcs12 = NULL; 364 365 nested_put_vmcs12_pages(vcpu); 366 367 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 368 369 nested_release_evmcs(vcpu); 370 371 free_loaded_vmcs(&vmx->nested.vmcs02); 372 } 373 374 /* 375 * Ensure that the current vmcs of the logical processor is the 376 * vmcs01 of the vcpu before calling free_nested(). 
377 */ 378 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) 379 { 380 vcpu_load(vcpu); 381 vmx_leave_nested(vcpu); 382 vcpu_put(vcpu); 383 } 384 385 #define EPTP_PA_MASK GENMASK_ULL(51, 12) 386 387 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) 388 { 389 return VALID_PAGE(root_hpa) && 390 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); 391 } 392 393 static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, 394 gpa_t addr) 395 { 396 unsigned long roots = 0; 397 uint i; 398 struct kvm_mmu_root_info *cached_root; 399 400 WARN_ON_ONCE(!mmu_is_nested(vcpu)); 401 402 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 403 cached_root = &vcpu->arch.mmu->prev_roots[i]; 404 405 if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, 406 eptp)) 407 roots |= KVM_MMU_ROOT_PREVIOUS(i); 408 } 409 if (roots) 410 kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots); 411 } 412 413 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 414 struct x86_exception *fault) 415 { 416 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 417 struct vcpu_vmx *vmx = to_vmx(vcpu); 418 unsigned long exit_qualification; 419 u32 vm_exit_reason; 420 421 if (vmx->nested.pml_full) { 422 vm_exit_reason = EXIT_REASON_PML_FULL; 423 vmx->nested.pml_full = false; 424 425 /* 426 * It should be impossible to trigger a nested PML Full VM-Exit 427 * for anything other than an EPT Violation from L2. KVM *can* 428 * trigger nEPT page fault injection in response to an EPT 429 * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT 430 * tables also changed, but KVM should not treat EPT Misconfig 431 * VM-Exits as writes. 432 */ 433 WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION); 434 435 /* 436 * PML Full and EPT Violation VM-Exits both use bit 12 to report 437 * "NMI unblocking due to IRET", i.e. the bit can be propagated 438 * as-is from the original EXIT_QUALIFICATION. 
439 */ 440 exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI; 441 } else { 442 if (fault->error_code & PFERR_RSVD_MASK) { 443 vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; 444 exit_qualification = 0; 445 } else { 446 u64 mask = EPT_VIOLATION_GVA_IS_VALID | 447 EPT_VIOLATION_GVA_TRANSLATED; 448 if (vmx->nested.msrs.ept_caps & VMX_EPT_ADVANCED_VMEXIT_INFO_BIT) 449 mask |= EPT_VIOLATION_GVA_USER | 450 EPT_VIOLATION_GVA_WRITABLE | 451 EPT_VIOLATION_GVA_NX; 452 exit_qualification = fault->exit_qualification; 453 exit_qualification |= vmx_get_exit_qual(vcpu) & mask; 454 vm_exit_reason = EXIT_REASON_EPT_VIOLATION; 455 } 456 457 /* 458 * Although the caller (kvm_inject_emulated_page_fault) would 459 * have already synced the faulting address in the shadow EPT 460 * tables for the current EPTP12, we also need to sync it for 461 * any other cached EPTP02s based on the same EP4TA, since the 462 * TLB associates mappings to the EP4TA rather than the full EPTP. 463 */ 464 nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, 465 fault->address); 466 } 467 468 nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); 469 vmcs12->guest_physical_address = fault->address; 470 } 471 472 static inline bool nested_ept_mbec_enabled(struct kvm_vcpu *vcpu) 473 { 474 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 475 476 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC); 477 } 478 479 static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) 480 { 481 struct vcpu_vmx *vmx = to_vmx(vcpu); 482 bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT; 483 int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps); 484 485 kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level, 486 nested_ept_ad_enabled(vcpu), 487 nested_ept_mbec_enabled(vcpu), 488 nested_ept_get_eptp(vcpu)); 489 } 490 491 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 492 { 493 WARN_ON(mmu_is_nested(vcpu)); 494 495 vcpu->arch.mmu = 
&vcpu->arch.guest_mmu; 496 nested_ept_new_eptp(vcpu); 497 vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; 498 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; 499 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; 500 501 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 502 } 503 504 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) 505 { 506 vcpu->arch.mmu = &vcpu->arch.root_mmu; 507 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 508 } 509 510 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, 511 u16 error_code) 512 { 513 bool inequality, bit; 514 515 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; 516 inequality = 517 (error_code & vmcs12->page_fault_error_code_mask) != 518 vmcs12->page_fault_error_code_match; 519 return inequality ^ bit; 520 } 521 522 static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, 523 u32 error_code) 524 { 525 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 526 527 /* 528 * Drop bits 31:16 of the error code when performing the #PF mask+match 529 * check. All VMCS fields involved are 32 bits, but Intel CPUs never 530 * set bits 31:16 and VMX disallows setting bits 31:16 in the injected 531 * error code. Including the to-be-dropped bits in the check might 532 * result in an "impossible" or missed exit from L1's perspective. 
533 */ 534 if (vector == PF_VECTOR) 535 return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code); 536 537 return (vmcs12->exception_bitmap & (1u << vector)); 538 } 539 540 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, 541 struct vmcs12 *vmcs12) 542 { 543 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 544 return 0; 545 546 if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || 547 CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) 548 return -EINVAL; 549 550 return 0; 551 } 552 553 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, 554 struct vmcs12 *vmcs12) 555 { 556 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 557 return 0; 558 559 if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) 560 return -EINVAL; 561 562 return 0; 563 } 564 565 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, 566 struct vmcs12 *vmcs12) 567 { 568 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 569 return 0; 570 571 if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) 572 return -EINVAL; 573 574 if (CC(!nested_cpu_has_vid(vmcs12) && vmcs12->tpr_threshold >> 4)) 575 return -EINVAL; 576 577 return 0; 578 } 579 580 /* 581 * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 582 * itself utilizing x2APIC. All MSRs were previously set to be intercepted, 583 * only the "disable intercept" case needs to be handled. 
584 */ 585 static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, 586 unsigned long *msr_bitmap_l0, 587 u32 msr, int type) 588 { 589 if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) 590 vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); 591 592 if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) 593 vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); 594 } 595 596 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) 597 { 598 int msr; 599 600 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 601 unsigned word = msr / BITS_PER_LONG; 602 603 msr_bitmap[word] = ~0; 604 msr_bitmap[word + (0x800 / sizeof(long))] = ~0; 605 } 606 } 607 608 #define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ 609 static inline \ 610 void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ 611 unsigned long *msr_bitmap_l1, \ 612 unsigned long *msr_bitmap_l0, u32 msr) \ 613 { \ 614 if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ 615 vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ 616 vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 617 else \ 618 vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 619 } 620 BUILD_NVMX_MSR_INTERCEPT_HELPER(read) 621 BUILD_NVMX_MSR_INTERCEPT_HELPER(write) 622 623 static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, 624 unsigned long *msr_bitmap_l1, 625 unsigned long *msr_bitmap_l0, 626 u32 msr, int types) 627 { 628 if (types & MSR_TYPE_R) 629 nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, 630 msr_bitmap_l0, msr); 631 if (types & MSR_TYPE_W) 632 nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, 633 msr_bitmap_l0, msr); 634 } 635 636 #define nested_vmx_merge_msr_bitmaps(msr, type) \ 637 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, \ 638 msr_bitmap_l0, msr, type) 639 640 #define nested_vmx_merge_msr_bitmaps_read(msr) \ 641 nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_R) 642 643 #define 
nested_vmx_merge_msr_bitmaps_write(msr) \ 644 nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_W) 645 646 #define nested_vmx_merge_msr_bitmaps_rw(msr) \ 647 nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_RW) 648 649 static void nested_vmx_merge_pmu_msr_bitmaps(struct kvm_vcpu *vcpu, 650 unsigned long *msr_bitmap_l1, 651 unsigned long *msr_bitmap_l0) 652 { 653 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 654 struct vcpu_vmx *vmx = to_vmx(vcpu); 655 int i; 656 657 /* 658 * Skip the merges if the vCPU doesn't have a mediated PMU MSR, i.e. if 659 * none of the MSRs can possibly be passed through to L1. 660 */ 661 if (!kvm_vcpu_has_mediated_pmu(vcpu)) 662 return; 663 664 for (i = 0; i < pmu->nr_arch_gp_counters; i++) { 665 nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_PERFCTR0 + i); 666 nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_PMC0 + i); 667 } 668 669 for (i = 0; i < pmu->nr_arch_fixed_counters; i++) 670 nested_vmx_merge_msr_bitmaps_rw(MSR_CORE_PERF_FIXED_CTR0 + i); 671 672 nested_vmx_merge_msr_bitmaps_rw(MSR_CORE_PERF_GLOBAL_CTRL); 673 nested_vmx_merge_msr_bitmaps_read(MSR_CORE_PERF_GLOBAL_STATUS); 674 nested_vmx_merge_msr_bitmaps_write(MSR_CORE_PERF_GLOBAL_OVF_CTRL); 675 } 676 677 /* 678 * Merge L0's and L1's MSR bitmap, return false to indicate that 679 * we do not use the hardware. 680 */ 681 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 682 struct vmcs12 *vmcs12) 683 { 684 struct vcpu_vmx *vmx = to_vmx(vcpu); 685 int msr; 686 unsigned long *msr_bitmap_l1; 687 unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; 688 struct kvm_host_map map; 689 690 /* Nothing to do if the MSR bitmap is not in use. */ 691 if (!cpu_has_vmx_msr_bitmap() || 692 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 693 return false; 694 695 /* 696 * MSR bitmap update can be skipped when: 697 * - MSR bitmap for L1 hasn't changed. 698 * - Nested hypervisor (L1) is attempting to launch the same L2 as 699 * before. 
700 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature 701 * and tells KVM (L0) there were no changes in MSR bitmap for L2. 702 */ 703 if (!vmx->nested.force_msr_bitmap_recalc) { 704 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 705 706 if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap && 707 evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) 708 return true; 709 } 710 711 if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map)) 712 return false; 713 714 msr_bitmap_l1 = (unsigned long *)map.hva; 715 716 /* 717 * To keep the control flow simple, pay eight 8-byte writes (sixteen 718 * 4-byte writes on 32-bit systems) up front to enable intercepts for 719 * the x2APIC MSR range and selectively toggle those relevant to L2. 720 */ 721 enable_x2apic_msr_intercepts(msr_bitmap_l0); 722 723 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { 724 if (nested_cpu_has_apic_reg_virt(vmcs12)) { 725 /* 726 * L0 need not intercept reads for MSRs between 0x800 727 * and 0x8ff, it just lets the processor take the value 728 * from the virtual-APIC page; take those 256 bits 729 * directly from the L1 bitmap. 730 */ 731 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 732 unsigned word = msr / BITS_PER_LONG; 733 734 msr_bitmap_l0[word] = msr_bitmap_l1[word]; 735 } 736 } 737 738 nested_vmx_disable_intercept_for_x2apic_msr( 739 msr_bitmap_l1, msr_bitmap_l0, 740 X2APIC_MSR(APIC_TASKPRI), 741 MSR_TYPE_R | MSR_TYPE_W); 742 743 if (nested_cpu_has_vid(vmcs12)) { 744 nested_vmx_disable_intercept_for_x2apic_msr( 745 msr_bitmap_l1, msr_bitmap_l0, 746 X2APIC_MSR(APIC_EOI), 747 MSR_TYPE_W); 748 nested_vmx_disable_intercept_for_x2apic_msr( 749 msr_bitmap_l1, msr_bitmap_l0, 750 X2APIC_MSR(APIC_SELF_IPI), 751 MSR_TYPE_W); 752 } 753 } 754 755 /* 756 * Always check vmcs01's bitmap to honor userspace MSR filters and any 757 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. 
758 */ 759 #ifdef CONFIG_X86_64 760 nested_vmx_merge_msr_bitmaps_rw(MSR_FS_BASE); 761 nested_vmx_merge_msr_bitmaps_rw(MSR_GS_BASE); 762 nested_vmx_merge_msr_bitmaps_rw(MSR_KERNEL_GS_BASE); 763 #endif 764 nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_SPEC_CTRL); 765 nested_vmx_merge_msr_bitmaps_write(MSR_IA32_PRED_CMD); 766 nested_vmx_merge_msr_bitmaps_write(MSR_IA32_FLUSH_CMD); 767 768 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 769 MSR_IA32_APERF, MSR_TYPE_R); 770 771 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 772 MSR_IA32_MPERF, MSR_TYPE_R); 773 774 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 775 MSR_IA32_U_CET, MSR_TYPE_RW); 776 777 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 778 MSR_IA32_S_CET, MSR_TYPE_RW); 779 780 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 781 MSR_IA32_PL0_SSP, MSR_TYPE_RW); 782 783 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 784 MSR_IA32_PL1_SSP, MSR_TYPE_RW); 785 786 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 787 MSR_IA32_PL2_SSP, MSR_TYPE_RW); 788 789 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 790 MSR_IA32_PL3_SSP, MSR_TYPE_RW); 791 792 nested_vmx_merge_pmu_msr_bitmaps(vcpu, msr_bitmap_l1, msr_bitmap_l0); 793 794 kvm_vcpu_unmap(vcpu, &map); 795 796 vmx->nested.force_msr_bitmap_recalc = false; 797 798 return true; 799 } 800 801 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, 802 struct vmcs12 *vmcs12) 803 { 804 struct vcpu_vmx *vmx = to_vmx(vcpu); 805 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 806 807 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 808 vmcs12->vmcs_link_pointer == INVALID_GPA) 809 return; 810 811 if (ghc->gpa != vmcs12->vmcs_link_pointer && 812 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 813 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 814 return; 815 816 kvm_read_guest_cached(vcpu->kvm, ghc, 
get_shadow_vmcs12(vcpu), 817 VMCS12_SIZE); 818 } 819 820 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, 821 struct vmcs12 *vmcs12) 822 { 823 struct vcpu_vmx *vmx = to_vmx(vcpu); 824 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 825 826 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 827 vmcs12->vmcs_link_pointer == INVALID_GPA) 828 return; 829 830 if (ghc->gpa != vmcs12->vmcs_link_pointer && 831 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 832 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 833 return; 834 835 kvm_write_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu), 836 VMCS12_SIZE); 837 } 838 839 /* 840 * In nested virtualization, check if L1 has set 841 * VM_EXIT_ACK_INTR_ON_EXIT 842 */ 843 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) 844 { 845 return get_vmcs12(vcpu)->vm_exit_controls & 846 VM_EXIT_ACK_INTR_ON_EXIT; 847 } 848 849 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, 850 struct vmcs12 *vmcs12) 851 { 852 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && 853 CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) 854 return -EINVAL; 855 else 856 return 0; 857 } 858 859 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, 860 struct vmcs12 *vmcs12) 861 { 862 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && 863 !nested_cpu_has_apic_reg_virt(vmcs12) && 864 !nested_cpu_has_vid(vmcs12) && 865 !nested_cpu_has_posted_intr(vmcs12)) 866 return 0; 867 868 /* 869 * If virtualize x2apic mode is enabled, 870 * virtualize apic access must be disabled. 871 */ 872 if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && 873 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) 874 return -EINVAL; 875 876 /* 877 * If virtual interrupt delivery is enabled, 878 * we must exit on external interrupts. 
879 */ 880 if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) 881 return -EINVAL; 882 883 /* 884 * bits 15:8 should be zero in posted_intr_nv, 885 * the descriptor address has been already checked 886 * in nested_get_vmcs12_pages. 887 * 888 * bits 5:0 of posted_intr_desc_addr should be zero. 889 */ 890 if (nested_cpu_has_posted_intr(vmcs12) && 891 (CC(!nested_cpu_has_vid(vmcs12)) || 892 CC(!nested_exit_intr_ack_set(vcpu)) || 893 CC((vmcs12->posted_intr_nv & 0xff00)) || 894 CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) 895 return -EINVAL; 896 897 /* tpr shadow is needed by all apicv features. */ 898 if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) 899 return -EINVAL; 900 901 return 0; 902 } 903 904 static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) 905 { 906 struct vcpu_vmx *vmx = to_vmx(vcpu); 907 u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, 908 vmx->nested.msrs.misc_high); 909 910 return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; 911 } 912 913 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, 914 u32 count, u64 addr) 915 { 916 if (count == 0) 917 return 0; 918 919 /* 920 * Exceeding the limit results in architecturally _undefined_ behavior, 921 * i.e. KVM is allowed to do literally anything in response to a bad 922 * limit. Immediately generate a consistency check so that code that 923 * consumes the count doesn't need to worry about extreme edge cases. 
924 */ 925 if (count > nested_vmx_max_atomic_switch_msrs(vcpu)) 926 return -EINVAL; 927 928 if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || 929 !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) 930 return -EINVAL; 931 932 return 0; 933 } 934 935 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, 936 struct vmcs12 *vmcs12) 937 { 938 if (CC(nested_vmx_check_msr_switch(vcpu, 939 vmcs12->vm_exit_msr_load_count, 940 vmcs12->vm_exit_msr_load_addr)) || 941 CC(nested_vmx_check_msr_switch(vcpu, 942 vmcs12->vm_exit_msr_store_count, 943 vmcs12->vm_exit_msr_store_addr))) 944 return -EINVAL; 945 946 return 0; 947 } 948 949 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, 950 struct vmcs12 *vmcs12) 951 { 952 if (CC(nested_vmx_check_msr_switch(vcpu, 953 vmcs12->vm_entry_msr_load_count, 954 vmcs12->vm_entry_msr_load_addr))) 955 return -EINVAL; 956 957 return 0; 958 } 959 960 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, 961 struct vmcs12 *vmcs12) 962 { 963 if (!nested_cpu_has_pml(vmcs12)) 964 return 0; 965 966 if (CC(!nested_cpu_has_ept(vmcs12)) || 967 CC(!page_address_valid(vcpu, vmcs12->pml_address))) 968 return -EINVAL; 969 970 return 0; 971 } 972 973 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, 974 struct vmcs12 *vmcs12) 975 { 976 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && 977 !nested_cpu_has_ept(vmcs12))) 978 return -EINVAL; 979 return 0; 980 } 981 982 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, 983 struct vmcs12 *vmcs12) 984 { 985 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && 986 !nested_cpu_has_ept(vmcs12))) 987 return -EINVAL; 988 return 0; 989 } 990 991 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, 992 struct vmcs12 *vmcs12) 993 { 994 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 995 return 0; 996 997 if 
(CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || 998 CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) 999 return -EINVAL; 1000 1001 return 0; 1002 } 1003 1004 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, 1005 struct vmx_msr_entry *e) 1006 { 1007 /* x2APIC MSR accesses are not allowed */ 1008 if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) 1009 return -EINVAL; 1010 if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ 1011 CC(e->index == MSR_IA32_UCODE_REV)) 1012 return -EINVAL; 1013 if (CC(e->reserved != 0)) 1014 return -EINVAL; 1015 return 0; 1016 } 1017 1018 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, 1019 struct vmx_msr_entry *e) 1020 { 1021 if (CC(e->index == MSR_FS_BASE) || 1022 CC(e->index == MSR_GS_BASE) || 1023 CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ 1024 nested_vmx_msr_check_common(vcpu, e)) 1025 return -EINVAL; 1026 return 0; 1027 } 1028 1029 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, 1030 struct vmx_msr_entry *e) 1031 { 1032 if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */ 1033 nested_vmx_msr_check_common(vcpu, e)) 1034 return -EINVAL; 1035 return 0; 1036 } 1037 1038 /* 1039 * Load guest's/host's msr at nested entry/exit. 1040 * return 0 for success, entry index for failure. 1041 * 1042 * One of the failure modes for MSR load/store is when a list exceeds the 1043 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch 1044 * as possible, process all valid entries before failing rather than precheck 1045 * for a capacity violation. 
1046 */ 1047 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 1048 { 1049 u32 i; 1050 struct vmx_msr_entry e; 1051 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 1052 1053 for (i = 0; i < count; i++) { 1054 if (WARN_ON_ONCE(i >= max_msr_list_size)) 1055 goto fail; 1056 1057 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), 1058 &e, sizeof(e))) { 1059 pr_debug_ratelimited( 1060 "%s cannot read MSR entry (%u, 0x%08llx)\n", 1061 __func__, i, gpa + i * sizeof(e)); 1062 goto fail; 1063 } 1064 if (nested_vmx_load_msr_check(vcpu, &e)) { 1065 pr_debug_ratelimited( 1066 "%s check failed (%u, 0x%x, 0x%x)\n", 1067 __func__, i, e.index, e.reserved); 1068 goto fail; 1069 } 1070 if (kvm_emulate_msr_write(vcpu, e.index, e.value)) { 1071 pr_debug_ratelimited( 1072 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 1073 __func__, i, e.index, e.value); 1074 goto fail; 1075 } 1076 } 1077 return 0; 1078 fail: 1079 /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */ 1080 return i + 1; 1081 } 1082 1083 static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, 1084 u32 msr_index, 1085 u64 *data) 1086 { 1087 struct vcpu_vmx *vmx = to_vmx(vcpu); 1088 1089 /* 1090 * If the L0 hypervisor stored a more accurate value for the TSC that 1091 * does not include the time taken for emulation of the L2->L1 1092 * VM-exit in L0, use the more accurate value. 
1093 */ 1094 if (msr_index == MSR_IA32_TSC && vmx->nested.tsc_autostore_slot >= 0) { 1095 int slot = vmx->nested.tsc_autostore_slot; 1096 u64 host_tsc = vmx->msr_autostore.val[slot].value; 1097 1098 *data = kvm_read_l1_tsc(vcpu, host_tsc); 1099 return true; 1100 } 1101 1102 if (kvm_emulate_msr_read(vcpu, msr_index, data)) { 1103 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, 1104 msr_index); 1105 return false; 1106 } 1107 return true; 1108 } 1109 1110 static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, 1111 struct vmx_msr_entry *e) 1112 { 1113 if (kvm_vcpu_read_guest(vcpu, 1114 gpa + i * sizeof(*e), 1115 e, 2 * sizeof(u32))) { 1116 pr_debug_ratelimited( 1117 "%s cannot read MSR entry (%u, 0x%08llx)\n", 1118 __func__, i, gpa + i * sizeof(*e)); 1119 return false; 1120 } 1121 if (nested_vmx_store_msr_check(vcpu, e)) { 1122 pr_debug_ratelimited( 1123 "%s check failed (%u, 0x%x, 0x%x)\n", 1124 __func__, i, e->index, e->reserved); 1125 return false; 1126 } 1127 return true; 1128 } 1129 1130 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 1131 { 1132 u64 data; 1133 u32 i; 1134 struct vmx_msr_entry e; 1135 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 1136 1137 for (i = 0; i < count; i++) { 1138 if (WARN_ON_ONCE(i >= max_msr_list_size)) 1139 return -EINVAL; 1140 1141 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1142 return -EINVAL; 1143 1144 if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) 1145 return -EINVAL; 1146 1147 if (kvm_vcpu_write_guest(vcpu, 1148 gpa + i * sizeof(e) + 1149 offsetof(struct vmx_msr_entry, value), 1150 &data, sizeof(data))) { 1151 pr_debug_ratelimited( 1152 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 1153 __func__, i, e.index, data); 1154 return -EINVAL; 1155 } 1156 } 1157 return 0; 1158 } 1159 1160 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) 1161 { 1162 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1163 u32 count = 
vmcs12->vm_exit_msr_store_count; 1164 u64 gpa = vmcs12->vm_exit_msr_store_addr; 1165 struct vmx_msr_entry e; 1166 u32 i; 1167 1168 for (i = 0; i < count; i++) { 1169 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1170 return false; 1171 1172 if (e.index == msr_index) 1173 return true; 1174 } 1175 return false; 1176 } 1177 1178 /* 1179 * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are 1180 * emulating VM-Entry into a guest with EPT enabled. On failure, the expected 1181 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to 1182 * @entry_failure_code. 1183 */ 1184 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, 1185 bool nested_ept, bool reload_pdptrs, 1186 enum vm_entry_failure_code *entry_failure_code) 1187 { 1188 if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) { 1189 *entry_failure_code = ENTRY_FAIL_DEFAULT; 1190 return -EINVAL; 1191 } 1192 1193 /* 1194 * If PAE paging and EPT are both on, CR3 is not used by the CPU and 1195 * must not be dereferenced. 1196 */ 1197 if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && 1198 CC(!load_pdptrs(vcpu, cr3))) { 1199 *entry_failure_code = ENTRY_FAIL_PDPTE; 1200 return -EINVAL; 1201 } 1202 1203 vcpu->arch.cr3 = cr3; 1204 kvm_register_mark_dirty(vcpu, VCPU_REG_CR3); 1205 1206 /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ 1207 kvm_init_mmu(vcpu); 1208 1209 if (!nested_ept) 1210 kvm_mmu_new_pgd(vcpu, cr3); 1211 1212 return 0; 1213 } 1214 1215 /* 1216 * Returns if KVM is able to config CPU to tag TLB entries 1217 * populated by L2 differently than TLB entries populated 1218 * by L1. 1219 * 1220 * If L0 uses EPT, L1 and L2 run with different EPTP because 1221 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries 1222 * are tagged with different EPTP. 
1223 * 1224 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged 1225 * with different VPID (L1 entries are tagged with vmx->vpid 1226 * while L2 entries are tagged with vmx->nested.vpid02). 1227 */ 1228 static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) 1229 { 1230 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1231 1232 return enable_ept || 1233 (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); 1234 } 1235 1236 static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, 1237 struct vmcs12 *vmcs12, 1238 bool is_vmenter) 1239 { 1240 struct vcpu_vmx *vmx = to_vmx(vcpu); 1241 1242 /* Handle pending Hyper-V TLB flush requests */ 1243 kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept); 1244 1245 /* 1246 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the 1247 * same VPID as the host, and so architecturally, linear and combined 1248 * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM 1249 * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2, 1250 * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This 1251 * is required if VPID is disabled in KVM, as a TLB flush (there are no 1252 * VPIDs) still occurs from L1's perspective, and KVM may need to 1253 * synchronize the MMU in response to the guest TLB flush. 1254 * 1255 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. 1256 * EPT is a special snowflake, as guest-physical mappings aren't 1257 * flushed on VPID invalidations, including VM-Enter or VM-Exit with 1258 * VPID disabled. As a result, KVM _never_ needs to sync nEPT 1259 * entries on VM-Enter because L1 can't rely on VM-Enter to flush 1260 * those mappings. 1261 */ 1262 if (!nested_cpu_has_vpid(vmcs12)) { 1263 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1264 return; 1265 } 1266 1267 /* L2 should never have a VPID if VPID is disabled. */ 1268 WARN_ON(!enable_vpid); 1269 1270 /* 1271 * VPID is enabled and in use by vmcs12. 
If vpid12 is changing, then 1272 * emulate a guest TLB flush as KVM does not track vpid12 history nor 1273 * is the VPID incorporated into the MMU context. I.e. KVM must assume 1274 * that the new vpid12 has never been used and thus represents a new 1275 * guest ASID that cannot have entries in the TLB. 1276 */ 1277 if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) { 1278 vmx->nested.last_vpid = vmcs12->virtual_processor_id; 1279 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1280 return; 1281 } 1282 1283 /* 1284 * If VPID is enabled, used by vmc12, and vpid12 is not changing but 1285 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and 1286 * KVM was unable to allocate a VPID for L2, flush the current context 1287 * as the effective ASID is common to both L1 and L2. 1288 */ 1289 if (!nested_has_guest_tlb_tag(vcpu)) 1290 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 1291 } 1292 1293 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) 1294 { 1295 superset &= mask; 1296 subset &= mask; 1297 1298 return (superset | subset) == superset; 1299 } 1300 1301 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) 1302 { 1303 const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT | 1304 VMX_BASIC_INOUT | 1305 VMX_BASIC_TRUE_CTLS | 1306 VMX_BASIC_NO_HW_ERROR_CODE_CC; 1307 1308 const u64 reserved_bits = GENMASK_ULL(63, 57) | 1309 GENMASK_ULL(47, 45) | 1310 BIT_ULL(31); 1311 1312 u64 vmx_basic = vmcs_config.nested.basic; 1313 1314 BUILD_BUG_ON(feature_bits & reserved_bits); 1315 1316 /* 1317 * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has 1318 * inverted polarity), the incoming value must not set feature bits or 1319 * reserved bits that aren't allowed/supported by KVM. Fields, i.e. 1320 * multi-bit values, are explicitly checked below. 
1321 */ 1322 if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits)) 1323 return -EINVAL; 1324 1325 /* 1326 * KVM does not emulate a version of VMX that constrains physical 1327 * addresses of VMX structures (e.g. VMCS) to 32-bits. 1328 */ 1329 if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) 1330 return -EINVAL; 1331 1332 if (vmx_basic_vmcs_revision_id(vmx_basic) != 1333 vmx_basic_vmcs_revision_id(data)) 1334 return -EINVAL; 1335 1336 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) 1337 return -EINVAL; 1338 1339 vmx->nested.msrs.basic = data; 1340 return 0; 1341 } 1342 1343 static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index, 1344 u32 **low, u32 **high) 1345 { 1346 switch (msr_index) { 1347 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1348 *low = &msrs->pinbased_ctls_low; 1349 *high = &msrs->pinbased_ctls_high; 1350 break; 1351 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1352 *low = &msrs->procbased_ctls_low; 1353 *high = &msrs->procbased_ctls_high; 1354 break; 1355 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1356 *low = &msrs->exit_ctls_low; 1357 *high = &msrs->exit_ctls_high; 1358 break; 1359 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1360 *low = &msrs->entry_ctls_low; 1361 *high = &msrs->entry_ctls_high; 1362 break; 1363 case MSR_IA32_VMX_PROCBASED_CTLS2: 1364 *low = &msrs->secondary_ctls_low; 1365 *high = &msrs->secondary_ctls_high; 1366 break; 1367 default: 1368 BUG(); 1369 } 1370 } 1371 1372 static int 1373 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1374 { 1375 u32 *lowp, *highp; 1376 u64 supported; 1377 1378 vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp); 1379 1380 supported = vmx_control_msr(*lowp, *highp); 1381 1382 /* Check must-be-1 bits are still 1. */ 1383 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) 1384 return -EINVAL; 1385 1386 /* Check must-be-0 bits are still 0. 
*/ 1387 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) 1388 return -EINVAL; 1389 1390 vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp); 1391 *lowp = data; 1392 *highp = data >> 32; 1393 return 0; 1394 } 1395 1396 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) 1397 { 1398 const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA | 1399 VMX_MISC_ACTIVITY_HLT | 1400 VMX_MISC_ACTIVITY_SHUTDOWN | 1401 VMX_MISC_ACTIVITY_WAIT_SIPI | 1402 VMX_MISC_INTEL_PT | 1403 VMX_MISC_RDMSR_IN_SMM | 1404 VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 1405 VMX_MISC_VMXOFF_BLOCK_SMI | 1406 VMX_MISC_ZERO_LEN_INS; 1407 1408 const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9); 1409 1410 u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, 1411 vmcs_config.nested.misc_high); 1412 1413 BUILD_BUG_ON(feature_bits & reserved_bits); 1414 1415 /* 1416 * The incoming value must not set feature bits or reserved bits that 1417 * aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are 1418 * explicitly checked below. 
1419 */ 1420 if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits)) 1421 return -EINVAL; 1422 1423 if ((vmx->nested.msrs.pinbased_ctls_high & 1424 PIN_BASED_VMX_PREEMPTION_TIMER) && 1425 vmx_misc_preemption_timer_rate(data) != 1426 vmx_misc_preemption_timer_rate(vmx_misc)) 1427 return -EINVAL; 1428 1429 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) 1430 return -EINVAL; 1431 1432 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) 1433 return -EINVAL; 1434 1435 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) 1436 return -EINVAL; 1437 1438 vmx->nested.msrs.misc_low = data; 1439 vmx->nested.msrs.misc_high = data >> 32; 1440 1441 return 0; 1442 } 1443 1444 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) 1445 { 1446 u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps, 1447 vmcs_config.nested.vpid_caps); 1448 1449 /* Every bit is either reserved or a feature bit. */ 1450 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) 1451 return -EINVAL; 1452 1453 vmx->nested.msrs.ept_caps = data; 1454 vmx->nested.msrs.vpid_caps = data >> 32; 1455 return 0; 1456 } 1457 1458 static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index) 1459 { 1460 switch (msr_index) { 1461 case MSR_IA32_VMX_CR0_FIXED0: 1462 return &msrs->cr0_fixed0; 1463 case MSR_IA32_VMX_CR4_FIXED0: 1464 return &msrs->cr4_fixed0; 1465 default: 1466 BUG(); 1467 } 1468 } 1469 1470 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1471 { 1472 const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index); 1473 1474 /* 1475 * 1 bits (which indicates bits which "must-be-1" during VMX operation) 1476 * must be 1 in the restored value. 1477 */ 1478 if (!is_bitwise_subset(data, *msr, -1ULL)) 1479 return -EINVAL; 1480 1481 *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data; 1482 return 0; 1483 } 1484 1485 /* 1486 * Called when userspace is restoring VMX MSRs. 
1487 * 1488 * Returns 0 on success, non-0 otherwise. 1489 */ 1490 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1491 { 1492 struct vcpu_vmx *vmx = to_vmx(vcpu); 1493 1494 /* 1495 * Don't allow changes to the VMX capability MSRs while the vCPU 1496 * is in VMX operation. 1497 */ 1498 if (vmx->nested.vmxon) 1499 return -EBUSY; 1500 1501 switch (msr_index) { 1502 case MSR_IA32_VMX_BASIC: 1503 return vmx_restore_vmx_basic(vmx, data); 1504 case MSR_IA32_VMX_PINBASED_CTLS: 1505 case MSR_IA32_VMX_PROCBASED_CTLS: 1506 case MSR_IA32_VMX_EXIT_CTLS: 1507 case MSR_IA32_VMX_ENTRY_CTLS: 1508 /* 1509 * The "non-true" VMX capability MSRs are generated from the 1510 * "true" MSRs, so we do not support restoring them directly. 1511 * 1512 * If userspace wants to emulate VMX_BASIC[55]=0, userspace 1513 * should restore the "true" MSRs with the must-be-1 bits 1514 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND 1515 * DEFAULT SETTINGS". 1516 */ 1517 return -EINVAL; 1518 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1519 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1520 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1521 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1522 case MSR_IA32_VMX_PROCBASED_CTLS2: 1523 return vmx_restore_control_msr(vmx, msr_index, data); 1524 case MSR_IA32_VMX_MISC: 1525 return vmx_restore_vmx_misc(vmx, data); 1526 case MSR_IA32_VMX_CR0_FIXED0: 1527 case MSR_IA32_VMX_CR4_FIXED0: 1528 return vmx_restore_fixed0_msr(vmx, msr_index, data); 1529 case MSR_IA32_VMX_CR0_FIXED1: 1530 case MSR_IA32_VMX_CR4_FIXED1: 1531 /* 1532 * These MSRs are generated based on the vCPU's CPUID, so we 1533 * do not support restoring them directly. 
1534 */ 1535 return -EINVAL; 1536 case MSR_IA32_VMX_EPT_VPID_CAP: 1537 return vmx_restore_vmx_ept_vpid_cap(vmx, data); 1538 case MSR_IA32_VMX_VMCS_ENUM: 1539 vmx->nested.msrs.vmcs_enum = data; 1540 return 0; 1541 case MSR_IA32_VMX_VMFUNC: 1542 if (data & ~vmcs_config.nested.vmfunc_controls) 1543 return -EINVAL; 1544 vmx->nested.msrs.vmfunc_controls = data; 1545 return 0; 1546 default: 1547 /* 1548 * The rest of the VMX capability MSRs do not support restore. 1549 */ 1550 return -EINVAL; 1551 } 1552 } 1553 1554 /* Returns 0 on success, non-0 otherwise. */ 1555 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) 1556 { 1557 switch (msr_index) { 1558 case MSR_IA32_VMX_BASIC: 1559 *pdata = msrs->basic; 1560 break; 1561 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1562 case MSR_IA32_VMX_PINBASED_CTLS: 1563 *pdata = vmx_control_msr( 1564 msrs->pinbased_ctls_low, 1565 msrs->pinbased_ctls_high); 1566 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) 1567 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1568 break; 1569 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1570 case MSR_IA32_VMX_PROCBASED_CTLS: 1571 *pdata = vmx_control_msr( 1572 msrs->procbased_ctls_low, 1573 msrs->procbased_ctls_high); 1574 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) 1575 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1576 break; 1577 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1578 case MSR_IA32_VMX_EXIT_CTLS: 1579 *pdata = vmx_control_msr( 1580 msrs->exit_ctls_low, 1581 msrs->exit_ctls_high); 1582 if (msr_index == MSR_IA32_VMX_EXIT_CTLS) 1583 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 1584 break; 1585 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1586 case MSR_IA32_VMX_ENTRY_CTLS: 1587 *pdata = vmx_control_msr( 1588 msrs->entry_ctls_low, 1589 msrs->entry_ctls_high); 1590 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) 1591 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 1592 break; 1593 case MSR_IA32_VMX_MISC: 1594 *pdata = vmx_control_msr( 1595 msrs->misc_low, 1596 msrs->misc_high); 1597 break; 1598 case 
MSR_IA32_VMX_CR0_FIXED0: 1599 *pdata = msrs->cr0_fixed0; 1600 break; 1601 case MSR_IA32_VMX_CR0_FIXED1: 1602 *pdata = msrs->cr0_fixed1; 1603 break; 1604 case MSR_IA32_VMX_CR4_FIXED0: 1605 *pdata = msrs->cr4_fixed0; 1606 break; 1607 case MSR_IA32_VMX_CR4_FIXED1: 1608 *pdata = msrs->cr4_fixed1; 1609 break; 1610 case MSR_IA32_VMX_VMCS_ENUM: 1611 *pdata = msrs->vmcs_enum; 1612 break; 1613 case MSR_IA32_VMX_PROCBASED_CTLS2: 1614 *pdata = vmx_control_msr( 1615 msrs->secondary_ctls_low, 1616 msrs->secondary_ctls_high); 1617 break; 1618 case MSR_IA32_VMX_EPT_VPID_CAP: 1619 *pdata = msrs->ept_caps | 1620 ((u64)msrs->vpid_caps << 32); 1621 break; 1622 case MSR_IA32_VMX_VMFUNC: 1623 *pdata = msrs->vmfunc_controls; 1624 break; 1625 default: 1626 return 1; 1627 } 1628 1629 return 0; 1630 } 1631 1632 /* 1633 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have 1634 * been modified by the L1 guest. Note, "writable" in this context means 1635 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of 1636 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" 1637 * VM-exit information fields (which are actually writable if the vCPU is 1638 * configured to support "VMWRITE to any supported field in the VMCS"). 
1639 */ 1640 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 1641 { 1642 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1643 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1644 struct shadow_vmcs_field field; 1645 unsigned long val; 1646 int i; 1647 1648 if (WARN_ON(!shadow_vmcs)) 1649 return; 1650 1651 preempt_disable(); 1652 1653 vmcs_load(shadow_vmcs); 1654 1655 for (i = 0; i < max_shadow_read_write_fields; i++) { 1656 field = shadow_read_write_fields[i]; 1657 val = __vmcs_readl(field.encoding); 1658 vmcs12_write_any(vmcs12, field.encoding, field.offset, val); 1659 } 1660 1661 vmcs_clear(shadow_vmcs); 1662 vmcs_load(vmx->loaded_vmcs->vmcs); 1663 1664 preempt_enable(); 1665 } 1666 1667 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 1668 { 1669 const struct shadow_vmcs_field *fields[] = { 1670 shadow_read_write_fields, 1671 shadow_read_only_fields 1672 }; 1673 const int max_fields[] = { 1674 max_shadow_read_write_fields, 1675 max_shadow_read_only_fields 1676 }; 1677 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1678 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1679 struct shadow_vmcs_field field; 1680 unsigned long val; 1681 int i, q; 1682 1683 if (WARN_ON(!shadow_vmcs)) 1684 return; 1685 1686 vmcs_load(shadow_vmcs); 1687 1688 for (q = 0; q < ARRAY_SIZE(fields); q++) { 1689 for (i = 0; i < max_fields[q]; i++) { 1690 field = fields[q][i]; 1691 val = vmcs12_read_any(vmcs12, field.encoding, 1692 field.offset); 1693 __vmcs_writel(field.encoding, val); 1694 } 1695 } 1696 1697 vmcs_clear(shadow_vmcs); 1698 vmcs_load(vmx->loaded_vmcs->vmcs); 1699 } 1700 1701 static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) 1702 { 1703 #ifdef CONFIG_KVM_HYPERV 1704 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1705 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 1706 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu); 1707 1708 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ 1709 vmcs12->tpr_threshold = 
evmcs->tpr_threshold; 1710 vmcs12->guest_rip = evmcs->guest_rip; 1711 1712 if (unlikely(!(hv_clean_fields & 1713 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) { 1714 hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page; 1715 hv_vcpu->nested.vm_id = evmcs->hv_vm_id; 1716 hv_vcpu->nested.vp_id = evmcs->hv_vp_id; 1717 } 1718 1719 if (unlikely(!(hv_clean_fields & 1720 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { 1721 vmcs12->guest_rsp = evmcs->guest_rsp; 1722 vmcs12->guest_rflags = evmcs->guest_rflags; 1723 vmcs12->guest_interruptibility_info = 1724 evmcs->guest_interruptibility_info; 1725 /* 1726 * Not present in struct vmcs12: 1727 * vmcs12->guest_ssp = evmcs->guest_ssp; 1728 */ 1729 } 1730 1731 if (unlikely(!(hv_clean_fields & 1732 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { 1733 vmcs12->cpu_based_vm_exec_control = 1734 evmcs->cpu_based_vm_exec_control; 1735 } 1736 1737 if (unlikely(!(hv_clean_fields & 1738 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { 1739 vmcs12->exception_bitmap = evmcs->exception_bitmap; 1740 } 1741 1742 if (unlikely(!(hv_clean_fields & 1743 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { 1744 vmcs12->vm_entry_controls = evmcs->vm_entry_controls; 1745 } 1746 1747 if (unlikely(!(hv_clean_fields & 1748 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { 1749 vmcs12->vm_entry_intr_info_field = 1750 evmcs->vm_entry_intr_info_field; 1751 vmcs12->vm_entry_exception_error_code = 1752 evmcs->vm_entry_exception_error_code; 1753 vmcs12->vm_entry_instruction_len = 1754 evmcs->vm_entry_instruction_len; 1755 } 1756 1757 if (unlikely(!(hv_clean_fields & 1758 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { 1759 vmcs12->host_ia32_pat = evmcs->host_ia32_pat; 1760 vmcs12->host_ia32_efer = evmcs->host_ia32_efer; 1761 vmcs12->host_cr0 = evmcs->host_cr0; 1762 vmcs12->host_cr3 = evmcs->host_cr3; 1763 vmcs12->host_cr4 = evmcs->host_cr4; 1764 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; 1765 vmcs12->host_ia32_sysenter_eip = 
evmcs->host_ia32_sysenter_eip; 1766 vmcs12->host_rip = evmcs->host_rip; 1767 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; 1768 vmcs12->host_es_selector = evmcs->host_es_selector; 1769 vmcs12->host_cs_selector = evmcs->host_cs_selector; 1770 vmcs12->host_ss_selector = evmcs->host_ss_selector; 1771 vmcs12->host_ds_selector = evmcs->host_ds_selector; 1772 vmcs12->host_fs_selector = evmcs->host_fs_selector; 1773 vmcs12->host_gs_selector = evmcs->host_gs_selector; 1774 vmcs12->host_tr_selector = evmcs->host_tr_selector; 1775 vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl; 1776 /* 1777 * Not present in struct vmcs12: 1778 * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet; 1779 * vmcs12->host_ssp = evmcs->host_ssp; 1780 * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr; 1781 */ 1782 } 1783 1784 if (unlikely(!(hv_clean_fields & 1785 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { 1786 vmcs12->pin_based_vm_exec_control = 1787 evmcs->pin_based_vm_exec_control; 1788 vmcs12->vm_exit_controls = evmcs->vm_exit_controls; 1789 vmcs12->secondary_vm_exec_control = 1790 evmcs->secondary_vm_exec_control; 1791 } 1792 1793 if (unlikely(!(hv_clean_fields & 1794 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { 1795 vmcs12->io_bitmap_a = evmcs->io_bitmap_a; 1796 vmcs12->io_bitmap_b = evmcs->io_bitmap_b; 1797 } 1798 1799 if (unlikely(!(hv_clean_fields & 1800 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { 1801 vmcs12->msr_bitmap = evmcs->msr_bitmap; 1802 } 1803 1804 if (unlikely(!(hv_clean_fields & 1805 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { 1806 vmcs12->guest_es_base = evmcs->guest_es_base; 1807 vmcs12->guest_cs_base = evmcs->guest_cs_base; 1808 vmcs12->guest_ss_base = evmcs->guest_ss_base; 1809 vmcs12->guest_ds_base = evmcs->guest_ds_base; 1810 vmcs12->guest_fs_base = evmcs->guest_fs_base; 1811 vmcs12->guest_gs_base = evmcs->guest_gs_base; 1812 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; 1813 
vmcs12->guest_tr_base = evmcs->guest_tr_base; 1814 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; 1815 vmcs12->guest_idtr_base = evmcs->guest_idtr_base; 1816 vmcs12->guest_es_limit = evmcs->guest_es_limit; 1817 vmcs12->guest_cs_limit = evmcs->guest_cs_limit; 1818 vmcs12->guest_ss_limit = evmcs->guest_ss_limit; 1819 vmcs12->guest_ds_limit = evmcs->guest_ds_limit; 1820 vmcs12->guest_fs_limit = evmcs->guest_fs_limit; 1821 vmcs12->guest_gs_limit = evmcs->guest_gs_limit; 1822 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; 1823 vmcs12->guest_tr_limit = evmcs->guest_tr_limit; 1824 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; 1825 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; 1826 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; 1827 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; 1828 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; 1829 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; 1830 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; 1831 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; 1832 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; 1833 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; 1834 vmcs12->guest_es_selector = evmcs->guest_es_selector; 1835 vmcs12->guest_cs_selector = evmcs->guest_cs_selector; 1836 vmcs12->guest_ss_selector = evmcs->guest_ss_selector; 1837 vmcs12->guest_ds_selector = evmcs->guest_ds_selector; 1838 vmcs12->guest_fs_selector = evmcs->guest_fs_selector; 1839 vmcs12->guest_gs_selector = evmcs->guest_gs_selector; 1840 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; 1841 vmcs12->guest_tr_selector = evmcs->guest_tr_selector; 1842 } 1843 1844 if (unlikely(!(hv_clean_fields & 1845 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { 1846 vmcs12->tsc_offset = evmcs->tsc_offset; 1847 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; 1848 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; 1849 vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap; 1850 
vmcs12->tsc_multiplier = evmcs->tsc_multiplier; 1851 } 1852 1853 if (unlikely(!(hv_clean_fields & 1854 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { 1855 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; 1856 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; 1857 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; 1858 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; 1859 vmcs12->guest_cr0 = evmcs->guest_cr0; 1860 vmcs12->guest_cr3 = evmcs->guest_cr3; 1861 vmcs12->guest_cr4 = evmcs->guest_cr4; 1862 vmcs12->guest_dr7 = evmcs->guest_dr7; 1863 } 1864 1865 if (unlikely(!(hv_clean_fields & 1866 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { 1867 vmcs12->host_fs_base = evmcs->host_fs_base; 1868 vmcs12->host_gs_base = evmcs->host_gs_base; 1869 vmcs12->host_tr_base = evmcs->host_tr_base; 1870 vmcs12->host_gdtr_base = evmcs->host_gdtr_base; 1871 vmcs12->host_idtr_base = evmcs->host_idtr_base; 1872 vmcs12->host_rsp = evmcs->host_rsp; 1873 } 1874 1875 if (unlikely(!(hv_clean_fields & 1876 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { 1877 vmcs12->ept_pointer = evmcs->ept_pointer; 1878 vmcs12->virtual_processor_id = evmcs->virtual_processor_id; 1879 } 1880 1881 if (unlikely(!(hv_clean_fields & 1882 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { 1883 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; 1884 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; 1885 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; 1886 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; 1887 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; 1888 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; 1889 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; 1890 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; 1891 vmcs12->guest_pending_dbg_exceptions = 1892 evmcs->guest_pending_dbg_exceptions; 1893 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; 1894 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; 1895 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; 1896 vmcs12->guest_activity_state = 
evmcs->guest_activity_state; 1897 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; 1898 vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl; 1899 /* 1900 * Not present in struct vmcs12: 1901 * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet; 1902 * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; 1903 * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; 1904 */ 1905 } 1906 1907 /* 1908 * Not used? 1909 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1910 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1911 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1912 * vmcs12->page_fault_error_code_mask = 1913 * evmcs->page_fault_error_code_mask; 1914 * vmcs12->page_fault_error_code_match = 1915 * evmcs->page_fault_error_code_match; 1916 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1917 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1918 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1919 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1920 */ 1921 1922 /* 1923 * Read only fields: 1924 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1925 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1926 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1927 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1928 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1929 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1930 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1931 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1932 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1933 * vmcs12->exit_qualification = evmcs->exit_qualification; 1934 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1935 * 1936 * Not present in struct vmcs12: 1937 * 
vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
	 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
	 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
	 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
	 */

	return;
#else /* CONFIG_KVM_HYPERV */
	KVM_BUG_ON(1, vmx->vcpu.kvm);
#endif /* CONFIG_KVM_HYPERV */
}

/*
 * Copy the fields assigned below from the cached vmcs12
 * (vmx->nested.cached_vmcs12) into the enlightened VMCS returned by
 * nested_vmx_evmcs().  The fields enumerated in the comment at the top of
 * the body are intentionally not written.
 *
 * When CONFIG_KVM_HYPERV is not set, the body reduces to KVM_BUG_ON().
 */
static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
{
#ifdef CONFIG_KVM_HYPERV
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);

	/*
	 * Should not be changed by KVM:
	 *
	 * evmcs->host_es_selector = vmcs12->host_es_selector;
	 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
	 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
	 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
	 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
	 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
	 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
	 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
	 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
	 * evmcs->host_cr0 = vmcs12->host_cr0;
	 * evmcs->host_cr3 = vmcs12->host_cr3;
	 * evmcs->host_cr4 = vmcs12->host_cr4;
	 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
	 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
	 * evmcs->host_rip = vmcs12->host_rip;
	 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
	 * evmcs->host_fs_base = vmcs12->host_fs_base;
	 * evmcs->host_gs_base = vmcs12->host_gs_base;
	 * evmcs->host_tr_base = vmcs12->host_tr_base;
	 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
	 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
	 * evmcs->host_rsp = vmcs12->host_rsp;
	 *
	 * sync_vmcs02_to_vmcs12() doesn't read these:
	 *
	 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
	 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
	 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
	 * evmcs->ept_pointer = vmcs12->ept_pointer;
	 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
	 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
	 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
	 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
	 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
	 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
	 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
	 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
	 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
	 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
	 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
	 * evmcs->page_fault_error_code_mask =
	 *		vmcs12->page_fault_error_code_mask;
	 * evmcs->page_fault_error_code_match =
	 *		vmcs12->page_fault_error_code_match;
	 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
	 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
	 * evmcs->tsc_offset = vmcs12->tsc_offset;
	 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
	 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
	 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
	 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
	 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
	 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
	 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
	 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
	 * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl;
	 * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl;
	 * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap;
	 * evmcs->tsc_multiplier = vmcs12->tsc_multiplier;
	 *
	 * Not present in struct vmcs12:
	 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
	 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
	 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
	 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
	 * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet;
	 * evmcs->host_ssp = vmcs12->host_ssp;
	 * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr;
	 * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet;
	 * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl;
	 * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr;
	 * evmcs->guest_ssp = vmcs12->guest_ssp;
	 */

	/* Guest segment selectors. */
	evmcs->guest_es_selector = vmcs12->guest_es_selector;
	evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
	evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
	evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
	evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
	evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
	evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
	evmcs->guest_tr_selector = vmcs12->guest_tr_selector;

	/* Guest segment and descriptor-table limits. */
	evmcs->guest_es_limit = vmcs12->guest_es_limit;
	evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
	evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
	evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
	evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
	evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
	evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
	evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
	evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
	evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;

	/* Guest segment access rights. */
	evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
	evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
	evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
	evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
	evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
	evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
	evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
	evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;

	/* Guest segment and descriptor-table bases. */
	evmcs->guest_es_base = vmcs12->guest_es_base;
	evmcs->guest_cs_base = vmcs12->guest_cs_base;
	evmcs->guest_ss_base = vmcs12->guest_ss_base;
	evmcs->guest_ds_base = vmcs12->guest_ds_base;
	evmcs->guest_fs_base = vmcs12->guest_fs_base;
	evmcs->guest_gs_base = vmcs12->guest_gs_base;
	evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
	evmcs->guest_tr_base = vmcs12->guest_tr_base;
	evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
	evmcs->guest_idtr_base = vmcs12->guest_idtr_base;

	evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
	evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;

	evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
	evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
	evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
	evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;

	evmcs->guest_pending_dbg_exceptions =
		vmcs12->guest_pending_dbg_exceptions;
	evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
	evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;

	evmcs->guest_activity_state = vmcs12->guest_activity_state;
	evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;

	evmcs->guest_cr0 = vmcs12->guest_cr0;
	evmcs->guest_cr3 = vmcs12->guest_cr3;
	evmcs->guest_cr4 = vmcs12->guest_cr4;
	evmcs->guest_dr7 = vmcs12->guest_dr7;

	evmcs->guest_physical_address = vmcs12->guest_physical_address;

	/* VM-exit information fields. */
	evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
	evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
	evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
	evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
	evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
	evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
	evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
	evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;

	evmcs->exit_qualification = vmcs12->exit_qualification;

	evmcs->guest_linear_address = vmcs12->guest_linear_address;
	evmcs->guest_rsp = vmcs12->guest_rsp;
	evmcs->guest_rflags = vmcs12->guest_rflags;

	evmcs->guest_interruptibility_info =
		vmcs12->guest_interruptibility_info;
	evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
	evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
	evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
	evmcs->vm_entry_exception_error_code =
		vmcs12->vm_entry_exception_error_code;
	evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;

	evmcs->guest_rip = vmcs12->guest_rip;

	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;

	return;
#else /* CONFIG_KVM_HYPERV */
	KVM_BUG_ON(1, vmx->vcpu.kvm);
#endif /* CONFIG_KVM_HYPERV */
}

/*
 * This is an equivalent of the nested hypervisor executing the vmptrld
 * instruction.
*/
static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
	struct kvm_vcpu *vcpu, bool from_launch)
{
#ifdef CONFIG_KVM_HYPERV
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool evmcs_gpa_changed = false;
	u64 evmcs_gpa;

	if (likely(!guest_cpu_cap_has_evmcs(vcpu)))
		return EVMPTRLD_DISABLED;

	evmcs_gpa = nested_get_evmptr(vcpu);
	if (!evmptr_is_valid(evmcs_gpa)) {
		nested_release_evmcs(vcpu);
		return EVMPTRLD_DISABLED;
	}

	/* The eVMCS pointer changed: drop the old mapping and map the new one. */
	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
		vmx->nested.current_vmptr = INVALID_GPA;

		nested_release_evmcs(vcpu);

		if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
				 &vmx->nested.hv_evmcs_map))
			return EVMPTRLD_ERROR;

		vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;

		/*
		 * Currently, KVM only supports eVMCS version 1
		 * (== KVM_EVMCS_VERSION), and thus expects the guest to store
		 * that value in the first u32 field of the eVMCS, which
		 * should specify the eVMCS VersionNumber.
		 *
		 * The guest should learn which eVMCS versions the host
		 * supports by examining CPUID.0x4000000A.EAX[0:15]. The host
		 * userspace VMM is expected to set this CPUID leaf according
		 * to the value returned in vmcs_version from
		 * nested_enable_evmcs().
		 *
		 * However, it turns out that Microsoft Hyper-V fails to comply
		 * with its own interface: when Hyper-V uses eVMCS, it just
		 * sets the first u32 field of the eVMCS to the revision_id
		 * specified in MSR_IA32_VMX_BASIC, instead of an eVMCS version
		 * number that is one of the supported versions specified in
		 * CPUID.0x4000000A.EAX[0:15].
		 *
		 * To work around the Hyper-V bug, accept either a supported
		 * eVMCS version or the VMCS12 revision_id as a valid value for
		 * the first u32 field of the eVMCS.
		 */
		if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
		    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
			nested_release_evmcs(vcpu);
			return EVMPTRLD_VMFAIL;
		}

		vmx->nested.hv_evmcs_vmptr = evmcs_gpa;

		evmcs_gpa_changed = true;
		/*
		 * Unlike normal vmcs12, enlightened vmcs12 is not fully
		 * reloaded from guest's memory (read only fields, fields not
		 * present in struct hv_enlightened_vmcs, ...). Make sure there
		 * are no leftovers.
		 */
		if (from_launch) {
			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
			memset(vmcs12, 0, sizeof(*vmcs12));
			vmcs12->hdr.revision_id = VMCS12_REVISION;
		}

	}

	/*
	 * Clean fields data can't be used on VMLAUNCH and when we switch
	 * between different L2 guests as KVM keeps a single VMCS12 per L1.
	 */
	if (from_launch || evmcs_gpa_changed) {
		vmx->nested.hv_evmcs->hv_clean_fields &=
			~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

		vmx->nested.force_msr_bitmap_recalc = true;
	}

	return EVMPTRLD_SUCCEEDED;
#else
	return EVMPTRLD_DISABLED;
#endif
}

/*
 * Propagate the cached vmcs12 to whichever backing L1 uses: the enlightened
 * VMCS if nested_vmx_is_evmptr12_valid(), otherwise the shadow VMCS.  Clears
 * need_vmcs12_to_shadow_sync afterwards.
 */
void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (nested_vmx_is_evmptr12_valid(vmx))
		copy_vmcs12_to_enlightened(vmx);
	else
		copy_vmcs12_to_shadow(vmx);

	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

/*
 * hrtimer callback for the emulated VMX preemption timer: record the expiry,
 * request event processing (KVM_REQ_EVENT) and kick the vCPU.  The timer is
 * one-shot (HRTIMER_NORESTART).
 */
static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
{
	struct vcpu_vmx *vmx =
		container_of(timer, struct vcpu_vmx, nested.preemption_timer);

	vmx->nested.preemption_timer_expired = true;
	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
	kvm_vcpu_kick(&vmx->vcpu);

	return HRTIMER_NORESTART;
}

/*
 * Return the preemption timer value remaining until the deadline, in units
 * of the L1 TSC shifted right by VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE.
 * If no deadline has been recorded yet, one is computed from
 * vmcs12->vmx_preemption_timer_value and latched in vmx->nested.
 */
static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
			    VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;

	if (!vmx->nested.has_preemption_timer_deadline) {
		vmx->nested.preemption_timer_deadline =
			vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
		vmx->nested.has_preemption_timer_deadline = true;
	}
	/*
	 * NOTE(review): unsigned subtraction; if a previously latched deadline
	 * is already behind l1_scaled_tsc this wraps to a very large value --
	 * confirm callers cannot hit that case.
	 */
	return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
}

/*
 * Arm the emulated preemption timer.  @preemption_timeout is in preemption
 * timer units; it is scaled to TSC cycles, converted to nanoseconds using
 * vcpu->arch.virtual_tsc_khz and used as an absolute, CPU-pinned hrtimer
 * expiry.
 */
static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
					u64 preemption_timeout)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * A timer value of zero is architecturally guaranteed to cause
	 * a VMExit prior to executing any instructions in the guest.
	 */
	if (preemption_timeout == 0) {
		vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
		return;
	}

	/* No TSC frequency to convert with (and avoids dividing by zero). */
	if (vcpu->arch.virtual_tsc_khz == 0)
		return;

	/* timer units -> TSC cycles -> nanoseconds (cycles * 10^6 / kHz). */
	preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
	preemption_timeout *= 1000000;
	do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
	hrtimer_start(&vmx->nested.preemption_timer,
		      ktime_add_ns(ktime_get(), preemption_timeout),
		      HRTIMER_MODE_ABS_PINNED);
}

/*
 * Compute the EFER value L2 will run with:
 *  - vmcs12's GUEST_IA32_EFER if a nested run is pending and vmcs12 asks for
 *    EFER to be loaded on VM-Entry;
 *  - otherwise the current EFER with LMA/LME forced on or off according to
 *    vmcs12's "IA-32e mode guest" entry control.
 */
static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
	if (vmx->vcpu.arch.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
		return vmcs12->guest_ia32_efer;
	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
		return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
	else
		return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
}

static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
{
	struct kvm *kvm = vmx->vcpu.kvm;

	/*
	 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
	 * according to L0's settings (vmcs12 is irrelevant here).  Host
	 * fields that come from L0 and are not constant, e.g.
HOST_CR3,
	 * will be set as needed prior to VMLAUNCH/VMRESUME.
	 */
	if (vmx->nested.vmcs02_initialized)
		return;
	/* Everything below runs at most once until the flag is cleared. */
	vmx->nested.vmcs02_initialized = true;

	if (vmx->ve_info)
		vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));

	/* All VMFUNCs are currently emulated through L0 vmexits.  */
	if (cpu_has_vmx_vmfunc())
		vmcs_write64(VM_FUNCTION_CONTROL, 0);

	if (cpu_has_vmx_posted_intr())
		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);

	if (cpu_has_vmx_msr_bitmap())
		vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));

	/*
	 * PML is emulated for L2, but never enabled in hardware as the MMU
	 * handles A/D emulation.  Disabling PML for L2 also avoids having to
	 * deal with filtering out L2 GPAs from the buffer.
	 */
	if (enable_pml) {
		vmcs_write64(PML_ADDRESS, 0);
		vmcs_write16(GUEST_PML_INDEX, -1);
	}

	if (cpu_has_vmx_encls_vmexit())
		vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);

	if (kvm_notify_vmexit_enabled(kvm))
		vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);

	/*
	 * Set the MSR load/store lists to match L0's settings.  Only the
	 * addresses are constant (for vmcs02), the counts can change based
	 * on L2's behavior, e.g. switching to/from long mode.
	 */
	vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.val));
	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));

	vmx_set_constant_host_state(vmx);
}

/*
 * Early vmcs02 setup that is only needed when vmcs12 is dirty or an
 * enlightened VMCS is in use (see the caller): one-time constant state, an
 * invalid VMCS link pointer, and the VPID L2 will run with.
 */
static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
				      struct vmcs12 *vmcs12)
{
	prepare_vmcs02_constant_state(vmx);

	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);

	/*
	 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
	 * same VPID as the host.  Emulate this behavior by using vpid01 for L2
	 * if VPID is disabled in vmcs12.  Note, if VPID is disabled, VM-Enter
	 * and VM-Exit are architecturally required to flush VPID=0, but *only*
	 * VPID=0.  I.e. using vpid02 would be ok (so long as KVM emulates the
	 * required flushes), but doing so would cause KVM to over-flush.  E.g.
	 * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled,
	 * and then runs L2 X again, then KVM can and should retain TLB entries
	 * for VPID12=1.
	 */
	if (enable_vpid) {
		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
		else
			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
	}
}

/*
 * Compute and write vmcs02's pin-based, primary and secondary execution
 * controls, VM-Entry and VM-Exit controls, and the VM-Entry event injection
 * fields, by combining vmcs01's current controls (L0's wishes) with vmcs12's
 * (L1's wishes).
 */
static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
				 struct vmcs12 *vmcs12)
{
	u32 exec_control;
	u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);

	if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx))
		prepare_vmcs02_early_rare(vmx, vmcs12);

	/*
	 * PIN CONTROLS
	 */
	exec_control = __pin_controls_get(vmcs01);
	/* vmcs12's preemption timer bit is masked out of the merged value. */
	exec_control |= (vmcs12->pin_based_vm_exec_control &
			 ~PIN_BASED_VMX_PREEMPTION_TIMER);

	/* Posted interrupts setting is only taken from vmcs12.
*/
	vmx->nested.pi_pending = false;
	if (nested_cpu_has_posted_intr(vmcs12)) {
		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
	} else {
		vmx->nested.posted_intr_nv = -1;
		exec_control &= ~PIN_BASED_POSTED_INTR;
	}
	pin_controls_set(vmx, exec_control);

	/*
	 * EXEC CONTROLS
	 */
	exec_control = __exec_controls_get(vmcs01); /* L0's desires */
	exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
	exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
	exec_control &= ~CPU_BASED_TPR_SHADOW;
	exec_control |= vmcs12->cpu_based_vm_exec_control;

	/*
	 * TPR shadow was cleared above, so it is set here only if vmcs12 asks
	 * for it.  Without it, 64-bit builds intercept CR8 loads and stores.
	 */
	if (exec_control & CPU_BASED_TPR_SHADOW)
		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
#ifdef CONFIG_X86_64
	else
		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
				CPU_BASED_CR8_STORE_EXITING;
#endif

	/*
	 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
	 * for I/O port accesses.
	 */
	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;

	/*
	 * This bit will be computed in nested_get_vmcs12_pages, because
	 * we do not have access to L1's MSR bitmap yet.  For now, keep
	 * the same bit as before, hoping to avoid multiple VMWRITEs that
	 * only set/clear this bit.
	 */
	exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
	exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;

	exec_controls_set(vmx, exec_control);

	/*
	 * SECONDARY EXEC CONTROLS
	 */
	if (cpu_has_secondary_exec_ctrls()) {
		exec_control = __secondary_exec_controls_get(vmcs01);

		/* Take the following fields only from vmcs12 */
		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
				  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
				  SECONDARY_EXEC_ENABLE_INVPCID |
				  SECONDARY_EXEC_ENABLE_RDTSCP |
				  SECONDARY_EXEC_ENABLE_XSAVES |
				  SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
				  SECONDARY_EXEC_ENABLE_VMFUNC |
				  SECONDARY_EXEC_MODE_BASED_EPT_EXEC |
				  SECONDARY_EXEC_DESC);

		if (nested_cpu_has(vmcs12,
				   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
			exec_control |= vmcs12->secondary_vm_exec_control;

		/* PML is emulated and never enabled in hardware for L2. */
		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

		/* VMCS shadowing for L2 is emulated for now */
		exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;

		/*
		 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
		 * will not have to rewrite the controls just for this bit.
		 */
		if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP))
			exec_control |= SECONDARY_EXEC_DESC;

		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
			vmcs_write16(GUEST_INTR_STATUS,
				vmcs12->guest_intr_status);

		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
		    exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;

		if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
			vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);

		secondary_exec_controls_set(vmx, exec_control);
	}

	/*
	 * ENTRY CONTROLS
	 *
	 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
	 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
	 * on the related bits (if supported by the CPU) in the hope that
	 * we can avoid VMWrites during vmx_set_efer().
	 *
	 * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is
	 * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to
	 * do the same for L2.
	 */
	exec_control = __vm_entry_controls_get(vmcs01);
	exec_control |= (vmcs12->vm_entry_controls &
			 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
	exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
	if (cpu_has_load_ia32_efer()) {
		if (guest_efer & EFER_LMA)
			exec_control |= VM_ENTRY_IA32E_MODE;
		if (guest_efer != kvm_host.efer)
			exec_control |= VM_ENTRY_LOAD_IA32_EFER;
	}
	vm_entry_controls_set(vmx, exec_control);

	/*
	 * EXIT CONTROLS
	 *
	 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
	 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
	 */
	exec_control = __vm_exit_controls_get(vmcs01);
	if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer)
		exec_control |= VM_EXIT_LOAD_IA32_EFER;
	else
		exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
	vm_exit_controls_set(vmx, exec_control);

	/*
	 * Interrupt/Exception Fields
	 *
	 * vmcs12's event injection and interruptibility state are only
	 * written when a nested run is pending; otherwise any injection is
	 * cleared.
	 */
	if (vmx->vcpu.arch.nested_run_pending) {
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
			     vmcs12->vm_entry_intr_info_field);
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
			     vmcs12->vm_entry_exception_error_code);
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmcs12->vm_entry_instruction_len);
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
			     vmcs12->guest_interruptibility_info);
		vmx->loaded_vmcs->nmi_known_unmasked =
			!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
	} else {
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
	}
}

/*
 * Read the guest CET state from the current VMCS.  @s_cet is written only if
 * the guest has IBT or SHSTK; @ssp and @ssp_tbl only if it has SHSTK.
 * Outputs for features the guest lacks are left untouched.
 */
static void vmcs_read_cet_state(struct kvm_vcpu *vcpu, u64 *s_cet,
				u64 *ssp, u64 *ssp_tbl)
{
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) ||
	    guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
		*s_cet = vmcs_readl(GUEST_S_CET);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) {
		*ssp = vmcs_readl(GUEST_SSP);
		*ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE);
	}
}

/*
 * Write the guest CET state to the current VMCS, using the same feature
 * gating as vmcs_read_cet_state(): GUEST_S_CET needs IBT or SHSTK, while
 * GUEST_SSP and GUEST_INTR_SSP_TABLE need SHSTK.
 */
static void vmcs_write_cet_state(struct kvm_vcpu *vcpu, u64 s_cet,
				 u64 ssp, u64 ssp_tbl)
{
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) ||
	    guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
		vmcs_writel(GUEST_S_CET, s_cet);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) {
		vmcs_writel(GUEST_SSP, ssp);
		vmcs_writel(GUEST_INTR_SSP_TABLE, ssp_tbl);
	}
}

/*
 * Write the vmcs02 fields handled here from vmcs12.  The two guest-state
 * groups are skipped when an enlightened VMCS is in use and its clean-field
 * bits mark the corresponding group as clean.
 */
static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
	struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx);

	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {

		/* Guest segment selectors, limits, access rights and bases. */
		vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
		vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
		vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
		vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
		vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
		vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
		vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
		vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
		vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
		vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
		vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
		vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
		vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
		vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
		vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
		vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
		vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
		vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
		vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
		vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
		vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
		vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
		vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
		vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
		vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
		vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
		vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
		vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
		vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
		vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
		vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
		vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
		vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
		vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
		vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
		vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);

		vmx_segment_cache_clear(vmx);
	}

	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
		vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
			    vmcs12->guest_pending_dbg_exceptions);
		vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
		vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);

		/*
		 * L1 may access the L2's PDPTR, so save them to construct
		 * vmcs12
		 */
		if (enable_ept) {
			vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
			vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
			vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
			vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
		}

		if (kvm_mpx_supported() && vmx->vcpu.arch.nested_run_pending &&
		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
	}

	if (nested_cpu_has_xsaves(vmcs12))
		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);

	/*
	 * Whether page-faults are trapped is determined by a combination of
	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.  If L0
	 * doesn't care about page faults then we should set all of these to
	 * L1's desires. However, if L0 does care about (some) page faults, it
	 * is not easy (if at all possible?) to merge L0 and L1's desires, we
	 * simply ask to exit on each and every L2 page fault. This is done by
	 * setting MASK=MATCH=0 and (see below) EB.PF=1.
	 * Note that below we don't need special code to set EB.PF beyond the
	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
	 */
	if (vmx_need_pf_intercept(&vmx->vcpu)) {
		/*
		 * TODO: if both L0 and L1 need the same MASK and MATCH,
		 * go ahead and use it?
		 */
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
	} else {
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
	}

	if (cpu_has_vmx_apicv()) {
		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
		vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
	}

	/*
	 * If vmcs12 is configured to save TSC on exit via the auto-store list,
	 * append the MSR to vmcs02's auto-store list so that KVM effectively
	 * reads TSC at the time of VM-Exit from L2.  The saved value will be
	 * propagated to vmcs12's list on nested VM-Exit.
	 *
	 * Don't increment the number of MSRs in the vCPU structure, as saving
	 * TSC is specific to this particular incarnation of vmcs02, i.e. must
	 * not bleed into vmcs01.
	 */
	if (nested_msr_store_list_has_msr(&vmx->vcpu, MSR_IA32_TSC) &&
	    !WARN_ON_ONCE(vmx->msr_autostore.nr >= ARRAY_SIZE(vmx->msr_autostore.val))) {
		/* The TSC entry occupies the slot just past the tracked entries. */
		vmx->nested.tsc_autostore_slot = vmx->msr_autostore.nr;
		vmx->msr_autostore.val[vmx->msr_autostore.nr].index = MSR_IA32_TSC;

		vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr + 1);
	} else {
		vmx->nested.tsc_autostore_slot = -1;
		vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr);
	}
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);

	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)
		vmcs_write_cet_state(&vmx->vcpu, vmcs12->guest_s_cet,
				     vmcs12->guest_ssp, vmcs12->guest_ssp_tbl);

	set_cr4_guest_host_mask(vmx);
}

/*
 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
 * guest in a way that will both be appropriate to L1's requests, and our
 * needs. In addition to modifying the active vmcs (which is vmcs02), this
 * function also has additional necessary side-effects, like setting various
 * vcpu->arch fields.
 * Returns 0 on success, -EINVAL on failure. Invalid state exit qualification
 * code is assigned to entry_failure_code on failure.
*/
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			  bool from_vmentry,
			  enum vm_entry_failure_code *entry_failure_code)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
	bool load_guest_pdptrs_vmcs12 = false;

	if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) {
		prepare_vmcs02_rare(vmx, vmcs12);
		vmx->nested.dirty_vmcs12 = false;

		/*
		 * With an enlightened VMCS, the PDPTRs only need reloading if
		 * GUEST_GRP1 is not marked clean.
		 */
		load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) ||
			!(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
	}

	/*
	 * DR7/DEBUGCTL, CET state and BNDCFGS are taken from vmcs12 only when
	 * a nested run is pending and the matching VM-Entry "load" control is
	 * set; otherwise the values in effect before the VM-Enter are used.
	 */
	if (vcpu->arch.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
		vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
					       vmx_get_supported_debugctl(vcpu, false));
	} else {
		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
		vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
	}

	if (!vcpu->arch.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))
		vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet,
				     vmx->nested.pre_vmenter_ssp,
				     vmx->nested.pre_vmenter_ssp_tbl);

	if (kvm_mpx_supported() && (!vcpu->arch.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
		vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
	vmx_set_rflags(vcpu, vmcs12->guest_rflags);

	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
	 * bitwise-or of what L1 wants to trap for L2, and what we want to
	 * trap. Note that CR0.TS also needs updating - we do this later.
	 */
	vmx_update_exception_bitmap(vcpu);
	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

	if (vcpu->arch.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
		vcpu->arch.pat = vmcs12->guest_ia32_pat;
	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
		vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat);
	}

	/* Combine L1's TSC offset and scaling with what vmcs12 requests for L2. */
	vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
			vcpu->arch.l1_tsc_offset,
			vmx_get_l2_tsc_offset(vcpu),
			vmx_get_l2_tsc_multiplier(vcpu));

	vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
			vcpu->arch.l1_tsc_scaling_ratio,
			vmx_get_l2_tsc_multiplier(vcpu));

	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
	if (kvm_caps.has_tsc_control)
		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);

	nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);

	if (nested_cpu_has_ept(vmcs12))
		nested_ept_init_mmu_context(vcpu);

	/*
	 * Override the CR0/CR4 read shadows after setting the effective guest
	 * CR0/CR4.  The common helpers also set the shadows, but they don't
	 * account for vmcs12's cr0/4_guest_host_mask.
	 */
	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));

	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));

	vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
	/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
	vmx_set_efer(vcpu, vcpu->arch.efer);

	/*
	 * Guest state is invalid and unrestricted guest is disabled,
	 * which means L1 attempted VMEntry to L2 with invalid state.
	 * Fail the VMEntry.
	 *
	 * However, when force loading the guest state (SMM exit or
	 * loading nested state after migration), it is possible to
	 * have invalid guest state now, which will be fixed later by
	 * restoring L2 register state.
	 */
	if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/* Shadow page tables on either EPT or shadow page tables. */
	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
				from_vmentry, entry_failure_code))
		return -EINVAL;

	/*
	 * Immediately write vmcs02.GUEST_CR3.  It will be propagated to vmcs12
	 * on nested VM-Exit, which can occur without actually running L2 and
	 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
	 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
	 * transition to HLT instead of running L2.
	 */
	if (enable_ept)
		vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);

	/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
	if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
	    is_pae_paging(vcpu)) {
		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
	}

	/*
	 * The MSR write is attempted only when vmcs12 requests it and the vPMU
	 * has PERF_GLOBAL_CTRL; a failure of the write both WARNs and fails
	 * the VM-Entry.
	 */
	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
	    WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
						 vmcs12->guest_ia32_perf_global_ctrl))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	kvm_rsp_write(vcpu, vmcs12->guest_rsp);
	kvm_rip_write(vcpu, vmcs12->guest_rip);

	/*
	 * It was observed that genuine Hyper-V running in L1 doesn't reset
	 * 'hv_clean_fields' by itself, it only sets the corresponding dirty
	 * bits when it changes a field in eVMCS. Mark all fields as clean
	 * here.
	 */
	if (nested_vmx_is_evmptr12_valid(vmx))
		evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

	return 0;
}

/*
 * Consistency checks on vmcs12's NMI controls.  Returns -EINVAL if virtual
 * NMIs are enabled without NMI exiting, or if NMI-window exiting is enabled
 * without virtual NMIs; returns 0 otherwise.
 */
static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
{
	if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
	       nested_cpu_has_virtual_nmis(vmcs12)))
		return -EINVAL;

	if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
	       nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
		return -EINVAL;

	return 0;
}

static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Check for memory type validity */
	switch (new_eptp & VMX_EPTP_MT_MASK) {
	case VMX_EPTP_MT_UC:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
			return false;
		break;
	case VMX_EPTP_MT_WB:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
			return false;
		break;
	default:
		return false;
} 2896 2897 /* Page-walk levels validity. */ 2898 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2899 case VMX_EPTP_PWL_5: 2900 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2901 return false; 2902 break; 2903 case VMX_EPTP_PWL_4: 2904 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2905 return false; 2906 break; 2907 default: 2908 return false; 2909 } 2910 2911 /* Reserved bits should not be set */ 2912 if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2913 return false; 2914 2915 /* AD, if set, should be supported */ 2916 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2917 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2918 return false; 2919 } 2920 2921 return true; 2922 } 2923 2924 /* 2925 * Checks related to VM-Execution Control Fields 2926 */ 2927 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2928 struct vmcs12 *vmcs12) 2929 { 2930 struct vcpu_vmx *vmx = to_vmx(vcpu); 2931 2932 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2933 vmx->nested.msrs.pinbased_ctls_low, 2934 vmx->nested.msrs.pinbased_ctls_high)) || 2935 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2936 vmx->nested.msrs.procbased_ctls_low, 2937 vmx->nested.msrs.procbased_ctls_high))) 2938 return -EINVAL; 2939 2940 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2941 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2942 vmx->nested.msrs.secondary_ctls_low, 2943 vmx->nested.msrs.secondary_ctls_high))) 2944 return -EINVAL; 2945 2946 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2947 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2948 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2949 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2950 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2951 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2952 nested_vmx_check_nmi_controls(vmcs12) || 2953 
nested_vmx_check_pml_controls(vcpu, vmcs12) || 2954 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2955 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2956 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2957 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2958 return -EINVAL; 2959 2960 if (!nested_cpu_has_preemption_timer(vmcs12) && 2961 nested_cpu_has_save_preemption_timer(vmcs12)) 2962 return -EINVAL; 2963 2964 if (nested_cpu_has_ept(vmcs12) && 2965 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2966 return -EINVAL; 2967 2968 if (nested_cpu_has_vmfunc(vmcs12)) { 2969 if (CC(vmcs12->vm_function_control & 2970 ~vmx->nested.msrs.vmfunc_controls)) 2971 return -EINVAL; 2972 2973 if (nested_cpu_has_eptp_switching(vmcs12)) { 2974 if (CC(!nested_cpu_has_ept(vmcs12)) || 2975 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2976 return -EINVAL; 2977 } 2978 } 2979 2980 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING) && 2981 CC(!vmcs12->tsc_multiplier)) 2982 return -EINVAL; 2983 2984 return 0; 2985 } 2986 2987 /* 2988 * Checks related to VM-Exit Control Fields 2989 */ 2990 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2991 struct vmcs12 *vmcs12) 2992 { 2993 struct vcpu_vmx *vmx = to_vmx(vcpu); 2994 2995 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2996 vmx->nested.msrs.exit_ctls_low, 2997 vmx->nested.msrs.exit_ctls_high)) || 2998 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2999 return -EINVAL; 3000 3001 return 0; 3002 } 3003 3004 /* 3005 * Checks related to VM-Entry Control Fields 3006 */ 3007 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 3008 struct vmcs12 *vmcs12) 3009 { 3010 struct vcpu_vmx *vmx = to_vmx(vcpu); 3011 3012 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 3013 vmx->nested.msrs.entry_ctls_low, 3014 vmx->nested.msrs.entry_ctls_high))) 3015 return -EINVAL; 3016 3017 /* 3018 * From the Intel SDM, volume 3: 3019 * 
Fields relevant to VM-entry event injection must be set properly. 3020 * These fields are the VM-entry interruption-information field, the 3021 * VM-entry exception error code, and the VM-entry instruction length. 3022 */ 3023 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 3024 u32 intr_info = vmcs12->vm_entry_intr_info_field; 3025 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 3026 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 3027 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 3028 bool urg = nested_cpu_has2(vmcs12, 3029 SECONDARY_EXEC_UNRESTRICTED_GUEST); 3030 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 3031 3032 /* VM-entry interruption-info field: interruption type */ 3033 if (CC(intr_type == INTR_TYPE_RESERVED) || 3034 CC(intr_type == INTR_TYPE_OTHER_EVENT && 3035 !nested_cpu_supports_monitor_trap_flag(vcpu))) 3036 return -EINVAL; 3037 3038 /* VM-entry interruption-info field: vector */ 3039 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 3040 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 3041 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 3042 return -EINVAL; 3043 3044 /* 3045 * Cannot deliver error code in real mode or if the interrupt 3046 * type is not hardware exception. For other cases, do the 3047 * consistency check only if the vCPU doesn't enumerate 3048 * VMX_BASIC_NO_HW_ERROR_CODE_CC. 
3049 */ 3050 if (!prot_mode || intr_type != INTR_TYPE_HARD_EXCEPTION) { 3051 if (CC(has_error_code)) 3052 return -EINVAL; 3053 } else if (!nested_cpu_has_no_hw_errcode_cc(vcpu)) { 3054 if (CC(has_error_code != x86_exception_has_error_code(vector))) 3055 return -EINVAL; 3056 } 3057 3058 /* VM-entry exception error code */ 3059 if (CC(has_error_code && 3060 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 3061 return -EINVAL; 3062 3063 /* VM-entry interruption-info field: reserved bits */ 3064 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 3065 return -EINVAL; 3066 3067 /* VM-entry instruction length */ 3068 switch (intr_type) { 3069 case INTR_TYPE_SOFT_EXCEPTION: 3070 case INTR_TYPE_SOFT_INTR: 3071 case INTR_TYPE_PRIV_SW_EXCEPTION: 3072 if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) || 3073 CC(vmcs12->vm_entry_instruction_len == 0 && 3074 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 3075 return -EINVAL; 3076 } 3077 } 3078 3079 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 3080 return -EINVAL; 3081 3082 return 0; 3083 } 3084 3085 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 3086 struct vmcs12 *vmcs12) 3087 { 3088 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 3089 nested_check_vm_exit_controls(vcpu, vmcs12) || 3090 nested_check_vm_entry_controls(vcpu, vmcs12)) 3091 return -EINVAL; 3092 3093 #ifdef CONFIG_KVM_HYPERV 3094 if (guest_cpu_cap_has_evmcs(vcpu)) 3095 return nested_evmcs_check_controls(vmcs12); 3096 #endif 3097 3098 return 0; 3099 } 3100 3101 static int nested_vmx_check_controls_late(struct kvm_vcpu *vcpu, 3102 struct vmcs12 *vmcs12) 3103 { 3104 void *vapic = to_vmx(vcpu)->nested.virtual_apic_map.hva; 3105 u32 vtpr = vapic ? (*(u32 *)(vapic + APIC_TASKPRI)) >> 4 : 0; 3106 3107 /* 3108 * Don't bother with the consistency checks if KVM isn't configured to 3109 * WARN on missed consistency checks, as KVM needs to rely on hardware 3110 * to fully detect an illegal vTPR vs. 
TRP Threshold combination due to 3111 * the vTPR being writable by L1 at all times (it's an in-memory value, 3112 * not a VMCS field). I.e. even if the check passes now, it might fail 3113 * at the actual VM-Enter. 3114 * 3115 * Keying off the module param also allows treating an invalid vAPIC 3116 * mapping as a consistency check failure without increasing the risk 3117 * of breaking a "real" VM. 3118 */ 3119 if (!warn_on_missed_cc) 3120 return 0; 3121 3122 if ((exec_controls_get(to_vmx(vcpu)) & CPU_BASED_TPR_SHADOW) && 3123 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW) && 3124 !nested_cpu_has_vid(vmcs12) && 3125 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && 3126 (CC(!vapic) || 3127 CC((vmcs12->tpr_threshold & GENMASK(3, 0)) > (vtpr & GENMASK(3, 0))))) 3128 return -EINVAL; 3129 3130 return 0; 3131 } 3132 3133 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, 3134 struct vmcs12 *vmcs12) 3135 { 3136 #ifdef CONFIG_X86_64 3137 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != 3138 !!(vcpu->arch.efer & EFER_LMA))) 3139 return -EINVAL; 3140 #endif 3141 return 0; 3142 } 3143 3144 static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12) 3145 { 3146 /* 3147 * Check that the given linear address is canonical after a VM exit 3148 * from L2, based on HOST_CR4.LA57 value that will be loaded for L1. 3149 */ 3150 u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 
57 : 48; 3151 3152 return !__is_canonical_address(la, l1_address_bits_on_exit); 3153 } 3154 3155 static int nested_vmx_check_cet_state_common(struct kvm_vcpu *vcpu, u64 s_cet, 3156 u64 ssp, u64 ssp_tbl) 3157 { 3158 if (CC(!kvm_is_valid_u_s_cet(vcpu, s_cet)) || CC(!IS_ALIGNED(ssp, 4)) || 3159 CC(is_noncanonical_msr_address(ssp_tbl, vcpu))) 3160 return -EINVAL; 3161 3162 return 0; 3163 } 3164 3165 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 3166 struct vmcs12 *vmcs12) 3167 { 3168 bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); 3169 3170 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 3171 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 3172 CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) 3173 return -EINVAL; 3174 3175 if (CC(vmcs12->host_cr4 & X86_CR4_CET && !(vmcs12->host_cr0 & X86_CR0_WP))) 3176 return -EINVAL; 3177 3178 if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 3179 CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 3180 return -EINVAL; 3181 3182 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 3183 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 3184 return -EINVAL; 3185 3186 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 3187 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3188 vmcs12->host_ia32_perf_global_ctrl))) 3189 return -EINVAL; 3190 3191 if (ia32e) { 3192 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 3193 return -EINVAL; 3194 } else { 3195 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 3196 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 3197 CC((vmcs12->host_rip) >> 32)) 3198 return -EINVAL; 3199 } 3200 3201 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3202 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3203 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3204 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 
3205 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3206 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3207 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3208 CC(vmcs12->host_cs_selector == 0) || 3209 CC(vmcs12->host_tr_selector == 0) || 3210 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 3211 return -EINVAL; 3212 3213 if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) || 3214 CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) || 3215 CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) || 3216 CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) || 3217 CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) || 3218 CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12))) 3219 return -EINVAL; 3220 3221 /* 3222 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 3223 * IA32_EFER MSR must be 0 in the field for that register. In addition, 3224 * the values of the LMA and LME bits in the field must each be that of 3225 * the host address-space size VM-exit control. 3226 */ 3227 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 3228 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 3229 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 3230 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 3231 return -EINVAL; 3232 } 3233 3234 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) { 3235 if (nested_vmx_check_cet_state_common(vcpu, vmcs12->host_s_cet, 3236 vmcs12->host_ssp, 3237 vmcs12->host_ssp_tbl)) 3238 return -EINVAL; 3239 3240 /* 3241 * IA32_S_CET and SSP must be canonical if the host will 3242 * enter 64-bit mode after VM-exit; otherwise, higher 3243 * 32-bits must be all 0s. 
3244 */ 3245 if (ia32e) { 3246 if (CC(is_noncanonical_msr_address(vmcs12->host_s_cet, vcpu)) || 3247 CC(is_noncanonical_msr_address(vmcs12->host_ssp, vcpu))) 3248 return -EINVAL; 3249 } else { 3250 if (CC(vmcs12->host_s_cet >> 32) || CC(vmcs12->host_ssp >> 32)) 3251 return -EINVAL; 3252 } 3253 } 3254 3255 return 0; 3256 } 3257 3258 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 3259 struct vmcs12 *vmcs12) 3260 { 3261 struct vcpu_vmx *vmx = to_vmx(vcpu); 3262 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 3263 struct vmcs_hdr hdr; 3264 3265 if (vmcs12->vmcs_link_pointer == INVALID_GPA) 3266 return 0; 3267 3268 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 3269 return -EINVAL; 3270 3271 if (ghc->gpa != vmcs12->vmcs_link_pointer && 3272 CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 3273 vmcs12->vmcs_link_pointer, VMCS12_SIZE))) 3274 return -EINVAL; 3275 3276 if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 3277 offsetof(struct vmcs12, hdr), 3278 sizeof(hdr)))) 3279 return -EINVAL; 3280 3281 if (CC(hdr.revision_id != VMCS12_REVISION) || 3282 CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 3283 return -EINVAL; 3284 3285 return 0; 3286 } 3287 3288 /* 3289 * Checks related to Guest Non-register State 3290 */ 3291 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 3292 { 3293 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 3294 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 3295 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 3296 return -EINVAL; 3297 3298 return 0; 3299 } 3300 3301 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 3302 struct vmcs12 *vmcs12, 3303 enum vm_entry_failure_code *entry_failure_code) 3304 { 3305 bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); 3306 3307 *entry_failure_code = ENTRY_FAIL_DEFAULT; 3308 3309 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 3310 
CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 3311 return -EINVAL; 3312 3313 if (CC(vmcs12->guest_cr4 & X86_CR4_CET && !(vmcs12->guest_cr0 & X86_CR0_WP))) 3314 return -EINVAL; 3315 3316 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 3317 u64 debugctl = vmcs12->guest_ia32_debugctl; 3318 3319 /* 3320 * FREEZE_IN_SMM is not virtualized, but allow L1 to set it in 3321 * vmcs12's DEBUGCTL under a quirk for backwards compatibility. 3322 * Note that the quirk only relaxes the consistency check. The 3323 * vmcc02 bit is still under the control of the host. In 3324 * particular, if a host administrator decides to clear the bit, 3325 * then L1 has no say in the matter. 3326 */ 3327 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM)) 3328 debugctl &= ~DEBUGCTLMSR_FREEZE_IN_SMM; 3329 3330 if (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) || 3331 CC(!vmx_is_valid_debugctl(vcpu, debugctl, false))) 3332 return -EINVAL; 3333 } 3334 3335 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 3336 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 3337 return -EINVAL; 3338 3339 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 3340 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 3341 return -EINVAL; 3342 } 3343 3344 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 3345 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3346 vmcs12->guest_ia32_perf_global_ctrl))) 3347 return -EINVAL; 3348 3349 if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) 3350 return -EINVAL; 3351 3352 if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || 3353 CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) 3354 return -EINVAL; 3355 3356 /* 3357 * If the load IA32_EFER VM-entry control is 1, the following checks 3358 * are performed on the field for the IA32_EFER MSR: 3359 * - Bits reserved in the IA32_EFER MSR must be 0. 
3360 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 3361 * the IA-32e mode guest VM-exit control. It must also be identical 3362 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 3363 * CR0.PG) is 1. 3364 */ 3365 if (vcpu->arch.nested_run_pending && 3366 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 3367 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 3368 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 3369 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 3370 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 3371 return -EINVAL; 3372 } 3373 3374 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 3375 (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3376 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3377 return -EINVAL; 3378 3379 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) { 3380 if (nested_vmx_check_cet_state_common(vcpu, vmcs12->guest_s_cet, 3381 vmcs12->guest_ssp, 3382 vmcs12->guest_ssp_tbl)) 3383 return -EINVAL; 3384 3385 /* 3386 * Guest SSP must have 63:N bits identical, rather than 3387 * be canonical (i.e., 63:N-1 bits identical), where N is 3388 * the CPU's maximum linear-address width. Similar to 3389 * is_noncanonical_msr_address(), use the host's 3390 * linear-address width. 3391 */ 3392 if (CC(!__is_canonical_address(vmcs12->guest_ssp, max_host_virt_addr_bits() + 1))) 3393 return -EINVAL; 3394 } 3395 3396 if (nested_check_guest_non_reg_state(vmcs12)) 3397 return -EINVAL; 3398 3399 return 0; 3400 } 3401 3402 #ifdef CONFIG_KVM_HYPERV 3403 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3404 { 3405 struct vcpu_vmx *vmx = to_vmx(vcpu); 3406 3407 /* 3408 * hv_evmcs may end up being not mapped after migration (when 3409 * L2 was running), map it here to make sure vmcs12 changes are 3410 * properly reflected. 
3411 */ 3412 if (guest_cpu_cap_has_evmcs(vcpu) && 3413 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { 3414 enum nested_evmptrld_status evmptrld_status = 3415 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3416 3417 if (evmptrld_status == EVMPTRLD_VMFAIL || 3418 evmptrld_status == EVMPTRLD_ERROR) 3419 return false; 3420 3421 /* 3422 * Post migration VMCS12 always provides the most actual 3423 * information, copy it to eVMCS upon entry. 3424 */ 3425 vmx->nested.need_vmcs12_to_shadow_sync = true; 3426 } 3427 3428 return true; 3429 } 3430 #endif 3431 3432 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3433 { 3434 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3435 struct vcpu_vmx *vmx = to_vmx(vcpu); 3436 struct kvm_host_map *map; 3437 3438 if (!vcpu->arch.pdptrs_from_userspace && 3439 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3440 /* 3441 * Reload the guest's PDPTRs since after a migration 3442 * the guest CR3 might be restored prior to setting the nested 3443 * state which can lead to a load of wrong PDPTRs. 
3444 */ 3445 if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) 3446 return false; 3447 } 3448 3449 3450 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3451 map = &vmx->nested.apic_access_page_map; 3452 3453 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { 3454 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); 3455 } else { 3456 pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", 3457 __func__); 3458 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3459 vcpu->run->internal.suberror = 3460 KVM_INTERNAL_ERROR_EMULATION; 3461 vcpu->run->internal.ndata = 0; 3462 return false; 3463 } 3464 } 3465 3466 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3467 map = &vmx->nested.virtual_apic_map; 3468 3469 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3470 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3471 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3472 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3473 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3474 /* 3475 * The processor will never use the TPR shadow, simply 3476 * clear the bit from the execution control. Such a 3477 * configuration is useless, but it happens in tests. 3478 * For any other configuration, failing the vm entry is 3479 * _not_ what the processor does but it's basically the 3480 * only possibility we have. 3481 */ 3482 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3483 } else { 3484 /* 3485 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3486 * force VM-Entry to fail. 
3487 */ 3488 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); 3489 } 3490 } 3491 3492 if (nested_cpu_has_posted_intr(vmcs12)) { 3493 map = &vmx->nested.pi_desc_map; 3494 3495 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3496 vmx->nested.pi_desc = 3497 (struct pi_desc *)(((void *)map->hva) + 3498 offset_in_page(vmcs12->posted_intr_desc_addr)); 3499 vmcs_write64(POSTED_INTR_DESC_ADDR, 3500 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3501 } else { 3502 /* 3503 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3504 * access the contents of the VMCS12 posted interrupt 3505 * descriptor. (Note that KVM may do this when it 3506 * should not, per the architectural specification.) 3507 */ 3508 vmx->nested.pi_desc = NULL; 3509 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3510 } 3511 } 3512 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3513 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3514 else 3515 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3516 3517 return true; 3518 } 3519 3520 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3521 { 3522 #ifdef CONFIG_KVM_HYPERV 3523 /* 3524 * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy 3525 * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory 3526 * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post 3527 * migration. 
3528 */ 3529 if (!nested_get_evmcs_page(vcpu)) { 3530 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3531 __func__); 3532 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3533 vcpu->run->internal.suberror = 3534 KVM_INTERNAL_ERROR_EMULATION; 3535 vcpu->run->internal.ndata = 0; 3536 3537 return false; 3538 } 3539 #endif 3540 3541 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3542 return false; 3543 3544 return true; 3545 } 3546 3547 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3548 { 3549 struct vmcs12 *vmcs12; 3550 struct vcpu_vmx *vmx = to_vmx(vcpu); 3551 gpa_t dst; 3552 3553 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3554 return 0; 3555 3556 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3557 return 1; 3558 3559 /* 3560 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3561 * set is already checked as part of A/D emulation. 3562 */ 3563 vmcs12 = get_vmcs12(vcpu); 3564 if (!nested_cpu_has_pml(vmcs12)) 3565 return 0; 3566 3567 if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) { 3568 vmx->nested.pml_full = true; 3569 return 1; 3570 } 3571 3572 gpa &= ~0xFFFull; 3573 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3574 3575 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3576 offset_in_page(dst), sizeof(gpa))) 3577 return 0; 3578 3579 vmcs12->guest_pml_index--; 3580 3581 return 0; 3582 } 3583 3584 /* 3585 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3586 * for running VMX instructions (except VMXON, whose prerequisites are 3587 * slightly different). It also specifies what exception to inject otherwise. 3588 * Note that many of these exceptions have priority over VM exits, so they 3589 * don't have to be checked again here. 
3590 */ 3591 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3592 { 3593 if (!to_vmx(vcpu)->nested.vmxon) { 3594 kvm_queue_exception(vcpu, UD_VECTOR); 3595 return 0; 3596 } 3597 3598 if (vmx_get_cpl(vcpu)) { 3599 kvm_inject_gp(vcpu, 0); 3600 return 0; 3601 } 3602 3603 return 1; 3604 } 3605 3606 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3607 struct vmcs12 *vmcs12); 3608 3609 /* 3610 * If from_vmentry is false, this is being called from state restore (either RSM 3611 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 3612 * 3613 * Returns: 3614 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3615 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3616 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3617 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3618 */ 3619 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3620 bool from_vmentry) 3621 { 3622 struct vcpu_vmx *vmx = to_vmx(vcpu); 3623 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3624 enum vm_entry_failure_code entry_failure_code; 3625 union vmx_exit_reason exit_reason = { 3626 .basic = EXIT_REASON_INVALID_STATE, 3627 .failed_vmentry = 1, 3628 }; 3629 u32 failed_index; 3630 3631 trace_kvm_nested_vmenter(kvm_rip_read(vcpu), 3632 vmx->nested.current_vmptr, 3633 vmcs12->guest_rip, 3634 vmcs12->guest_intr_status, 3635 vmcs12->vm_entry_intr_info_field, 3636 vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, 3637 vmcs12->ept_pointer, 3638 vmcs12->guest_cr3, 3639 KVM_ISA_VMX); 3640 3641 kvm_service_local_tlb_flush_requests(vcpu); 3642 3643 if (!vcpu->arch.nested_run_pending || 3644 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3645 vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read(); 3646 if (kvm_mpx_supported() && 3647 (!vcpu->arch.nested_run_pending || 3648 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 3649 vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3650 3651 if 
(!vcpu->arch.nested_run_pending || 3652 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) 3653 vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet, 3654 &vmx->nested.pre_vmenter_ssp, 3655 &vmx->nested.pre_vmenter_ssp_tbl); 3656 3657 /* 3658 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled. In the 3659 * event of a "late" VM-Fail, i.e. a VM-Fail detected by hardware but 3660 * not KVM, KVM must unwind its software model to the pre-VM-Entry host 3661 * state. When EPT is disabled, GUEST_CR3 holds KVM's shadow CR3, not 3662 * L1's "real" CR3, which causes nested_vmx_restore_host_state() to 3663 * corrupt vcpu->arch.cr3. Stuffing vmcs01.GUEST_CR3 results in the 3664 * unwind naturally setting arch.cr3 to the correct value. Smashing 3665 * vmcs01.GUEST_CR3 is safe because nested VM-Exits, and the unwind, 3666 * reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is guaranteed to be 3667 * overwritten with a shadow CR3 prior to re-entering L1. 3668 */ 3669 if (!enable_ept) 3670 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3671 3672 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3673 3674 prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); 3675 3676 if (from_vmentry) { 3677 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3678 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3679 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3680 } 3681 3682 if (nested_vmx_check_controls_late(vcpu, vmcs12)) { 3683 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3684 return NVMX_VMENTRY_VMFAIL; 3685 } 3686 3687 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3688 &entry_failure_code)) { 3689 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3690 vmcs12->exit_qualification = entry_failure_code; 3691 goto vmentry_fail_vmexit; 3692 } 3693 } 3694 3695 enter_guest_mode(vcpu); 3696 3697 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { 3698 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3699 vmcs12->exit_qualification = entry_failure_code; 3700 goto vmentry_fail_vmexit_guest_mode; 3701 } 3702 3703 if 
(from_vmentry) { 3704 failed_index = nested_vmx_load_msr(vcpu, 3705 vmcs12->vm_entry_msr_load_addr, 3706 vmcs12->vm_entry_msr_load_count); 3707 if (failed_index) { 3708 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3709 vmcs12->exit_qualification = failed_index; 3710 goto vmentry_fail_vmexit_guest_mode; 3711 } 3712 } else { 3713 /* 3714 * The MMU is not initialized to point at the right entities yet and 3715 * "get pages" would need to read data from the guest (i.e. we will 3716 * need to perform gpa to hpa translation). Request a call 3717 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3718 * have already been set at vmentry time and should not be reset. 3719 */ 3720 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3721 } 3722 3723 /* 3724 * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI 3725 * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can 3726 * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit 3727 * unconditionally. Take care to pull data from vmcs01 as appropriate, 3728 * e.g. when checking for interrupt windows, as vmcs02 is now loaded. 3729 */ 3730 if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING | 3731 CPU_BASED_NMI_WINDOW_EXITING)) || 3732 kvm_apic_has_pending_init_or_sipi(vcpu) || 3733 kvm_apic_has_interrupt(vcpu)) 3734 kvm_make_request(KVM_REQ_EVENT, vcpu); 3735 3736 /* 3737 * Do not start the preemption timer hrtimer until after we know 3738 * we are successful, so that only nested_vmx_vmexit needs to cancel 3739 * the timer. 3740 */ 3741 vmx->nested.preemption_timer_expired = false; 3742 if (nested_cpu_has_preemption_timer(vmcs12)) { 3743 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3744 vmx_start_preemption_timer(vcpu, timer_value); 3745 } 3746 3747 /* 3748 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3749 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3750 * returned as far as L1 is concerned. 
It will only return (and set 3751 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3752 */ 3753 return NVMX_VMENTRY_SUCCESS; 3754 3755 /* 3756 * A failed consistency check that leads to a VMExit during L1's 3757 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3758 * 26.7 "VM-entry failures during or after loading guest state". 3759 */ 3760 vmentry_fail_vmexit_guest_mode: 3761 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3762 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3763 leave_guest_mode(vcpu); 3764 3765 vmentry_fail_vmexit: 3766 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3767 3768 if (!from_vmentry) 3769 return NVMX_VMENTRY_VMEXIT; 3770 3771 load_vmcs12_host_state(vcpu, vmcs12); 3772 vmcs12->vm_exit_reason = exit_reason.full; 3773 if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)) 3774 vmx->nested.need_vmcs12_to_shadow_sync = true; 3775 return NVMX_VMENTRY_VMEXIT; 3776 } 3777 3778 /* 3779 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3780 * for running an L2 nested guest. 
 *
 * Return value, as visible in this function: 1 on the success path and on
 * most early-out paths, 0 only when nested_vmx_enter_non_root_mode() reports
 * NVMX_VMENTRY_KVM_INTERNAL_ERROR, and otherwise whatever
 * nested_vmx_failInvalid(), nested_vmx_fail() or kvm_emulate_halt_noskip()
 * returns.
 */
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
{
	struct vmcs12 *vmcs12;
	enum nvmx_vmentry_status status;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	/* Sampled up front, before any of the checks below run. */
	u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
	enum nested_evmptrld_status evmptrld_status;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
	if (evmptrld_status == EVMPTRLD_ERROR) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	kvm_pmu_branch_retired(vcpu);

	if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
		return nested_vmx_failInvalid(vcpu);

	/* Neither an enlightened VMCS nor a regular current VMCS is loaded. */
	if (CC(!nested_vmx_is_evmptr12_valid(vmx) &&
	       vmx->nested.current_vmptr == INVALID_GPA))
		return nested_vmx_failInvalid(vcpu);

	vmcs12 = get_vmcs12(vcpu);

	/*
	 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
	 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
	 * rather than RFLAGS.ZF, and no error number is stored to the
	 * VM-instruction error field.
	 */
	if (CC(vmcs12->hdr.shadow_vmcs))
		return nested_vmx_failInvalid(vcpu);

	/* Pull the latest L1-written state into the cached vmcs12. */
	if (nested_vmx_is_evmptr12_valid(vmx)) {
		struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);

		copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields);
		/* Enlightened VMCS doesn't have launch state */
		vmcs12->launch_state = !launch;
	} else if (enable_shadow_vmcs) {
		copy_shadow_to_vmcs12(vmx);
	}

	/*
	 * The nested entry process starts with enforcing various prerequisites
	 * on vmcs12 as required by the Intel SDM, and acting appropriately when
	 * they fail: As the SDM explains, some conditions should cause the
	 * instruction to fail, while others will cause the instruction to seem
	 * to succeed, but return an EXIT_REASON_INVALID_STATE.
	 * To speed up the normal (success) code path, we should avoid checking
	 * for misconfigurations which will anyway be caught by the processor
	 * when using the merged vmcs02.
	 */
	if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);

	/*
	 * launch_state must be the opposite of @launch: VMLAUNCH fails with
	 * the NONCLEAR error and VMRESUME with the NONLAUNCHED error.
	 */
	if (CC(vmcs12->launch_state == launch))
		return nested_vmx_fail(vcpu,
			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);

	if (nested_vmx_check_controls(vcpu, vmcs12))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);

	if (nested_vmx_check_address_space_size(vcpu, vmcs12))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);

	if (nested_vmx_check_host_state(vcpu, vmcs12))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);

	/*
	 * We're finally done with prerequisite checking, and can start with
	 * the nested entry.
	 */
	vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING;
	vmx->nested.has_preemption_timer_deadline = false;
	status = nested_vmx_enter_non_root_mode(vcpu, true);
	if (unlikely(status != NVMX_VMENTRY_SUCCESS))
		goto vmentry_failed;

	/* Hide L1D cache contents from the nested guest. */
	kvm_request_l1tf_flush_l1d();

	/*
	 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
	 * also be used as part of restoring nVMX state for
	 * snapshot restore (migration).
	 *
	 * In this flow, it is assumed that vmcs12 cache was
	 * transferred as part of captured nVMX state and should
	 * therefore not be read from guest memory (which may not
	 * exist on destination host yet).
	 */
	nested_cache_shadow_vmcs12(vcpu, vmcs12);

	switch (vmcs12->guest_activity_state) {
	case GUEST_ACTIVITY_HLT:
		/*
		 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
		 * awakened by event injection or by an NMI-window VM-exit or
		 * by an interrupt-window VM-exit, halt the vcpu.
		 */
		if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
		    !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
		    !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
		      (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
			vcpu->arch.nested_run_pending = 0;
			return kvm_emulate_halt_noskip(vcpu);
		}
		break;
	case GUEST_ACTIVITY_WAIT_SIPI:
		vcpu->arch.nested_run_pending = 0;
		kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
		break;
	default:
		break;
	}

	return 1;

vmentry_failed:
	/* The entry did not complete, so no nested run is pending any more. */
	vcpu->arch.nested_run_pending = 0;
	if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
		return 0;
	if (status == NVMX_VMENTRY_VMEXIT)
		return 1;
	/* Only VMFAIL should remain; it is reported as an invalid control field. */
	WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
	return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
}

/*
 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
 * This function returns the new value we should put in vmcs12.guest_cr0.
 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
 *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
 *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
 *     didn't trap the bit, because if L1 did, so would L0).
 *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
 *     been modified by L2, and L1 knows it. So just leave the old value of
 *     the bit from vmcs12.guest_cr0.
Note that the bit from vmcs02 GUEST_CR0 3927 * isn't relevant, because if L0 traps this bit it can set it to anything. 3928 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 3929 * changed these bits, and therefore they need to be updated, but L0 3930 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3931 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3932 */ 3933 static inline unsigned long 3934 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3935 { 3936 return 3937 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3938 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3939 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3940 vcpu->arch.cr0_guest_owned_bits)); 3941 } 3942 3943 static inline unsigned long 3944 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3945 { 3946 return 3947 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3948 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3949 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3950 vcpu->arch.cr4_guest_owned_bits)); 3951 } 3952 3953 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3954 struct vmcs12 *vmcs12, 3955 u32 vm_exit_reason, u32 exit_intr_info) 3956 { 3957 u32 idt_vectoring; 3958 unsigned int nr; 3959 3960 /* 3961 * Per the SDM, VM-Exits due to double and triple faults are never 3962 * considered to occur during event delivery, even if the double/triple 3963 * fault is the result of an escalating vectoring issue. 3964 * 3965 * Note, the SDM qualifies the double fault behavior with "The original 3966 * event results in a double-fault exception". It's unclear why the 3967 * qualification exists since exits due to double fault can occur only 3968 * while vectoring a different exception (injected events are never 3969 * subject to interception), i.e. there's _always_ an original event. 
3970 * 3971 * The SDM also uses NMI as a confusing example for the "original event 3972 * causes the VM exit directly" clause. NMI isn't special in any way, 3973 * the same rule applies to all events that cause an exit directly. 3974 * NMI is an odd choice for the example because NMIs can only occur on 3975 * instruction boundaries, i.e. they _can't_ occur during vectoring. 3976 */ 3977 if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || 3978 ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && 3979 is_double_fault(exit_intr_info))) { 3980 vmcs12->idt_vectoring_info_field = 0; 3981 } else if (vcpu->arch.exception.injected) { 3982 nr = vcpu->arch.exception.vector; 3983 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3984 3985 if (kvm_exception_is_soft(nr)) { 3986 vmcs12->vm_exit_instruction_len = 3987 vcpu->arch.event_exit_inst_len; 3988 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3989 } else 3990 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3991 3992 if (vcpu->arch.exception.has_error_code) { 3993 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3994 vmcs12->idt_vectoring_error_code = 3995 vcpu->arch.exception.error_code; 3996 } 3997 3998 vmcs12->idt_vectoring_info_field = idt_vectoring; 3999 } else if (vcpu->arch.nmi_injected) { 4000 vmcs12->idt_vectoring_info_field = 4001 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 4002 } else if (vcpu->arch.interrupt.injected) { 4003 nr = vcpu->arch.interrupt.nr; 4004 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 4005 4006 if (vcpu->arch.interrupt.soft) { 4007 idt_vectoring |= INTR_TYPE_SOFT_INTR; 4008 vmcs12->vm_entry_instruction_len = 4009 vcpu->arch.event_exit_inst_len; 4010 } else 4011 idt_vectoring |= INTR_TYPE_EXT_INTR; 4012 4013 vmcs12->idt_vectoring_info_field = idt_vectoring; 4014 } else { 4015 vmcs12->idt_vectoring_info_field = 0; 4016 } 4017 } 4018 4019 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 4020 { 4021 struct vcpu_vmx *vmx = to_vmx(vcpu); 4022 int max_irr; 4023 void 
*vapic_page; 4024 u16 status; 4025 4026 if (!vmx->nested.pi_pending) 4027 return 0; 4028 4029 if (!vmx->nested.pi_desc) 4030 goto mmio_needed; 4031 4032 vmx->nested.pi_pending = false; 4033 4034 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 4035 return 0; 4036 4037 max_irr = pi_find_highest_vector(vmx->nested.pi_desc); 4038 if (max_irr > 0) { 4039 vapic_page = vmx->nested.virtual_apic_map.hva; 4040 if (!vapic_page) 4041 goto mmio_needed; 4042 4043 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 4044 vapic_page, &max_irr); 4045 status = vmcs_read16(GUEST_INTR_STATUS); 4046 if ((u8)max_irr > ((u8)status & 0xff)) { 4047 status &= ~0xff; 4048 status |= (u8)max_irr; 4049 vmcs_write16(GUEST_INTR_STATUS, status); 4050 } 4051 } 4052 4053 kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.virtual_apic_map); 4054 kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.pi_desc_map); 4055 return 0; 4056 4057 mmio_needed: 4058 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 4059 return -ENXIO; 4060 } 4061 4062 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) 4063 { 4064 struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; 4065 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 4066 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4067 unsigned long exit_qual; 4068 4069 if (ex->has_payload) { 4070 exit_qual = ex->payload; 4071 } else if (ex->vector == PF_VECTOR) { 4072 exit_qual = vcpu->arch.cr2; 4073 } else if (ex->vector == DB_VECTOR) { 4074 exit_qual = vcpu->arch.dr6; 4075 exit_qual &= ~DR6_BT; 4076 exit_qual ^= DR6_ACTIVE_LOW; 4077 } else { 4078 exit_qual = 0; 4079 } 4080 4081 /* 4082 * Unlike AMD's Paged Real Mode, which reports an error code on #PF 4083 * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the 4084 * "has error code" flags on VM-Exit if the CPU is in Real Mode. 
4085 */ 4086 if (ex->has_error_code && is_protmode(vcpu)) { 4087 /* 4088 * Intel CPUs do not generate error codes with bits 31:16 set, 4089 * and more importantly VMX disallows setting bits 31:16 in the 4090 * injected error code for VM-Entry. Drop the bits to mimic 4091 * hardware and avoid inducing failure on nested VM-Entry if L1 4092 * chooses to inject the exception back to L2. AMD CPUs _do_ 4093 * generate "full" 32-bit error codes, so KVM allows userspace 4094 * to inject exception error codes with bits 31:16 set. 4095 */ 4096 vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; 4097 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 4098 } 4099 4100 if (kvm_exception_is_soft(ex->vector)) 4101 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 4102 else 4103 intr_info |= INTR_TYPE_HARD_EXCEPTION; 4104 4105 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 4106 vmx_get_nmi_mask(vcpu)) 4107 intr_info |= INTR_INFO_UNBLOCK_NMI; 4108 4109 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 4110 } 4111 4112 /* 4113 * Returns true if a debug trap is (likely) pending delivery. Infer the class 4114 * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). 4115 * Using the payload is flawed because code breakpoints (fault-like) and data 4116 * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. 4117 * this will return false positives if a to-be-injected code breakpoint #DB is 4118 * pending (from KVM's perspective, but not "pending" across an instruction 4119 * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it 4120 * too is trap-like. 
4121 * 4122 * KVM "works" despite these flaws as ICEBP isn't currently supported by the 4123 * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the 4124 * #DB has already happened), and MTF isn't marked pending on code breakpoints 4125 * from the emulator (because such #DBs are fault-like and thus don't trigger 4126 * actions that fire on instruction retire). 4127 */ 4128 static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) 4129 { 4130 if (!ex->pending || ex->vector != DB_VECTOR) 4131 return 0; 4132 4133 /* General Detect #DBs are always fault-like. */ 4134 return ex->payload & ~DR6_BD; 4135 } 4136 4137 /* 4138 * Returns true if there's a pending #DB exception that is lower priority than 4139 * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by 4140 * KVM, but could theoretically be injected by userspace. Note, this code is 4141 * imperfect, see above. 4142 */ 4143 static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) 4144 { 4145 return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; 4146 } 4147 4148 /* 4149 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 4150 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM 4151 * represents these debug traps with a payload that is said to be compatible 4152 * with the 'pending debug exceptions' field, write the payload to the VMCS 4153 * field if a VM-exit is delivered before the debug trap. 
4154 */ 4155 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 4156 { 4157 unsigned long pending_dbg; 4158 4159 pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); 4160 if (pending_dbg) 4161 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); 4162 } 4163 4164 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 4165 { 4166 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 4167 to_vmx(vcpu)->nested.preemption_timer_expired; 4168 } 4169 4170 static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection) 4171 { 4172 struct vcpu_vmx *vmx = to_vmx(vcpu); 4173 void *vapic = vmx->nested.virtual_apic_map.hva; 4174 int max_irr, vppr; 4175 4176 if (nested_vmx_preemption_timer_pending(vcpu) || 4177 vmx->nested.mtf_pending) 4178 return true; 4179 4180 /* 4181 * Virtual Interrupt Delivery doesn't require manual injection. Either 4182 * the interrupt is already in GUEST_RVI and will be recognized by CPU 4183 * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move 4184 * the interrupt from the PIR to RVI prior to entering the guest. 4185 */ 4186 if (for_injection) 4187 return false; 4188 4189 if (!nested_cpu_has_vid(get_vmcs12(vcpu)) || 4190 __vmx_interrupt_blocked(vcpu)) 4191 return false; 4192 4193 if (!vapic) 4194 return false; 4195 4196 vppr = *((u32 *)(vapic + APIC_PROCPRI)); 4197 4198 max_irr = vmx_get_rvi(); 4199 if ((max_irr & 0xf0) > (vppr & 0xf0)) 4200 return true; 4201 4202 if (vmx->nested.pi_pending && vmx->nested.pi_desc && 4203 pi_test_on(vmx->nested.pi_desc)) { 4204 max_irr = pi_find_highest_vector(vmx->nested.pi_desc); 4205 if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0)) 4206 return true; 4207 } 4208 4209 return false; 4210 } 4211 4212 /* 4213 * Per the Intel SDM's table "Priority Among Concurrent Events", with minor 4214 * edits to fill in missing examples, e.g. 
 * #DB due to split-lock accesses,
 * and less minor edits to splice in the priority of VMX Non-Root specific
 * events, e.g. MTF and NMI/INTR-window exiting.
 *
 * 1 Hardware Reset and Machine Checks
 *	- RESET
 *	- Machine Check
 *
 * 2 Trap on Task Switch
 *	- T flag in TSS is set (on task switch)
 *
 * 3 External Hardware Interventions
 *	- FLUSH
 *	- STOPCLK
 *	- SMI
 *	- INIT
 *
 * 3.5 Monitor Trap Flag (MTF) VM-exit[1]
 *
 * 4 Traps on Previous Instruction
 *	- Breakpoints
 *	- Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O
 *	  breakpoint, or #DB due to a split-lock access)
 *
 * 4.3	VMX-preemption timer expired VM-exit[2]
 *
 * 4.6	NMI-window exiting VM-exit[3]
 *
 * 5 Nonmaskable Interrupts (NMI)
 *
 * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery[4]
 *
 * 6 Maskable Hardware Interrupts
 *
 * 7 Code Breakpoint Fault
 *
 * 8 Faults from Fetching Next Instruction
 *	- Code-Segment Limit Violation
 *	- Code Page Fault
 *	- Control protection exception (missing ENDBRANCH at target of indirect
 *					call or jump)
 *
 * 9 Faults from Decoding Next Instruction
 *	- Instruction length > 15 bytes
 *	- Invalid Opcode
 *	- Coprocessor Not Available
 *
 *10 Faults on Executing Instruction
 *	- Overflow
 *	- Bound error
 *	- Invalid TSS
 *	- Segment Not Present
 *	- Stack fault
 *	- General Protection
 *	- Data Page Fault
 *	- Alignment Check
 *	- x86 FPU Floating-point exception
 *	- SIMD floating-point exception
 *	- Virtualization exception
 *	- Control protection exception
 *
 * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs),
 *     INIT signals, and higher priority events take priority over MTF VM exits.
 *     MTF VM exits take priority over debug-trap exceptions and lower priority
 *     events.
 *
 * [2] Debug-trap exceptions and higher priority events take priority over VM exits
 *     caused by the VMX-preemption timer.  VM exits caused by the VMX-preemption
 *     timer take priority over VM exits caused by the "NMI-window exiting"
 *     VM-execution control and lower priority events.
 *
 * [3] Debug-trap exceptions and higher priority events take priority over VM exits
 *     caused by "NMI-window exiting".  VM exits caused by this control take
 *     priority over non-maskable interrupts (NMIs) and lower priority events.
 *
 * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to
 *     the 1-setting of the "interrupt-window exiting" VM-execution control.  Thus,
 *     non-maskable interrupts (NMIs) and higher priority events take priority over
 *     delivery of a virtual interrupt; delivery of a virtual interrupt takes
 *     priority over external interrupts and lower priority events.
 *
 * Return value, as visible in this function: -EBUSY when the highest priority
 * pending event is currently blocked, 0 after a nested VM-Exit has been
 * emulated via nested_vmx_vmexit() (or the event was consumed), and otherwise
 * the result of vmx_complete_nested_posted_interrupt().
 */
static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
{
	struct kvm_lapic *apic = vcpu->arch.apic;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	/*
	 * Only a pending nested run blocks a pending exception.  If there is a
	 * previously injected event, the pending exception occurred while said
	 * event was being delivered and thus needs to be handled.
	 */
	bool block_nested_exceptions = vcpu->arch.nested_run_pending;
	/*
	 * Events that don't require injection, i.e. that are virtualized by
	 * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need
	 * to regain control in order to deliver the event, and hardware will
	 * handle event ordering, e.g. with respect to injected exceptions.
	 *
	 * But, new events (not exceptions) are only recognized at instruction
	 * boundaries.  If an event needs reinjection, then KVM is handling a
	 * VM-Exit that occurred _during_ instruction execution; new events,
	 * irrespective of whether or not they're injected, are blocked until
	 * the instruction completes.
	 */
	bool block_non_injected_events = kvm_event_needs_reinjection(vcpu);
	/*
	 * Injected events are blocked by nested VM-Enter, as KVM is responsible
	 * for managing priority between concurrent events, i.e. KVM needs to
	 * wait until after VM-Enter completes to deliver injected events.
	 */
	bool block_nested_events = block_nested_exceptions ||
				   block_non_injected_events;

	/* INIT: handled first, ahead of every other event below. */
	if (lapic_in_kernel(vcpu) &&
	    test_bit(KVM_APIC_INIT, &apic->pending_events)) {
		if (block_nested_events)
			return -EBUSY;
		nested_vmx_update_pending_dbg(vcpu);
		clear_bit(KVM_APIC_INIT, &apic->pending_events);
		if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
			nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);

		/* MTF is discarded if the vCPU is in WFS. */
		vmx->nested.mtf_pending = false;
		return 0;
	}

	/* SIPI: causes a VM-Exit only while the vCPU is in INIT_RECEIVED. */
	if (lapic_in_kernel(vcpu) &&
	    test_bit(KVM_APIC_SIPI, &apic->pending_events)) {
		if (block_nested_events)
			return -EBUSY;

		clear_bit(KVM_APIC_SIPI, &apic->pending_events);
		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
			nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
					  apic->sipi_vector & 0xFFUL);
			return 0;
		}
		/* Fallthrough, the SIPI is completely ignored. */
	}

	/*
	 * Process exceptions that are higher priority than Monitor Trap Flag:
	 * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but
	 * could theoretically come in from userspace), and ICEBP (INT1).
	 *
	 * TODO: SMIs have higher priority than MTF and trap-like #DBs (except
	 * for TSS T flag #DBs).  KVM also doesn't save/restore pending MTF
	 * across SMI/RSM as it should; that needs to be addressed in order to
	 * prioritize SMI over MTF and trap-like #DBs.
	 */
	if (vcpu->arch.exception_vmexit.pending &&
	    !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) {
		if (block_nested_exceptions)
			return -EBUSY;

		nested_vmx_inject_exception_vmexit(vcpu);
		return 0;
	}

	if (vcpu->arch.exception.pending &&
	    !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) {
		if (block_nested_exceptions)
			return -EBUSY;
		goto no_vmexit;
	}

	if (vmx->nested.mtf_pending) {
		if (block_nested_events)
			return -EBUSY;
		nested_vmx_update_pending_dbg(vcpu);
		nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
		return 0;
	}

	/* Remaining exceptions, i.e. the #DB traps that rank below MTF. */
	if (vcpu->arch.exception_vmexit.pending) {
		if (block_nested_exceptions)
			return -EBUSY;

		nested_vmx_inject_exception_vmexit(vcpu);
		return 0;
	}

	if (vcpu->arch.exception.pending) {
		if (block_nested_exceptions)
			return -EBUSY;
		goto no_vmexit;
	}

	if (nested_vmx_preemption_timer_pending(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
		return 0;
	}

	/* A pending SMI never causes a nested VM-Exit on this path. */
	if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		goto no_vmexit;
	}

	if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_nmi(vcpu))
			goto no_vmexit;

		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  NMI_VECTOR | INTR_TYPE_NMI_INTR |
				  INTR_INFO_VALID_MASK, 0);
		/*
		 * The NMI-triggered VM exit counts as injection:
		 * clear this one and block further NMIs.
		 */
		vcpu->arch.nmi_pending = 0;
		vmx_set_nmi_mask(vcpu, true);
		return 0;
	}

	if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
		int irq;

		if (!nested_exit_on_intr(vcpu)) {
			if (block_nested_events)
				return -EBUSY;

			goto no_vmexit;
		}

		/* Without ACK-on-exit the VM-Exit carries no interrupt info. */
		if (!nested_exit_intr_ack_set(vcpu)) {
			if (block_nested_events)
				return -EBUSY;

			nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
			return 0;
		}

		irq = kvm_cpu_get_extint(vcpu);
		if (irq != -1) {
			if (block_nested_events)
				return -EBUSY;

			nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
					  INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
			return 0;
		}

		irq = kvm_apic_has_interrupt(vcpu);
		if (WARN_ON_ONCE(irq < 0))
			goto no_vmexit;

		/*
		 * If the IRQ is L2's PI notification vector, process posted
		 * interrupts for L2 instead of injecting VM-Exit, as the
		 * detection/morphing architecturally occurs when the IRQ is
		 * delivered to the CPU.  Note, only interrupts that are routed
		 * through the local APIC trigger posted interrupt processing,
		 * and enabling posted interrupts requires ACK-on-exit.
		 */
		if (irq == vmx->nested.posted_intr_nv) {
			/*
			 * Nested posted interrupts are delivered via RVI, i.e.
			 * aren't injected by KVM, and so can be queued even if
			 * manual event injection is disallowed.
			 */
			if (block_non_injected_events)
				return -EBUSY;

			vmx->nested.pi_pending = true;
			kvm_apic_clear_irr(vcpu, irq);
			goto no_vmexit;
		}

		if (block_nested_events)
			return -EBUSY;

		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
				  INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);

		/*
		 * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must
		 * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI
		 * if APICv is active.
		 */
		kvm_apic_ack_interrupt(vcpu, irq);
		return 0;
	}

no_vmexit:
	return vmx_complete_nested_posted_interrupt(vcpu);
}

/*
 * Convert the time left on the emulated preemption hrtimer into a timer
 * value: remaining nanoseconds are scaled by virtual_tsc_khz / 1000000 and
 * then shifted right by VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE.  Returns 0
 * if no time remains.
 */
static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
{
	ktime_t remaining =
		hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
	u64 value;

	if (ktime_to_ns(remaining) <= 0)
		return 0;

	value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
	do_div(value, 1000000);
	return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
}

/*
 * Returns true if @field is one of the guest-state fields listed below.
 * The list mirrors the fields read by sync_vmcs02_to_vmcs12_rare(), plus
 * GUEST_BNDCFGS.
 * NOTE(review): GUEST_BNDCFGS is listed here but is not read by
 * sync_vmcs02_to_vmcs12_rare() - confirm that is intended.
 */
static bool is_vmcs12_ext_field(unsigned long field)
{
	switch (field) {
	case GUEST_ES_SELECTOR:
	case GUEST_CS_SELECTOR:
	case GUEST_SS_SELECTOR:
	case GUEST_DS_SELECTOR:
	case GUEST_FS_SELECTOR:
	case GUEST_GS_SELECTOR:
	case GUEST_LDTR_SELECTOR:
	case GUEST_TR_SELECTOR:
	case GUEST_ES_LIMIT:
	case GUEST_CS_LIMIT:
	case GUEST_SS_LIMIT:
	case GUEST_DS_LIMIT:
	case GUEST_FS_LIMIT:
	case GUEST_GS_LIMIT:
	case GUEST_LDTR_LIMIT:
	case GUEST_TR_LIMIT:
	case GUEST_GDTR_LIMIT:
	case GUEST_IDTR_LIMIT:
	case GUEST_ES_AR_BYTES:
	case GUEST_DS_AR_BYTES:
	case GUEST_FS_AR_BYTES:
	case GUEST_GS_AR_BYTES:
	case GUEST_LDTR_AR_BYTES:
	case GUEST_TR_AR_BYTES:
	case GUEST_ES_BASE:
	case GUEST_CS_BASE:
	case GUEST_SS_BASE:
	case GUEST_DS_BASE:
	case GUEST_FS_BASE:
	case GUEST_GS_BASE:
	case GUEST_LDTR_BASE:
	case GUEST_TR_BASE:
	case GUEST_GDTR_BASE:
	case GUEST_IDTR_BASE:
	case GUEST_PENDING_DBG_EXCEPTIONS:
	case GUEST_BNDCFGS:
		return true;
	default:
		break;
	}

	return false;
}

/*
 * Read the segment selectors, limits, access rights and bases, the GDTR/IDTR
 * limits and bases, and the pending debug exceptions from the VMCS into
 * vmcs12, then clear need_sync_vmcs02_to_vmcs12_rare.  CS and SS access
 * rights are not read here.  Callers must have vmcs02 loaded, see
 * copy_vmcs02_to_vmcs12_rare().
 */
static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
	vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
	vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
	vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
	vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
	vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
	vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
	vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
	vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
	vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
	vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
	vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
	vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
	vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
	vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
	vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
	vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
	vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
	vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
	vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
	vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
	vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
	vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
	vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
	vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
	vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
	vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
	vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
	vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
	vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
	vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
	vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
	vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
	vmcs12->guest_pending_dbg_exceptions =
		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);

	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
}

/*
 * If a rare-field sync is still pending, temporarily load vmcs02, run
 * sync_vmcs02_to_vmcs12_rare(), and switch back to vmcs01.  The switch is
 * bracketed by get_cpu()/put_cpu().  Expects vmcs01 to be the loaded VMCS on
 * entry and warns once otherwise.
 */
static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int cpu;

	if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
		return;

	WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);

	cpu = get_cpu();
	/* Make vmcs02 current so the vmcs_read*() calls below target it. */
	vmx->loaded_vmcs = &vmx->nested.vmcs02;
	vmx_vcpu_load_vmcs(vcpu, cpu);

	sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);

	/* Restore vmcs01 as the loaded VMCS before leaving. */
	vmx->loaded_vmcs = &vmx->vmcs01;
	vmx_vcpu_load_vmcs(vcpu, cpu);
	put_cpu();
}

/*
 * Update the guest state fields of vmcs12 to reflect changes that
 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
 * VM-entry controls is also updated, since this is really a guest
 * state bit.)
4640 */ 4641 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4642 { 4643 struct vcpu_vmx *vmx = to_vmx(vcpu); 4644 4645 if (nested_vmx_is_evmptr12_valid(vmx)) 4646 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4647 4648 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4649 !nested_vmx_is_evmptr12_valid(vmx); 4650 4651 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4652 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4653 4654 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4655 vmcs12->guest_rip = kvm_rip_read(vcpu); 4656 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4657 4658 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4659 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4660 4661 vmcs12->guest_interruptibility_info = 4662 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4663 4664 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4665 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4666 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4667 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4668 else 4669 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4670 4671 if (nested_cpu_has_preemption_timer(vmcs12) && 4672 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4673 !vcpu->arch.nested_run_pending) 4674 vmcs12->vmx_preemption_timer_value = 4675 vmx_get_preemption_timer_value(vcpu); 4676 4677 /* 4678 * In some cases (usually, nested EPT), L2 is allowed to change its 4679 * own CR3 without exiting. If it has changed it, we must keep it. 4680 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4681 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4682 * 4683 * Additionally, restore L2's PDPTR to vmcs12. 
4684 */ 4685 if (enable_ept) { 4686 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4687 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4688 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4689 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4690 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4691 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4692 } 4693 } 4694 4695 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4696 4697 if (nested_cpu_has_vid(vmcs12)) 4698 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4699 4700 vmcs12->vm_entry_controls = 4701 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4702 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4703 4704 /* 4705 * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02. 4706 * Writes to DEBUGCTL that aren't intercepted by L1 are immediately 4707 * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into 4708 * vmcs02 doesn't strictly track vmcs12. 4709 */ 4710 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4711 vmcs12->guest_dr7 = vcpu->arch.dr7; 4712 4713 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4714 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4715 4716 vmcs_read_cet_state(&vmx->vcpu, &vmcs12->guest_s_cet, 4717 &vmcs12->guest_ssp, 4718 &vmcs12->guest_ssp_tbl); 4719 } 4720 4721 /* 4722 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4723 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4724 * and this function updates it to reflect the changes to the guest state while 4725 * L2 was running (and perhaps made some exits which were handled directly by L0 4726 * without going back to L1), and to reflect the exit reason. 4727 * Note that we do not have to copy here all VMCS fields, just those that 4728 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4729 * exit-information fields only. 
Other fields are modified by L1 with VMWRITE, 4730 * which already writes to vmcs12 directly. 4731 */ 4732 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4733 u32 vm_exit_reason, u32 exit_intr_info, 4734 unsigned long exit_qualification, u32 exit_insn_len) 4735 { 4736 /* update exit information fields: */ 4737 vmcs12->vm_exit_reason = vm_exit_reason; 4738 if (vmx_get_exit_reason(vcpu).enclave_mode) 4739 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4740 vmcs12->exit_qualification = exit_qualification; 4741 4742 /* 4743 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched 4744 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other 4745 * exit info fields are unmodified. 4746 */ 4747 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4748 vmcs12->launch_state = 1; 4749 4750 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4751 * instead of reading the real value. */ 4752 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4753 4754 /* 4755 * Transfer the event that L0 or L1 may wanted to inject into 4756 * L2 to IDT_VECTORING_INFO_FIELD. 4757 */ 4758 vmcs12_save_pending_event(vcpu, vmcs12, 4759 vm_exit_reason, exit_intr_info); 4760 4761 vmcs12->vm_exit_intr_info = exit_intr_info; 4762 vmcs12->vm_exit_instruction_len = exit_insn_len; 4763 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4764 4765 /* 4766 * According to spec, there's no need to store the guest's 4767 * MSRs if the exit is due to a VM-entry failure that occurs 4768 * during or after loading the guest state. Since this exit 4769 * does not fall in that category, we need to save the MSRs. 
4770 */ 4771 if (nested_vmx_store_msr(vcpu, 4772 vmcs12->vm_exit_msr_store_addr, 4773 vmcs12->vm_exit_msr_store_count)) 4774 nested_vmx_abort(vcpu, 4775 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4776 } 4777 } 4778 4779 /* 4780 * A part of what we need to when the nested L2 guest exits and we want to 4781 * run its L1 parent, is to reset L1's guest state to the host state specified 4782 * in vmcs12. 4783 * This function is to be called not only on normal nested exit, but also on 4784 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4785 * Failures During or After Loading Guest State"). 4786 * This function should be called when the active VMCS is L1's (vmcs01). 4787 */ 4788 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4789 struct vmcs12 *vmcs12) 4790 { 4791 enum vm_entry_failure_code ignored; 4792 struct kvm_segment seg; 4793 4794 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4795 vcpu->arch.efer = vmcs12->host_ia32_efer; 4796 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4797 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4798 else 4799 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4800 vmx_set_efer(vcpu, vcpu->arch.efer); 4801 4802 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4803 kvm_rip_write(vcpu, vmcs12->host_rip); 4804 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4805 vmx_set_interrupt_shadow(vcpu, 0); 4806 4807 /* 4808 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4809 * actually changed, because vmx_set_cr0 refers to efer set above. 4810 * 4811 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4812 * (KVM doesn't change it); 4813 */ 4814 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4815 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4816 4817 /* Same as above - no reason to call set_cr4_guest_host_mask(). 
*/ 4818 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4819 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4820 4821 nested_ept_uninit_mmu_context(vcpu); 4822 4823 /* 4824 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4825 * couldn't have changed. 4826 */ 4827 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4828 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4829 4830 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4831 4832 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4833 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4834 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4835 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4836 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4837 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4838 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4839 4840 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4841 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4842 vmcs_write64(GUEST_BNDCFGS, 0); 4843 4844 /* 4845 * Load CET state from host state if VM_EXIT_LOAD_CET_STATE is set. 4846 * otherwise CET state should be retained across VM-exit, i.e., 4847 * guest values should be propagated from vmcs12 to vmcs01. 
4848 */ 4849 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) 4850 vmcs_write_cet_state(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp, 4851 vmcs12->host_ssp_tbl); 4852 else 4853 vmcs_write_cet_state(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp, 4854 vmcs12->guest_ssp_tbl); 4855 4856 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4857 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4858 vcpu->arch.pat = vmcs12->host_ia32_pat; 4859 } 4860 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 4861 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) 4862 WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4863 vmcs12->host_ia32_perf_global_ctrl)); 4864 4865 /* Set L1 segment info according to Intel SDM 4866 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4867 seg = (struct kvm_segment) { 4868 .base = 0, 4869 .limit = 0xFFFFFFFF, 4870 .selector = vmcs12->host_cs_selector, 4871 .type = 11, 4872 .present = 1, 4873 .s = 1, 4874 .g = 1 4875 }; 4876 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4877 seg.l = 1; 4878 else 4879 seg.db = 1; 4880 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4881 seg = (struct kvm_segment) { 4882 .base = 0, 4883 .limit = 0xFFFFFFFF, 4884 .type = 3, 4885 .present = 1, 4886 .s = 1, 4887 .db = 1, 4888 .g = 1 4889 }; 4890 seg.selector = vmcs12->host_ds_selector; 4891 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4892 seg.selector = vmcs12->host_es_selector; 4893 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4894 seg.selector = vmcs12->host_ss_selector; 4895 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4896 seg.selector = vmcs12->host_fs_selector; 4897 seg.base = vmcs12->host_fs_base; 4898 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4899 seg.selector = vmcs12->host_gs_selector; 4900 seg.base = vmcs12->host_gs_base; 4901 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4902 seg = (struct kvm_segment) { 4903 .base = vmcs12->host_tr_base, 4904 .limit = 0x67, 4905 .selector = 
vmcs12->host_tr_selector, 4906 .type = 11, 4907 .present = 1 4908 }; 4909 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4910 4911 memset(&seg, 0, sizeof(seg)); 4912 seg.unusable = 1; 4913 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4914 4915 kvm_set_dr(vcpu, 7, 0x400); 4916 vmx_guest_debugctl_write(vcpu, 0); 4917 4918 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4919 vmcs12->vm_exit_msr_load_count)) 4920 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4921 4922 to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4923 } 4924 4925 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4926 { 4927 struct vmx_uret_msr *efer_msr; 4928 unsigned int i; 4929 4930 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4931 return vmcs_read64(GUEST_IA32_EFER); 4932 4933 if (cpu_has_load_ia32_efer()) 4934 return kvm_host.efer; 4935 4936 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4937 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4938 return vmx->msr_autoload.guest.val[i].value; 4939 } 4940 4941 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4942 if (efer_msr) 4943 return efer_msr->data; 4944 4945 return kvm_host.efer; 4946 } 4947 4948 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4949 { 4950 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4951 struct vcpu_vmx *vmx = to_vmx(vcpu); 4952 struct vmx_msr_entry g, h; 4953 gpa_t gpa; 4954 u32 i, j; 4955 4956 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4957 4958 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4959 /* 4960 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4961 * as vmcs01.GUEST_DR7 contains a userspace defined value 4962 * and vcpu->arch.dr7 is not squirreled away before the 4963 * nested VMENTER (not worth adding a variable in nested_vmx). 
4964 */ 4965 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4966 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4967 else 4968 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4969 } 4970 4971 /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */ 4972 vmx_reload_guest_debugctl(vcpu); 4973 4974 /* 4975 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4976 * handle a variety of side effects to KVM's software model. 4977 */ 4978 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4979 4980 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4981 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4982 4983 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4984 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4985 4986 nested_ept_uninit_mmu_context(vcpu); 4987 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4988 kvm_register_mark_available(vcpu, VCPU_REG_CR3); 4989 4990 /* 4991 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4992 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4993 * VMFail, like everything else we just need to ensure our 4994 * software model is up-to-date. 4995 */ 4996 if (enable_ept && is_pae_paging(vcpu)) 4997 ept_save_pdptrs(vcpu); 4998 4999 kvm_mmu_reset_context(vcpu); 5000 5001 /* 5002 * This nasty bit of open coding is a compromise between blindly 5003 * loading L1's MSRs using the exit load lists (incorrect emulation 5004 * of VMFail), leaving the nested VM's MSRs in the software model 5005 * (incorrect behavior) and snapshotting the modified MSRs (too 5006 * expensive since the lists are unbound by hardware). For each 5007 * MSR that was (prematurely) loaded from the nested VMEntry load 5008 * list, reload it from the exit load list if it exists and differs 5009 * from the guest value. The intent is to stuff host state as 5010 * silently as possible, not to fully process the exit load list. 
5011 */ 5012 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 5013 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 5014 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 5015 pr_debug_ratelimited( 5016 "%s read MSR index failed (%u, 0x%08llx)\n", 5017 __func__, i, gpa); 5018 goto vmabort; 5019 } 5020 5021 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 5022 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 5023 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 5024 pr_debug_ratelimited( 5025 "%s read MSR failed (%u, 0x%08llx)\n", 5026 __func__, j, gpa); 5027 goto vmabort; 5028 } 5029 if (h.index != g.index) 5030 continue; 5031 if (h.value == g.value) 5032 break; 5033 5034 if (nested_vmx_load_msr_check(vcpu, &h)) { 5035 pr_debug_ratelimited( 5036 "%s check failed (%u, 0x%x, 0x%x)\n", 5037 __func__, j, h.index, h.reserved); 5038 goto vmabort; 5039 } 5040 5041 if (kvm_emulate_msr_write(vcpu, h.index, h.value)) { 5042 pr_debug_ratelimited( 5043 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 5044 __func__, j, h.index, h.value); 5045 goto vmabort; 5046 } 5047 } 5048 } 5049 5050 return; 5051 5052 vmabort: 5053 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 5054 } 5055 5056 /* 5057 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 5058 * and modify vmcs12 to make it see what it would expect to see there if 5059 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 5060 */ 5061 void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 5062 u32 exit_intr_info, unsigned long exit_qualification, 5063 u32 exit_insn_len) 5064 { 5065 struct vcpu_vmx *vmx = to_vmx(vcpu); 5066 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5067 5068 /* Pending MTF traps are discarded on VM-Exit. 
*/ 5069 vmx->nested.mtf_pending = false; 5070 5071 /* trying to cancel vmlaunch/vmresume is a bug */ 5072 kvm_warn_on_nested_run_pending(vcpu); 5073 5074 #ifdef CONFIG_KVM_HYPERV 5075 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 5076 /* 5077 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 5078 * Enlightened VMCS after migration and we still need to 5079 * do that when something is forcing L2->L1 exit prior to 5080 * the first L2 run. 5081 */ 5082 (void)nested_get_evmcs_page(vcpu); 5083 } 5084 #endif 5085 5086 /* Service pending TLB flush requests for L2 before switching to L1. */ 5087 kvm_service_local_tlb_flush_requests(vcpu); 5088 5089 /* 5090 * VCPU_REG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 5091 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 5092 * up-to-date before switching to L1. 5093 */ 5094 if (enable_ept && is_pae_paging(vcpu)) 5095 vmx_ept_load_pdptrs(vcpu); 5096 5097 leave_guest_mode(vcpu); 5098 5099 if (nested_cpu_has_preemption_timer(vmcs12)) 5100 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 5101 5102 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 5103 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 5104 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 5105 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 5106 } 5107 5108 if (likely(!vmx->fail)) { 5109 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 5110 5111 if (vm_exit_reason != -1) 5112 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 5113 exit_intr_info, exit_qualification, 5114 exit_insn_len); 5115 5116 /* 5117 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 5118 * also be used to capture vmcs12 cache as part of 5119 * capturing nVMX state for snapshot (migration). 5120 * 5121 * Otherwise, this flush will dirty guest memory at a 5122 * point it is already assumed by user-space to be 5123 * immutable. 
5124 */ 5125 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 5126 } else { 5127 /* 5128 * The only expected VM-instruction error is "VM entry with 5129 * invalid control field(s)." Anything else indicates a 5130 * problem with L0. 5131 */ 5132 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 5133 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 5134 5135 /* VM-Fail at VM-Entry means KVM missed a consistency check. */ 5136 WARN_ON_ONCE(warn_on_missed_cc); 5137 } 5138 5139 /* 5140 * Drop events/exceptions that were queued for re-injection to L2 5141 * (picked up via vmx_complete_interrupts()), as well as exceptions 5142 * that were pending for L2. Note, this must NOT be hoisted above 5143 * prepare_vmcs12(), events/exceptions queued for re-injection need to 5144 * be captured in vmcs12 (see vmcs12_save_pending_event()). 5145 */ 5146 vcpu->arch.nmi_injected = false; 5147 kvm_clear_exception_queue(vcpu); 5148 kvm_clear_interrupt_queue(vcpu); 5149 5150 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 5151 5152 kvm_nested_vmexit_handle_ibrs(vcpu); 5153 5154 /* 5155 * Update any VMCS fields that might have changed while vmcs02 was the 5156 * active VMCS. The tracking is per-vCPU, not per-VMCS. 
5157 */ 5158 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr); 5159 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 5160 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 5161 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 5162 if (kvm_caps.has_tsc_control) 5163 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 5164 5165 nested_put_vmcs12_pages(vcpu); 5166 5167 if ((vm_exit_reason != -1) && 5168 (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) 5169 vmx->nested.need_vmcs12_to_shadow_sync = true; 5170 5171 /* in case we halted in L2 */ 5172 kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); 5173 5174 if (likely(!vmx->fail)) { 5175 if (vm_exit_reason != -1) 5176 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 5177 vmcs12->exit_qualification, 5178 vmcs12->idt_vectoring_info_field, 5179 vmcs12->vm_exit_intr_info, 5180 vmcs12->vm_exit_intr_error_code, 5181 KVM_ISA_VMX); 5182 5183 load_vmcs12_host_state(vcpu, vmcs12); 5184 5185 /* 5186 * Process events if an injectable IRQ or NMI is pending, even 5187 * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). 5188 * If an event became pending while L2 was active, KVM needs to 5189 * either inject the event or request an IRQ/NMI window. SMIs 5190 * don't need to be processed as SMM is mutually exclusive with 5191 * non-root mode. INIT/SIPI don't need to be checked as INIT 5192 * is blocked post-VMXON, and SIPIs are ignored. 5193 */ 5194 if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending) 5195 kvm_make_request(KVM_REQ_EVENT, vcpu); 5196 return; 5197 } 5198 5199 /* 5200 * After an early L2 VM-entry failure, we're now back 5201 * in L1 which thinks it just finished a VMLAUNCH or 5202 * VMRESUME instruction, so we need to set the failure 5203 * flag and the VM-instruction error field of the VMCS 5204 * accordingly, and skip the emulated instruction. 
5205 */ 5206 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 5207 5208 /* 5209 * Restore L1's host state to KVM's software model. We're here 5210 * because a consistency check was caught by hardware, which 5211 * means some amount of guest state has been propagated to KVM's 5212 * model and needs to be unwound to the host's state. 5213 */ 5214 nested_vmx_restore_host_state(vcpu); 5215 5216 vmx->fail = 0; 5217 } 5218 5219 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 5220 { 5221 kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); 5222 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 5223 } 5224 5225 /* 5226 * Decode the memory-address operand of a vmx instruction, as recorded on an 5227 * exit caused by such an instruction (run by a guest hypervisor). 5228 * On success, returns 0. When the operand is invalid, returns 1 and throws 5229 * #UD, #GP, or #SS. 5230 */ 5231 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 5232 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 5233 { 5234 gva_t off; 5235 bool exn; 5236 struct kvm_segment s; 5237 5238 /* 5239 * According to Vol. 3B, "Information for VM Exits Due to Instruction 5240 * Execution", on an exit, vmx_instruction_info holds most of the 5241 * addressing components of the operand. Only the displacement part 5242 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 5243 * For how an actual address is calculated from all these components, 5244 * refer to Vol. 1, "Operand Addressing". 
5245 */ 5246 int scaling = vmx_instruction_info & 3; 5247 int addr_size = (vmx_instruction_info >> 7) & 7; 5248 bool is_reg = vmx_instruction_info & (1u << 10); 5249 int seg_reg = (vmx_instruction_info >> 15) & 7; 5250 int index_reg = (vmx_instruction_info >> 18) & 0xf; 5251 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 5252 int base_reg = (vmx_instruction_info >> 23) & 0xf; 5253 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 5254 5255 if (is_reg) { 5256 kvm_queue_exception(vcpu, UD_VECTOR); 5257 return 1; 5258 } 5259 5260 /* Addr = segment_base + offset */ 5261 /* offset = base + [index * scale] + displacement */ 5262 off = exit_qualification; /* holds the displacement */ 5263 if (addr_size == 1) 5264 off = (gva_t)sign_extend64(off, 31); 5265 else if (addr_size == 0) 5266 off = (gva_t)sign_extend64(off, 15); 5267 if (base_is_valid) 5268 off += kvm_register_read(vcpu, base_reg); 5269 if (index_is_valid) 5270 off += kvm_register_read(vcpu, index_reg) << scaling; 5271 vmx_get_segment(vcpu, &s, seg_reg); 5272 5273 /* 5274 * The effective address, i.e. @off, of a memory operand is truncated 5275 * based on the address size of the instruction. Note that this is 5276 * the *effective address*, i.e. the address prior to accounting for 5277 * the segment's base. 5278 */ 5279 if (addr_size == 1) /* 32 bit */ 5280 off &= 0xffffffff; 5281 else if (addr_size == 0) /* 16 bit */ 5282 off &= 0xffff; 5283 5284 /* Checks for #GP/#SS exceptions. */ 5285 exn = false; 5286 if (is_long_mode(vcpu)) { 5287 /* 5288 * The virtual/linear address is never truncated in 64-bit 5289 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 5290 * address when using FS/GS with a non-zero base. 5291 */ 5292 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 5293 *ret = s.base + off; 5294 else 5295 *ret = off; 5296 5297 *ret = vmx_get_untagged_addr(vcpu, *ret, 0); 5298 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 5299 * non-canonical form. 
This is the only check on the memory 5300 * destination for long mode! 5301 */ 5302 exn = is_noncanonical_address(*ret, vcpu, 0); 5303 } else { 5304 /* 5305 * When not in long mode, the virtual/linear address is 5306 * unconditionally truncated to 32 bits regardless of the 5307 * address size. 5308 */ 5309 *ret = (s.base + off) & 0xffffffff; 5310 5311 /* Protected mode: apply checks for segment validity in the 5312 * following order: 5313 * - segment type check (#GP(0) may be thrown) 5314 * - usability check (#GP(0)/#SS(0)) 5315 * - limit check (#GP(0)/#SS(0)) 5316 */ 5317 if (wr) 5318 /* #GP(0) if the destination operand is located in a 5319 * read-only data segment or any code segment. 5320 */ 5321 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 5322 else 5323 /* #GP(0) if the source operand is located in an 5324 * execute-only code segment 5325 */ 5326 exn = ((s.type & 0xa) == 8); 5327 if (exn) { 5328 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 5329 return 1; 5330 } 5331 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 5332 */ 5333 exn = (s.unusable != 0); 5334 5335 /* 5336 * Protected mode: #GP(0)/#SS(0) if the memory operand is 5337 * outside the segment limit. All CPUs that support VMX ignore 5338 * limit checks for flat segments, i.e. segments with base==0, 5339 * limit==0xffffffff and of type expand-up data or code. 5340 */ 5341 if (!(s.base == 0 && s.limit == 0xffffffff && 5342 ((s.type & 8) || !(s.type & 4)))) 5343 exn = exn || ((u64)off + len - 1 > s.limit); 5344 } 5345 if (exn) { 5346 kvm_queue_exception_e(vcpu, 5347 seg_reg == VCPU_SREG_SS ? 
5348 SS_VECTOR : GP_VECTOR, 5349 0); 5350 return 1; 5351 } 5352 5353 return 0; 5354 } 5355 5356 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 5357 int *ret) 5358 { 5359 gva_t gva; 5360 struct x86_exception e; 5361 int r; 5362 5363 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5364 vmcs_read32(VMX_INSTRUCTION_INFO), false, 5365 sizeof(*vmpointer), &gva)) { 5366 *ret = 1; 5367 return -EINVAL; 5368 } 5369 5370 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 5371 if (r != X86EMUL_CONTINUE) { 5372 *ret = kvm_handle_memory_failure(vcpu, r, &e); 5373 return -EINVAL; 5374 } 5375 5376 return 0; 5377 } 5378 5379 /* 5380 * Allocate a shadow VMCS and associate it with the currently loaded 5381 * VMCS, unless such a shadow VMCS already exists. The newly allocated 5382 * VMCS is also VMCLEARed, so that it is ready for use. 5383 */ 5384 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 5385 { 5386 struct vcpu_vmx *vmx = to_vmx(vcpu); 5387 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 5388 5389 /* 5390 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it 5391 * when L1 executes VMXOFF or the vCPU is forced out of nested 5392 * operation. VMXON faults if the CPU is already post-VMXON, so it 5393 * should be impossible to already have an allocated shadow VMCS. KVM 5394 * doesn't support virtualization of VMCS shadowing, so vmcs01 should 5395 * always be the loaded VMCS. 
5396 */ 5397 if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) 5398 return loaded_vmcs->shadow_vmcs; 5399 5400 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 5401 if (loaded_vmcs->shadow_vmcs) 5402 vmcs_clear(loaded_vmcs->shadow_vmcs); 5403 5404 return loaded_vmcs->shadow_vmcs; 5405 } 5406 5407 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 5408 { 5409 struct vcpu_vmx *vmx = to_vmx(vcpu); 5410 int r; 5411 5412 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 5413 if (r < 0) 5414 goto out_vmcs02; 5415 5416 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5417 if (!vmx->nested.cached_vmcs12) 5418 goto out_cached_vmcs12; 5419 5420 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; 5421 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5422 if (!vmx->nested.cached_shadow_vmcs12) 5423 goto out_cached_shadow_vmcs12; 5424 5425 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 5426 goto out_shadow_vmcs; 5427 5428 hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC, 5429 HRTIMER_MODE_ABS_PINNED); 5430 5431 vmx->nested.vpid02 = allocate_vpid(); 5432 5433 vmx->nested.vmcs02_initialized = false; 5434 vmx->nested.vmxon = true; 5435 5436 if (vmx_pt_mode_is_host_guest()) { 5437 vmx->pt_desc.guest.ctl = 0; 5438 pt_update_intercept_for_msr(vcpu); 5439 } 5440 5441 return 0; 5442 5443 out_shadow_vmcs: 5444 kfree(vmx->nested.cached_shadow_vmcs12); 5445 5446 out_cached_shadow_vmcs12: 5447 kfree(vmx->nested.cached_vmcs12); 5448 5449 out_cached_vmcs12: 5450 free_loaded_vmcs(&vmx->nested.vmcs02); 5451 5452 out_vmcs02: 5453 return -ENOMEM; 5454 } 5455 5456 /* Emulate the VMXON instruction. 
*/ 5457 static int handle_vmxon(struct kvm_vcpu *vcpu) 5458 { 5459 int ret; 5460 gpa_t vmptr; 5461 uint32_t revision; 5462 struct vcpu_vmx *vmx = to_vmx(vcpu); 5463 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 5464 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 5465 5466 /* 5467 * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter 5468 * the guest and so cannot rely on hardware to perform the check, 5469 * which has higher priority than VM-Exit (see Intel SDM's pseudocode 5470 * for VMXON). 5471 * 5472 * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 5473 * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't 5474 * force any of the relevant guest state. For a restricted guest, KVM 5475 * does force CR0.PE=1, but only to also force VM86 in order to emulate 5476 * Real Mode, and so there's no need to check CR0.PE manually. 5477 */ 5478 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { 5479 kvm_queue_exception(vcpu, UD_VECTOR); 5480 return 1; 5481 } 5482 5483 /* 5484 * The CPL is checked for "not in VMX operation" and for "in VMX root", 5485 * and has higher priority than the VM-Fail due to being post-VMXON, 5486 * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, 5487 * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits 5488 * from L2 to L1, i.e. there's no need to check for the vCPU being in 5489 * VMX non-root. 5490 * 5491 * Forwarding the VM-Exit unconditionally, i.e. without performing the 5492 * #UD checks (see above), is functionally ok because KVM doesn't allow 5493 * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's 5494 * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are 5495 * missed by hardware due to shadowing CR0 and/or CR4. 
5496 */ 5497 if (vmx_get_cpl(vcpu)) { 5498 kvm_inject_gp(vcpu, 0); 5499 return 1; 5500 } 5501 5502 if (vmx->nested.vmxon) 5503 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 5504 5505 /* 5506 * Invalid CR0/CR4 generates #GP. These checks are performed if and 5507 * only if the vCPU isn't already in VMX operation, i.e. effectively 5508 * have lower priority than the VM-Fail above. 5509 */ 5510 if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || 5511 !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { 5512 kvm_inject_gp(vcpu, 0); 5513 return 1; 5514 } 5515 5516 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 5517 != VMXON_NEEDED_FEATURES) { 5518 kvm_inject_gp(vcpu, 0); 5519 return 1; 5520 } 5521 5522 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 5523 return ret; 5524 5525 /* 5526 * SDM 3: 24.11.5 5527 * The first 4 bytes of VMXON region contain the supported 5528 * VMCS revision identifier 5529 * 5530 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 5531 * which replaces physical address width with 32 5532 */ 5533 if (!page_address_valid(vcpu, vmptr)) 5534 return nested_vmx_failInvalid(vcpu); 5535 5536 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 5537 revision != VMCS12_REVISION) 5538 return nested_vmx_failInvalid(vcpu); 5539 5540 vmx->nested.vmxon_ptr = vmptr; 5541 ret = enter_vmx_operation(vcpu); 5542 if (ret) 5543 return ret; 5544 5545 return nested_vmx_succeed(vcpu); 5546 } 5547 5548 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 5549 { 5550 struct vcpu_vmx *vmx = to_vmx(vcpu); 5551 5552 if (vmx->nested.current_vmptr == INVALID_GPA) 5553 return; 5554 5555 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 5556 5557 if (enable_shadow_vmcs) { 5558 /* copy to memory all shadowed fields in case 5559 they were modified */ 5560 copy_shadow_to_vmcs12(vmx); 5561 vmx_disable_shadow_vmcs(vmx); 5562 } 5563 vmx->nested.posted_intr_nv = -1; 5564 5565 /* Flush VMCS12 to guest memory */ 
5566 kvm_vcpu_write_guest_page(vcpu, 5567 vmx->nested.current_vmptr >> PAGE_SHIFT, 5568 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 5569 5570 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 5571 5572 vmx->nested.current_vmptr = INVALID_GPA; 5573 } 5574 5575 /* Emulate the VMXOFF instruction */ 5576 static int handle_vmxoff(struct kvm_vcpu *vcpu) 5577 { 5578 if (!nested_vmx_check_permission(vcpu)) 5579 return 1; 5580 5581 free_nested(vcpu); 5582 5583 if (kvm_apic_has_pending_init_or_sipi(vcpu)) 5584 kvm_make_request(KVM_REQ_EVENT, vcpu); 5585 5586 return nested_vmx_succeed(vcpu); 5587 } 5588 5589 /* Emulate the VMCLEAR instruction */ 5590 static int handle_vmclear(struct kvm_vcpu *vcpu) 5591 { 5592 struct vcpu_vmx *vmx = to_vmx(vcpu); 5593 u32 zero = 0; 5594 gpa_t vmptr; 5595 int r; 5596 5597 if (!nested_vmx_check_permission(vcpu)) 5598 return 1; 5599 5600 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5601 return r; 5602 5603 if (!page_address_valid(vcpu, vmptr)) 5604 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 5605 5606 if (vmptr == vmx->nested.vmxon_ptr) 5607 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 5608 5609 if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { 5610 if (vmptr == vmx->nested.current_vmptr) 5611 nested_release_vmcs12(vcpu); 5612 5613 /* 5614 * Silently ignore memory errors on VMCLEAR, Intel's pseudocode 5615 * for VMCLEAR includes a "ensure that data for VMCS referenced 5616 * by the operand is in memory" clause that guards writes to 5617 * memory, i.e. doing nothing for I/O is architecturally valid. 5618 * 5619 * FIXME: Suppress failures if and only if no memslot is found, 5620 * i.e. exit to userspace if __copy_to_user() fails. 
5621 */ 5622 (void)kvm_vcpu_write_guest(vcpu, 5623 vmptr + offsetof(struct vmcs12, 5624 launch_state), 5625 &zero, sizeof(zero)); 5626 } 5627 5628 return nested_vmx_succeed(vcpu); 5629 } 5630 5631 /* Emulate the VMLAUNCH instruction */ 5632 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 5633 { 5634 return nested_vmx_run(vcpu, true); 5635 } 5636 5637 /* Emulate the VMRESUME instruction */ 5638 static int handle_vmresume(struct kvm_vcpu *vcpu) 5639 { 5640 5641 return nested_vmx_run(vcpu, false); 5642 } 5643 5644 static int handle_vmread(struct kvm_vcpu *vcpu) 5645 { 5646 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5647 : get_vmcs12(vcpu); 5648 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5649 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5650 struct vcpu_vmx *vmx = to_vmx(vcpu); 5651 struct x86_exception e; 5652 unsigned long field; 5653 u64 value; 5654 gva_t gva = 0; 5655 short offset; 5656 int len, r; 5657 5658 if (!nested_vmx_check_permission(vcpu)) 5659 return 1; 5660 5661 /* Decode instruction info and find the field to read */ 5662 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5663 5664 if (!nested_vmx_is_evmptr12_valid(vmx)) { 5665 /* 5666 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5667 * any VMREAD sets the ALU flags for VMfailInvalid. 
5668 */ 5669 if (vmx->nested.current_vmptr == INVALID_GPA || 5670 (is_guest_mode(vcpu) && 5671 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5672 return nested_vmx_failInvalid(vcpu); 5673 5674 offset = get_vmcs12_field_offset(field); 5675 if (offset < 0) 5676 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5677 5678 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5679 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5680 5681 /* Read the field, zero-extended to a u64 value */ 5682 value = vmcs12_read_any(vmcs12, field, offset); 5683 } else { 5684 /* 5685 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an 5686 * enlightened VMCS is active VMREAD/VMWRITE instructions are 5687 * unsupported. Unfortunately, certain versions of Windows 11 5688 * don't comply with this requirement which is not enforced in 5689 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a 5690 * workaround, as misbehaving guests will panic on VM-Fail. 5691 * Note, enlightened VMCS is incompatible with shadow VMCS so 5692 * all VMREADs from L2 should go to L1. 5693 */ 5694 if (WARN_ON_ONCE(is_guest_mode(vcpu))) 5695 return nested_vmx_failInvalid(vcpu); 5696 5697 offset = evmcs_field_offset(field, NULL); 5698 if (offset < 0) 5699 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5700 5701 /* Read the field, zero-extended to a u64 value */ 5702 value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset); 5703 } 5704 5705 /* 5706 * Now copy part of this value to register or memory, as requested. 5707 * Note that the number of bits actually copied is 32 or 64 depending 5708 * on the guest's mode (32 or 64 bit), not on the given field's length. 5709 */ 5710 if (instr_info & BIT(10)) { 5711 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5712 } else { 5713 len = is_64_bit_mode(vcpu) ? 
8 : 4; 5714 if (get_vmx_mem_address(vcpu, exit_qualification, 5715 instr_info, true, len, &gva)) 5716 return 1; 5717 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5718 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5719 if (r != X86EMUL_CONTINUE) 5720 return kvm_handle_memory_failure(vcpu, r, &e); 5721 } 5722 5723 return nested_vmx_succeed(vcpu); 5724 } 5725 5726 static bool is_shadow_field_rw(unsigned long field) 5727 { 5728 switch (field) { 5729 #define SHADOW_FIELD_RW(x, y) case x: 5730 #include "vmcs_shadow_fields.h" 5731 return true; 5732 default: 5733 break; 5734 } 5735 return false; 5736 } 5737 5738 static bool is_shadow_field_ro(unsigned long field) 5739 { 5740 switch (field) { 5741 #define SHADOW_FIELD_RO(x, y) case x: 5742 #include "vmcs_shadow_fields.h" 5743 return true; 5744 default: 5745 break; 5746 } 5747 return false; 5748 } 5749 5750 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5751 { 5752 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5753 : get_vmcs12(vcpu); 5754 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5755 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5756 struct vcpu_vmx *vmx = to_vmx(vcpu); 5757 struct x86_exception e; 5758 unsigned long field; 5759 short offset; 5760 gva_t gva; 5761 int len, r; 5762 5763 /* 5764 * The value to write might be 32 or 64 bits, depending on L1's long 5765 * mode, and eventually we need to write that into a field of several 5766 * possible lengths. The code below first zero-extends the value to 64 5767 * bit (value), and then copies only the appropriate number of 5768 * bits into the vmcs12 field. 5769 */ 5770 u64 value = 0; 5771 5772 if (!nested_vmx_check_permission(vcpu)) 5773 return 1; 5774 5775 /* 5776 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5777 * any VMWRITE sets the ALU flags for VMfailInvalid. 
5778 */ 5779 if (vmx->nested.current_vmptr == INVALID_GPA || 5780 (is_guest_mode(vcpu) && 5781 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5782 return nested_vmx_failInvalid(vcpu); 5783 5784 if (instr_info & BIT(10)) 5785 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5786 else { 5787 len = is_64_bit_mode(vcpu) ? 8 : 4; 5788 if (get_vmx_mem_address(vcpu, exit_qualification, 5789 instr_info, false, len, &gva)) 5790 return 1; 5791 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5792 if (r != X86EMUL_CONTINUE) 5793 return kvm_handle_memory_failure(vcpu, r, &e); 5794 } 5795 5796 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5797 5798 offset = get_vmcs12_field_offset(field); 5799 if (offset < 0) 5800 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5801 5802 /* 5803 * If the vCPU supports "VMWRITE to any supported field in the 5804 * VMCS," then the "read-only" fields are actually read/write. 5805 */ 5806 if (vmcs_field_readonly(field) && 5807 !nested_cpu_has_vmwrite_any_field(vcpu)) 5808 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5809 5810 /* 5811 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5812 * vmcs12, else we may crush a field or consume a stale value. 5813 */ 5814 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5815 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5816 5817 /* 5818 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5819 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5820 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 5821 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5822 * from L1 will return a different value than VMREAD from L2 (L1 sees 5823 * the stripped down value, L2 sees the full value as stored by KVM). 
5824 */ 5825 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5826 value &= 0x1f0ff; 5827 5828 vmcs12_write_any(vmcs12, field, offset, value); 5829 5830 /* 5831 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5832 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5833 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5834 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5835 */ 5836 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5837 /* 5838 * L1 can read these fields without exiting, ensure the 5839 * shadow VMCS is up-to-date. 5840 */ 5841 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5842 preempt_disable(); 5843 vmcs_load(vmx->vmcs01.shadow_vmcs); 5844 5845 __vmcs_writel(field, value); 5846 5847 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5848 vmcs_load(vmx->loaded_vmcs->vmcs); 5849 preempt_enable(); 5850 } 5851 vmx->nested.dirty_vmcs12 = true; 5852 } 5853 5854 return nested_vmx_succeed(vcpu); 5855 } 5856 5857 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5858 { 5859 vmx->nested.current_vmptr = vmptr; 5860 if (enable_shadow_vmcs) { 5861 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5862 vmcs_write64(VMCS_LINK_POINTER, 5863 __pa(vmx->vmcs01.shadow_vmcs)); 5864 vmx->nested.need_vmcs12_to_shadow_sync = true; 5865 } 5866 vmx->nested.dirty_vmcs12 = true; 5867 vmx->nested.force_msr_bitmap_recalc = true; 5868 } 5869 5870 /* Emulate the VMPTRLD instruction */ 5871 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5872 { 5873 struct vcpu_vmx *vmx = to_vmx(vcpu); 5874 gpa_t vmptr; 5875 int r; 5876 5877 if (!nested_vmx_check_permission(vcpu)) 5878 return 1; 5879 5880 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5881 return r; 5882 5883 if (!page_address_valid(vcpu, vmptr)) 5884 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5885 5886 if (vmptr == vmx->nested.vmxon_ptr) 5887 return nested_vmx_fail(vcpu, 
VMXERR_VMPTRLD_VMXON_POINTER); 5888 5889 /* Forbid normal VMPTRLD if Enlightened version was used */ 5890 if (nested_vmx_is_evmptr12_valid(vmx)) 5891 return 1; 5892 5893 if (vmx->nested.current_vmptr != vmptr) { 5894 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; 5895 struct vmcs_hdr hdr; 5896 5897 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { 5898 /* 5899 * Reads from an unbacked page return all 1s, 5900 * which means that the 32 bits located at the 5901 * given physical address won't match the required 5902 * VMCS12_REVISION identifier. 5903 */ 5904 return nested_vmx_fail(vcpu, 5905 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5906 } 5907 5908 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 5909 offsetof(struct vmcs12, hdr), 5910 sizeof(hdr))) { 5911 return nested_vmx_fail(vcpu, 5912 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5913 } 5914 5915 if (hdr.revision_id != VMCS12_REVISION || 5916 (hdr.shadow_vmcs && 5917 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5918 return nested_vmx_fail(vcpu, 5919 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5920 } 5921 5922 nested_release_vmcs12(vcpu); 5923 5924 /* 5925 * Load VMCS12 from guest memory since it is not already 5926 * cached. 
5927 */ 5928 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5929 VMCS12_SIZE)) { 5930 return nested_vmx_fail(vcpu, 5931 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5932 } 5933 5934 set_current_vmptr(vmx, vmptr); 5935 } 5936 5937 return nested_vmx_succeed(vcpu); 5938 } 5939 5940 /* Emulate the VMPTRST instruction */ 5941 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5942 { 5943 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5944 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5945 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5946 struct x86_exception e; 5947 gva_t gva; 5948 int r; 5949 5950 if (!nested_vmx_check_permission(vcpu)) 5951 return 1; 5952 5953 if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) 5954 return 1; 5955 5956 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5957 true, sizeof(gpa_t), &gva)) 5958 return 1; 5959 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5960 r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, 5961 sizeof(gpa_t), &e); 5962 if (r != X86EMUL_CONTINUE) 5963 return kvm_handle_memory_failure(vcpu, r, &e); 5964 5965 return nested_vmx_succeed(vcpu); 5966 } 5967 5968 /* Emulate the INVEPT instruction */ 5969 static int handle_invept(struct kvm_vcpu *vcpu) 5970 { 5971 struct vcpu_vmx *vmx = to_vmx(vcpu); 5972 u32 vmx_instruction_info, types; 5973 unsigned long type, roots_to_free; 5974 struct kvm_mmu *mmu; 5975 gva_t gva; 5976 struct x86_exception e; 5977 struct { 5978 u64 eptp, gpa; 5979 } operand; 5980 int i, r, gpr_index; 5981 5982 if (!(vmx->nested.msrs.secondary_ctls_high & 5983 SECONDARY_EXEC_ENABLE_EPT) || 5984 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5985 kvm_queue_exception(vcpu, UD_VECTOR); 5986 return 1; 5987 } 5988 5989 if (!nested_vmx_check_permission(vcpu)) 5990 return 1; 5991 5992 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5993 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5994 type = 
kvm_register_read(vcpu, gpr_index); 5995 5996 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5997 5998 if (type >= 32 || !(types & (1 << type))) 5999 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6000 6001 /* According to the Intel VMX instruction reference, the memory 6002 * operand is read even if it isn't needed (e.g., for type==global) 6003 */ 6004 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 6005 vmx_instruction_info, false, sizeof(operand), &gva)) 6006 return 1; 6007 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 6008 if (r != X86EMUL_CONTINUE) 6009 return kvm_handle_memory_failure(vcpu, r, &e); 6010 6011 /* 6012 * Nested EPT roots are always held through guest_mmu, 6013 * not root_mmu. 6014 */ 6015 mmu = &vcpu->arch.guest_mmu; 6016 6017 switch (type) { 6018 case VMX_EPT_EXTENT_CONTEXT: 6019 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 6020 return nested_vmx_fail(vcpu, 6021 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6022 6023 roots_to_free = 0; 6024 if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, 6025 operand.eptp)) 6026 roots_to_free |= KVM_MMU_ROOT_CURRENT; 6027 6028 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 6029 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 6030 mmu->prev_roots[i].pgd, 6031 operand.eptp)) 6032 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 6033 } 6034 break; 6035 case VMX_EPT_EXTENT_GLOBAL: 6036 roots_to_free = KVM_MMU_ROOTS_ALL; 6037 break; 6038 default: 6039 BUG(); 6040 break; 6041 } 6042 6043 if (roots_to_free) 6044 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); 6045 6046 return nested_vmx_succeed(vcpu); 6047 } 6048 6049 static int handle_invvpid(struct kvm_vcpu *vcpu) 6050 { 6051 struct vcpu_vmx *vmx = to_vmx(vcpu); 6052 u32 vmx_instruction_info; 6053 unsigned long type, types; 6054 gva_t gva; 6055 struct x86_exception e; 6056 struct { 6057 u64 vpid; 6058 u64 gla; 6059 } operand; 6060 u16 vpid02; 6061 int r, gpr_index; 6062 6063 if 
(!(vmx->nested.msrs.secondary_ctls_high & 6064 SECONDARY_EXEC_ENABLE_VPID) || 6065 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 6066 kvm_queue_exception(vcpu, UD_VECTOR); 6067 return 1; 6068 } 6069 6070 if (!nested_vmx_check_permission(vcpu)) 6071 return 1; 6072 6073 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6074 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 6075 type = kvm_register_read(vcpu, gpr_index); 6076 6077 types = (vmx->nested.msrs.vpid_caps & 6078 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 6079 6080 if (type >= 32 || !(types & (1 << type))) 6081 return nested_vmx_fail(vcpu, 6082 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6083 6084 /* according to the intel vmx instruction reference, the memory 6085 * operand is read even if it isn't needed (e.g., for type==global) 6086 */ 6087 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 6088 vmx_instruction_info, false, sizeof(operand), &gva)) 6089 return 1; 6090 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 6091 if (r != X86EMUL_CONTINUE) 6092 return kvm_handle_memory_failure(vcpu, r, &e); 6093 6094 if (operand.vpid >> 16) 6095 return nested_vmx_fail(vcpu, 6096 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6097 6098 /* 6099 * Always flush the effective vpid02, i.e. never flush the current VPID 6100 * and never explicitly flush vpid01. INVVPID targets a VPID, not a 6101 * VMCS, and so whether or not the current vmcs12 has VPID enabled is 6102 * irrelevant (and there may not be a loaded vmcs12). 6103 */ 6104 vpid02 = nested_get_vpid02(vcpu); 6105 switch (type) { 6106 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 6107 /* 6108 * LAM doesn't apply to addresses that are inputs to TLB 6109 * invalidation. 
6110 */ 6111 if (!operand.vpid || 6112 is_noncanonical_invlpg_address(operand.gla, vcpu)) 6113 return nested_vmx_fail(vcpu, 6114 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6115 vpid_sync_vcpu_addr(vpid02, operand.gla); 6116 break; 6117 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 6118 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 6119 if (!operand.vpid) 6120 return nested_vmx_fail(vcpu, 6121 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6122 vpid_sync_context(vpid02); 6123 break; 6124 case VMX_VPID_EXTENT_ALL_CONTEXT: 6125 vpid_sync_context(vpid02); 6126 break; 6127 default: 6128 WARN_ON_ONCE(1); 6129 return kvm_skip_emulated_instruction(vcpu); 6130 } 6131 6132 /* 6133 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 6134 * linear mappings for L2 (tagged with L2's VPID). Free all guest 6135 * roots as VPIDs are not tracked in the MMU role. 6136 * 6137 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 6138 * an MMU when EPT is disabled. 6139 * 6140 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 
6141 */ 6142 if (!enable_ept) 6143 kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); 6144 6145 return nested_vmx_succeed(vcpu); 6146 } 6147 6148 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 6149 struct vmcs12 *vmcs12) 6150 { 6151 u32 index = kvm_rcx_read(vcpu); 6152 u64 new_eptp; 6153 6154 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 6155 return 1; 6156 if (index >= VMFUNC_EPTP_ENTRIES) 6157 return 1; 6158 6159 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 6160 &new_eptp, index * 8, 8)) 6161 return 1; 6162 6163 /* 6164 * If the (L2) guest does a vmfunc to the currently 6165 * active ept pointer, we don't have to do anything else 6166 */ 6167 if (vmcs12->ept_pointer != new_eptp) { 6168 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 6169 return 1; 6170 6171 vmcs12->ept_pointer = new_eptp; 6172 nested_ept_new_eptp(vcpu); 6173 6174 if (!nested_cpu_has_vpid(vmcs12)) 6175 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 6176 } 6177 6178 return 0; 6179 } 6180 6181 static int handle_vmfunc(struct kvm_vcpu *vcpu) 6182 { 6183 struct vcpu_vmx *vmx = to_vmx(vcpu); 6184 struct vmcs12 *vmcs12; 6185 u32 function = kvm_rax_read(vcpu); 6186 6187 /* 6188 * VMFUNC should never execute cleanly while L1 is active; KVM supports 6189 * VMFUNC for nested VMs, but not for L1. 6190 */ 6191 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { 6192 kvm_queue_exception(vcpu, UD_VECTOR); 6193 return 1; 6194 } 6195 6196 vmcs12 = get_vmcs12(vcpu); 6197 6198 /* 6199 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 6200 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 
6201 */ 6202 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 6203 kvm_queue_exception(vcpu, UD_VECTOR); 6204 return 1; 6205 } 6206 6207 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 6208 goto fail; 6209 6210 switch (function) { 6211 case 0: 6212 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 6213 goto fail; 6214 break; 6215 default: 6216 goto fail; 6217 } 6218 return kvm_skip_emulated_instruction(vcpu); 6219 6220 fail: 6221 /* 6222 * This is effectively a reflected VM-Exit, as opposed to a synthesized 6223 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 6224 * EXIT_REASON_VMFUNC as the exit reason. 6225 */ 6226 nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full, 6227 vmx_get_intr_info(vcpu), 6228 vmx_get_exit_qual(vcpu)); 6229 return 1; 6230 } 6231 6232 /* 6233 * Return true if an IO instruction with the specified port and size should cause 6234 * a VM-exit into L1. 6235 */ 6236 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 6237 int size) 6238 { 6239 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6240 gpa_t bitmap, last_bitmap; 6241 u8 b; 6242 6243 last_bitmap = INVALID_GPA; 6244 b = -1; 6245 6246 while (size > 0) { 6247 if (port < 0x8000) 6248 bitmap = vmcs12->io_bitmap_a; 6249 else if (port < 0x10000) 6250 bitmap = vmcs12->io_bitmap_b; 6251 else 6252 return true; 6253 bitmap += (port & 0x7fff) / 8; 6254 6255 if (last_bitmap != bitmap) 6256 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 6257 return true; 6258 if (b & (1 << (port & 7))) 6259 return true; 6260 6261 port++; 6262 size--; 6263 last_bitmap = bitmap; 6264 } 6265 6266 return false; 6267 } 6268 6269 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 6270 struct vmcs12 *vmcs12) 6271 { 6272 unsigned long exit_qualification; 6273 unsigned short port; 6274 int size; 6275 6276 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 6277 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 6278 6279 exit_qualification = 
vmx_get_exit_qual(vcpu); 6280 6281 port = exit_qualification >> 16; 6282 size = (exit_qualification & 7) + 1; 6283 6284 return nested_vmx_check_io_bitmaps(vcpu, port, size); 6285 } 6286 6287 /* 6288 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 6289 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 6290 * disinterest in the current event (read or write a specific MSR) by using an 6291 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 6292 */ 6293 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 6294 struct vmcs12 *vmcs12, 6295 union vmx_exit_reason exit_reason) 6296 { 6297 u32 msr_index; 6298 gpa_t bitmap; 6299 6300 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 6301 return true; 6302 6303 if (exit_reason.basic == EXIT_REASON_MSR_READ_IMM || 6304 exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) 6305 msr_index = vmx_get_exit_qual(vcpu); 6306 else 6307 msr_index = kvm_rcx_read(vcpu); 6308 6309 /* 6310 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 6311 * for the four combinations of read/write and low/high MSR numbers. 6312 * First we need to figure out which of the four to use: 6313 */ 6314 bitmap = vmcs12->msr_bitmap; 6315 if (exit_reason.basic == EXIT_REASON_MSR_WRITE || 6316 exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) 6317 bitmap += 2048; 6318 if (msr_index >= 0xc0000000) { 6319 msr_index -= 0xc0000000; 6320 bitmap += 1024; 6321 } 6322 6323 /* Then read the msr_index'th bit from this bitmap: */ 6324 if (msr_index < 1024*8) { 6325 unsigned char b; 6326 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 6327 return true; 6328 return 1 & (b >> (msr_index & 7)); 6329 } else 6330 return true; /* let L1 handle the wrong parameter */ 6331 } 6332 6333 /* 6334 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 6335 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 6336 * intercept (via guest_host_mask etc.) 
the current event. 6337 */ 6338 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 6339 struct vmcs12 *vmcs12) 6340 { 6341 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 6342 int cr = exit_qualification & 15; 6343 int reg; 6344 unsigned long val; 6345 6346 switch ((exit_qualification >> 4) & 3) { 6347 case 0: /* mov to cr */ 6348 reg = (exit_qualification >> 8) & 15; 6349 val = kvm_register_read(vcpu, reg); 6350 switch (cr) { 6351 case 0: 6352 if (vmcs12->cr0_guest_host_mask & 6353 (val ^ vmcs12->cr0_read_shadow)) 6354 return true; 6355 break; 6356 case 3: 6357 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 6358 return true; 6359 break; 6360 case 4: 6361 if (vmcs12->cr4_guest_host_mask & 6362 (vmcs12->cr4_read_shadow ^ val)) 6363 return true; 6364 break; 6365 case 8: 6366 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 6367 return true; 6368 break; 6369 } 6370 break; 6371 case 2: /* clts */ 6372 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 6373 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 6374 return true; 6375 break; 6376 case 1: /* mov from cr */ 6377 switch (cr) { 6378 case 3: 6379 if (vmcs12->cpu_based_vm_exec_control & 6380 CPU_BASED_CR3_STORE_EXITING) 6381 return true; 6382 break; 6383 case 8: 6384 if (vmcs12->cpu_based_vm_exec_control & 6385 CPU_BASED_CR8_STORE_EXITING) 6386 return true; 6387 break; 6388 } 6389 break; 6390 case 3: /* lmsw */ 6391 /* 6392 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 6393 * cr0. Other attempted changes are ignored, with no exit. 
6394 */ 6395 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 6396 if (vmcs12->cr0_guest_host_mask & 0xe & 6397 (val ^ vmcs12->cr0_read_shadow)) 6398 return true; 6399 if ((vmcs12->cr0_guest_host_mask & 0x1) && 6400 !(vmcs12->cr0_read_shadow & 0x1) && 6401 (val & 0x1)) 6402 return true; 6403 break; 6404 } 6405 return false; 6406 } 6407 6408 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 6409 struct vmcs12 *vmcs12) 6410 { 6411 u32 encls_leaf; 6412 6413 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) || 6414 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 6415 return false; 6416 6417 encls_leaf = kvm_rax_read(vcpu); 6418 if (encls_leaf > 62) 6419 encls_leaf = 63; 6420 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 6421 } 6422 6423 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 6424 struct vmcs12 *vmcs12, gpa_t bitmap) 6425 { 6426 u32 vmx_instruction_info; 6427 unsigned long field; 6428 u8 b; 6429 6430 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 6431 return true; 6432 6433 /* Decode instruction info and find the field to access */ 6434 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6435 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 6436 6437 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 6438 if (field >> 15) 6439 return true; 6440 6441 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 6442 return true; 6443 6444 return 1 & (b >> (field & 7)); 6445 } 6446 6447 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 6448 { 6449 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 6450 6451 if (nested_cpu_has_mtf(vmcs12)) 6452 return true; 6453 6454 /* 6455 * An MTF VM-exit may be injected into the guest by setting the 6456 * interruption-type to 7 (other event) and the vector field to 0. Such 6457 * is the case regardless of the 'monitor trap flag' VM-execution 6458 * control. 
6459 */ 6460 return entry_intr_info == (INTR_INFO_VALID_MASK 6461 | INTR_TYPE_OTHER_EVENT); 6462 } 6463 6464 /* 6465 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 6466 * L1 wants the exit. Only call this when in is_guest_mode (L2). 6467 */ 6468 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 6469 union vmx_exit_reason exit_reason) 6470 { 6471 u32 intr_info; 6472 6473 switch ((u16)exit_reason.basic) { 6474 case EXIT_REASON_EXCEPTION_NMI: 6475 intr_info = vmx_get_intr_info(vcpu); 6476 if (is_nmi(intr_info)) 6477 return true; 6478 else if (is_page_fault(intr_info)) 6479 return vcpu->arch.apf.host_apf_flags || 6480 vmx_need_pf_intercept(vcpu); 6481 else if (is_debug(intr_info) && 6482 vcpu->guest_debug & 6483 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 6484 return true; 6485 else if (is_breakpoint(intr_info) && 6486 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 6487 return true; 6488 else if (is_alignment_check(intr_info) && 6489 !vmx_guest_inject_ac(vcpu)) 6490 return true; 6491 else if (is_ve_fault(intr_info)) 6492 return true; 6493 return false; 6494 case EXIT_REASON_EXTERNAL_INTERRUPT: 6495 return true; 6496 case EXIT_REASON_MCE_DURING_VMENTRY: 6497 return true; 6498 case EXIT_REASON_EPT_VIOLATION: 6499 /* 6500 * L0 always deals with the EPT violation. If nested EPT is 6501 * used, and the nested mmu code discovers that the address is 6502 * missing in the guest EPT table (EPT12), the EPT violation 6503 * will be injected with nested_ept_inject_page_fault() 6504 */ 6505 return true; 6506 case EXIT_REASON_EPT_MISCONFIG: 6507 /* 6508 * L2 never uses directly L1's EPT, but rather L0's own EPT 6509 * table (shadow on EPT) or a merged EPT table that L0 built 6510 * (EPT on EPT). So any problems with the structure of the 6511 * table is L0's fault. 
6512 */ 6513 return true; 6514 case EXIT_REASON_PREEMPTION_TIMER: 6515 return true; 6516 case EXIT_REASON_PML_FULL: 6517 /* 6518 * PML is emulated for an L1 VMM and should never be enabled in 6519 * vmcs02, always "handle" PML_FULL by exiting to userspace. 6520 */ 6521 return true; 6522 case EXIT_REASON_VMFUNC: 6523 /* VM functions are emulated through L2->L0 vmexits. */ 6524 return true; 6525 case EXIT_REASON_BUS_LOCK: 6526 /* 6527 * At present, bus lock VM exit is never exposed to L1. 6528 * Handle L2's bus locks in L0 directly. 6529 */ 6530 return true; 6531 #ifdef CONFIG_KVM_HYPERV 6532 case EXIT_REASON_VMCALL: 6533 /* Hyper-V L2 TLB flush hypercall is handled by L0 */ 6534 return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && 6535 nested_evmcs_l2_tlb_flush_enabled(vcpu) && 6536 kvm_hv_is_tlb_flush_hcall(vcpu); 6537 #endif 6538 default: 6539 break; 6540 } 6541 return false; 6542 } 6543 6544 /* 6545 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 6546 * is_guest_mode (L2). 
6547 */ 6548 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 6549 union vmx_exit_reason exit_reason) 6550 { 6551 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6552 u32 intr_info; 6553 6554 switch ((u16)exit_reason.basic) { 6555 case EXIT_REASON_EXCEPTION_NMI: 6556 intr_info = vmx_get_intr_info(vcpu); 6557 if (is_nmi(intr_info)) 6558 return true; 6559 else if (is_page_fault(intr_info)) 6560 return true; 6561 return vmcs12->exception_bitmap & 6562 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 6563 case EXIT_REASON_EXTERNAL_INTERRUPT: 6564 return nested_exit_on_intr(vcpu); 6565 case EXIT_REASON_TRIPLE_FAULT: 6566 return true; 6567 case EXIT_REASON_INTERRUPT_WINDOW: 6568 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 6569 case EXIT_REASON_NMI_WINDOW: 6570 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 6571 case EXIT_REASON_TASK_SWITCH: 6572 return true; 6573 case EXIT_REASON_CPUID: 6574 return true; 6575 case EXIT_REASON_HLT: 6576 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 6577 case EXIT_REASON_INVD: 6578 return true; 6579 case EXIT_REASON_INVLPG: 6580 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6581 case EXIT_REASON_RDPMC: 6582 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 6583 case EXIT_REASON_RDRAND: 6584 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 6585 case EXIT_REASON_RDSEED: 6586 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 6587 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 6588 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 6589 case EXIT_REASON_VMREAD: 6590 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6591 vmcs12->vmread_bitmap); 6592 case EXIT_REASON_VMWRITE: 6593 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6594 vmcs12->vmwrite_bitmap); 6595 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 6596 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 6597 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 6598 case 
EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6599 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 6600 /* 6601 * VMX instructions trap unconditionally. This allows L1 to 6602 * emulate them for its L2 guest, i.e., allows 3-level nesting! 6603 */ 6604 return true; 6605 case EXIT_REASON_CR_ACCESS: 6606 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 6607 case EXIT_REASON_DR_ACCESS: 6608 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 6609 case EXIT_REASON_IO_INSTRUCTION: 6610 return nested_vmx_exit_handled_io(vcpu, vmcs12); 6611 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 6612 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 6613 case EXIT_REASON_MSR_READ: 6614 case EXIT_REASON_MSR_WRITE: 6615 case EXIT_REASON_MSR_READ_IMM: 6616 case EXIT_REASON_MSR_WRITE_IMM: 6617 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 6618 case EXIT_REASON_INVALID_STATE: 6619 return true; 6620 case EXIT_REASON_MWAIT_INSTRUCTION: 6621 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 6622 case EXIT_REASON_MONITOR_TRAP_FLAG: 6623 return nested_vmx_exit_handled_mtf(vmcs12); 6624 case EXIT_REASON_MONITOR_INSTRUCTION: 6625 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 6626 case EXIT_REASON_PAUSE_INSTRUCTION: 6627 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 6628 nested_cpu_has2(vmcs12, 6629 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 6630 case EXIT_REASON_MCE_DURING_VMENTRY: 6631 return true; 6632 case EXIT_REASON_TPR_BELOW_THRESHOLD: 6633 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 6634 case EXIT_REASON_APIC_ACCESS: 6635 case EXIT_REASON_APIC_WRITE: 6636 case EXIT_REASON_EOI_INDUCED: 6637 /* 6638 * The controls for "virtualize APIC accesses," "APIC- 6639 * register virtualization," and "virtual-interrupt 6640 * delivery" only come from vmcs12. 
6641 */ 6642 return true; 6643 case EXIT_REASON_INVPCID: 6644 return 6645 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 6646 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6647 case EXIT_REASON_WBINVD: 6648 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6649 case EXIT_REASON_XSETBV: 6650 return true; 6651 case EXIT_REASON_XSAVES: 6652 case EXIT_REASON_XRSTORS: 6653 /* 6654 * Always forward XSAVES/XRSTORS to L1 as KVM doesn't utilize 6655 * XSS-bitmap, and always loads vmcs02 with vmcs12's XSS-bitmap 6656 * verbatim, i.e. any exit is due to L1's bitmap. WARN if 6657 * XSAVES isn't enabled, as the CPU is supposed to inject #UD 6658 * in that case, before consulting the XSS-bitmap. 6659 */ 6660 WARN_ON_ONCE(!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES)); 6661 return true; 6662 case EXIT_REASON_UMWAIT: 6663 case EXIT_REASON_TPAUSE: 6664 return nested_cpu_has2(vmcs12, 6665 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 6666 case EXIT_REASON_ENCLS: 6667 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 6668 case EXIT_REASON_NOTIFY: 6669 /* Notify VM exit is not exposed to L1 */ 6670 return false; 6671 case EXIT_REASON_SEAMCALL: 6672 case EXIT_REASON_TDCALL: 6673 /* 6674 * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't 6675 * virtualized by KVM for L1 hypervisors, i.e. L1 should 6676 * never want or expect such an exit. 6677 */ 6678 return false; 6679 default: 6680 return true; 6681 } 6682 } 6683 6684 /* 6685 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 6686 * reflected into L1. 6687 */ 6688 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 6689 { 6690 struct vcpu_vmx *vmx = to_vmx(vcpu); 6691 union vmx_exit_reason exit_reason = vmx->vt.exit_reason; 6692 unsigned long exit_qual; 6693 u32 exit_intr_info; 6694 6695 kvm_warn_on_nested_run_pending(vcpu); 6696 6697 /* 6698 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 6699 * has already loaded L2's state. 
6700 */ 6701 if (unlikely(vmx->fail)) { 6702 trace_kvm_nested_vmenter_failed( 6703 "hardware VM-instruction error: ", 6704 vmcs_read32(VM_INSTRUCTION_ERROR)); 6705 exit_intr_info = 0; 6706 exit_qual = 0; 6707 goto reflect_vmexit; 6708 } 6709 6710 trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); 6711 6712 /* If L0 (KVM) wants the exit, it trumps L1's desires. */ 6713 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6714 return false; 6715 6716 /* If L1 doesn't want the exit, handle it in L0. */ 6717 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6718 return false; 6719 6720 /* 6721 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6722 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6723 * need to be synthesized by querying the in-kernel LAPIC, but external 6724 * interrupts are never reflected to L1 so it's a non-issue. 6725 */ 6726 exit_intr_info = vmx_get_intr_info(vcpu); 6727 if (is_exception_with_error_code(exit_intr_info)) { 6728 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6729 6730 vmcs12->vm_exit_intr_error_code = 6731 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6732 } 6733 exit_qual = vmx_get_exit_qual(vcpu); 6734 6735 reflect_vmexit: 6736 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6737 return true; 6738 } 6739 6740 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6741 struct kvm_nested_state __user *user_kvm_nested_state, 6742 u32 user_data_size) 6743 { 6744 struct vcpu_vmx *vmx; 6745 struct vmcs12 *vmcs12; 6746 struct kvm_nested_state kvm_state = { 6747 .flags = 0, 6748 .format = KVM_STATE_NESTED_FORMAT_VMX, 6749 .size = sizeof(kvm_state), 6750 .hdr.vmx.flags = 0, 6751 .hdr.vmx.vmxon_pa = INVALID_GPA, 6752 .hdr.vmx.vmcs12_pa = INVALID_GPA, 6753 .hdr.vmx.preemption_timer_deadline = 0, 6754 }; 6755 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6756 &user_kvm_nested_state->data.vmx[0]; 6757 6758 if (!vcpu) 6759 return kvm_state.size + sizeof(*user_vmx_nested_state); 6760 6761 vmx 
= to_vmx(vcpu); 6762 vmcs12 = get_vmcs12(vcpu); 6763 6764 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) && 6765 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6766 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6767 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6768 6769 if (vmx_has_valid_vmcs12(vcpu)) { 6770 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6771 6772 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ 6773 if (nested_vmx_is_evmptr12_set(vmx)) 6774 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6775 6776 if (is_guest_mode(vcpu) && 6777 nested_cpu_has_shadow_vmcs(vmcs12) && 6778 vmcs12->vmcs_link_pointer != INVALID_GPA) 6779 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6780 } 6781 6782 if (vmx->nested.smm.vmxon) 6783 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6784 6785 if (vmx->nested.smm.guest_mode) 6786 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6787 6788 if (is_guest_mode(vcpu)) { 6789 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6790 6791 if (vcpu->arch.nested_run_pending) 6792 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6793 6794 if (vmx->nested.mtf_pending) 6795 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6796 6797 if (nested_cpu_has_preemption_timer(vmcs12) && 6798 vmx->nested.has_preemption_timer_deadline) { 6799 kvm_state.hdr.vmx.flags |= 6800 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6801 kvm_state.hdr.vmx.preemption_timer_deadline = 6802 vmx->nested.preemption_timer_deadline; 6803 } 6804 } 6805 } 6806 6807 if (user_data_size < kvm_state.size) 6808 goto out; 6809 6810 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6811 return -EFAULT; 6812 6813 if (!vmx_has_valid_vmcs12(vcpu)) 6814 goto out; 6815 6816 /* 6817 * When running L2, the authoritative vmcs12 state is in the 6818 * vmcs02. 
When running L1, the authoritative vmcs12 state is 6819 * in the shadow or enlightened vmcs linked to vmcs01, unless 6820 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 6821 * vmcs12 state is in the vmcs12 already. 6822 */ 6823 if (is_guest_mode(vcpu)) { 6824 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 6825 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 6826 } else { 6827 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 6828 if (!vmx->nested.need_vmcs12_to_shadow_sync) { 6829 if (nested_vmx_is_evmptr12_valid(vmx)) 6830 /* 6831 * L1 hypervisor is not obliged to keep eVMCS 6832 * clean fields data always up-to-date while 6833 * not in guest mode, 'hv_clean_fields' is only 6834 * supposed to be actual upon vmentry so we need 6835 * to ignore it here and do full copy. 6836 */ 6837 copy_enlightened_to_vmcs12(vmx, 0); 6838 else if (enable_shadow_vmcs) 6839 copy_shadow_to_vmcs12(vmx); 6840 } 6841 } 6842 6843 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6844 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6845 6846 /* 6847 * Copy over the full allocated size of vmcs12 rather than just the size 6848 * of the struct. 
6849 */ 6850 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6851 return -EFAULT; 6852 6853 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6854 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6855 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6856 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6857 return -EFAULT; 6858 } 6859 out: 6860 return kvm_state.size; 6861 } 6862 6863 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6864 { 6865 if (is_guest_mode(vcpu)) { 6866 vcpu->arch.nested_run_pending = 0; 6867 nested_vmx_vmexit(vcpu, -1, 0, 0); 6868 } 6869 free_nested(vcpu); 6870 } 6871 6872 int nested_vmx_check_restored_vmcs12(struct kvm_vcpu *vcpu) 6873 { 6874 enum vm_entry_failure_code ignored; 6875 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6876 6877 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6878 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6879 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 6880 6881 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || 6882 !shadow_vmcs12->hdr.shadow_vmcs) 6883 return -EINVAL; 6884 } 6885 6886 if (nested_vmx_check_controls(vcpu, vmcs12) || 6887 nested_vmx_check_host_state(vcpu, vmcs12) || 6888 nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) 6889 return -EINVAL; 6890 6891 return 0; 6892 } 6893 6894 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6895 struct kvm_nested_state __user *user_kvm_nested_state, 6896 struct kvm_nested_state *kvm_state) 6897 { 6898 struct vcpu_vmx *vmx = to_vmx(vcpu); 6899 struct vmcs12 *vmcs12; 6900 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6901 &user_kvm_nested_state->data.vmx[0]; 6902 int ret; 6903 6904 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6905 return -EINVAL; 6906 6907 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { 6908 if (kvm_state->hdr.vmx.smm.flags) 6909 return -EINVAL; 6910 6911 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) 6912 return -EINVAL; 6913 6914 /* 6915 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 
6916 * enable eVMCS capability on vCPU. However, since then 6917 * code was changed such that flag signals vmcs12 should 6918 * be copied into eVMCS in guest memory. 6919 * 6920 * To preserve backwards compatibility, allow user 6921 * to set this flag even when there is no VMXON region. 6922 */ 6923 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 6924 return -EINVAL; 6925 } else { 6926 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 6927 return -EINVAL; 6928 6929 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 6930 return -EINVAL; 6931 } 6932 6933 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6934 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6935 return -EINVAL; 6936 6937 if (kvm_state->hdr.vmx.smm.flags & 6938 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 6939 return -EINVAL; 6940 6941 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) 6942 return -EINVAL; 6943 6944 /* 6945 * SMM temporarily disables VMX, so we cannot be in guest mode, 6946 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 6947 * must be zero. 6948 */ 6949 if (is_smm(vcpu) ? 
6950 (kvm_state->flags & 6951 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 6952 : kvm_state->hdr.vmx.smm.flags) 6953 return -EINVAL; 6954 6955 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6956 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 6957 return -EINVAL; 6958 6959 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 6960 (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) || 6961 !vmx->nested.enlightened_vmcs_enabled)) 6962 return -EINVAL; 6963 6964 vmx_leave_nested(vcpu); 6965 6966 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) 6967 return 0; 6968 6969 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 6970 ret = enter_vmx_operation(vcpu); 6971 if (ret) 6972 return ret; 6973 6974 /* Empty 'VMXON' state is permitted if no VMCS loaded */ 6975 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { 6976 /* See vmx_has_valid_vmcs12. */ 6977 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || 6978 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || 6979 (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) 6980 return -EINVAL; 6981 else 6982 return 0; 6983 } 6984 6985 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { 6986 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 6987 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 6988 return -EINVAL; 6989 6990 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 6991 #ifdef CONFIG_KVM_HYPERV 6992 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 6993 /* 6994 * nested_vmx_handle_enlightened_vmptrld() cannot be called 6995 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be 6996 * restored yet. EVMCS will be mapped from 6997 * nested_get_vmcs12_pages(). 
6998 */ 6999 vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; 7000 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 7001 #endif 7002 } else { 7003 return -EINVAL; 7004 } 7005 7006 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { 7007 vmx->nested.smm.vmxon = true; 7008 vmx->nested.vmxon = false; 7009 7010 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) 7011 vmx->nested.smm.guest_mode = true; 7012 } 7013 7014 vmcs12 = get_vmcs12(vcpu); 7015 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) 7016 return -EFAULT; 7017 7018 if (vmcs12->hdr.revision_id != VMCS12_REVISION) 7019 return -EINVAL; 7020 7021 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 7022 return 0; 7023 7024 if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) 7025 vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED; 7026 else 7027 vcpu->arch.nested_run_pending = 0; 7028 7029 vmx->nested.mtf_pending = 7030 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); 7031 7032 if (nested_cpu_has_shadow_vmcs(vmcs12) && 7033 vmcs12->vmcs_link_pointer != INVALID_GPA) { 7034 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 7035 7036 ret = -EINVAL; 7037 if (kvm_state->size < 7038 sizeof(*kvm_state) + 7039 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) 7040 goto error_guest_mode; 7041 7042 ret = -EFAULT; 7043 if (copy_from_user(shadow_vmcs12, 7044 user_vmx_nested_state->shadow_vmcs12, 7045 sizeof(*shadow_vmcs12))) 7046 goto error_guest_mode; 7047 } 7048 7049 vmx->nested.has_preemption_timer_deadline = false; 7050 if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { 7051 vmx->nested.has_preemption_timer_deadline = true; 7052 vmx->nested.preemption_timer_deadline = 7053 kvm_state->hdr.vmx.preemption_timer_deadline; 7054 } 7055 7056 ret = nested_vmx_check_restored_vmcs12(vcpu); 7057 if (ret < 0) 7058 goto error_guest_mode; 7059 7060 vmx->nested.dirty_vmcs12 = true; 7061 
vmx->nested.force_msr_bitmap_recalc = true; 7062 ret = nested_vmx_enter_non_root_mode(vcpu, false); 7063 if (ret) 7064 goto error_guest_mode; 7065 7066 if (vmx->nested.mtf_pending) 7067 kvm_make_request(KVM_REQ_EVENT, vcpu); 7068 7069 return 0; 7070 7071 error_guest_mode: 7072 vcpu->arch.nested_run_pending = 0; 7073 return ret; 7074 } 7075 7076 void nested_vmx_set_vmcs_shadowing_bitmap(void) 7077 { 7078 if (enable_shadow_vmcs) { 7079 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 7080 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 7081 } 7082 } 7083 7084 static u64 nested_vmx_calc_vmcs_enum_msr(void) 7085 { 7086 /* 7087 * Note these are the so called "index" of the VMCS field encoding, not 7088 * the index into vmcs12. 7089 */ 7090 unsigned int max_idx, idx; 7091 int i; 7092 7093 /* 7094 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in 7095 * vmcs12, regardless of whether or not the associated feature is 7096 * exposed to L1. Simply find the field with the highest index. 7097 */ 7098 max_idx = 0; 7099 for (i = 0; i < nr_vmcs12_fields; i++) { 7100 /* The vmcs12 table is very, very sparsely populated. */ 7101 if (!vmcs12_field_offsets[i]) 7102 continue; 7103 7104 idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); 7105 if (idx > max_idx) 7106 max_idx = idx; 7107 } 7108 7109 return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; 7110 } 7111 7112 static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf, 7113 struct nested_vmx_msrs *msrs) 7114 { 7115 msrs->pinbased_ctls_low = 7116 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 7117 7118 msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl; 7119 msrs->pinbased_ctls_high &= 7120 PIN_BASED_EXT_INTR_MASK | 7121 PIN_BASED_NMI_EXITING | 7122 PIN_BASED_VIRTUAL_NMIS | 7123 (enable_apicv ? 
PIN_BASED_POSTED_INTR : 0); 7124 msrs->pinbased_ctls_high |= 7125 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 7126 PIN_BASED_VMX_PREEMPTION_TIMER; 7127 } 7128 7129 static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf, 7130 struct nested_vmx_msrs *msrs) 7131 { 7132 msrs->exit_ctls_low = 7133 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 7134 7135 msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl; 7136 msrs->exit_ctls_high &= 7137 #ifdef CONFIG_X86_64 7138 VM_EXIT_HOST_ADDR_SPACE_SIZE | 7139 #endif 7140 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | 7141 VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_CET_STATE; 7142 msrs->exit_ctls_high |= 7143 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 7144 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 7145 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT | 7146 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 7147 7148 if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && 7149 !kvm_cpu_cap_has(X86_FEATURE_IBT)) 7150 msrs->exit_ctls_high &= ~VM_EXIT_LOAD_CET_STATE; 7151 7152 /* We support free control of debug control saving. */ 7153 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; 7154 } 7155 7156 static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf, 7157 struct nested_vmx_msrs *msrs) 7158 { 7159 msrs->entry_ctls_low = 7160 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 7161 7162 msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl; 7163 msrs->entry_ctls_high &= 7164 #ifdef CONFIG_X86_64 7165 VM_ENTRY_IA32E_MODE | 7166 #endif 7167 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | 7168 VM_ENTRY_LOAD_CET_STATE; 7169 msrs->entry_ctls_high |= 7170 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER | 7171 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 7172 7173 if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && 7174 !kvm_cpu_cap_has(X86_FEATURE_IBT)) 7175 msrs->entry_ctls_high &= ~VM_ENTRY_LOAD_CET_STATE; 7176 7177 /* We support free control of debug control loading. 
*/ 7178 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 7179 } 7180 7181 static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf, 7182 struct nested_vmx_msrs *msrs) 7183 { 7184 msrs->procbased_ctls_low = 7185 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 7186 7187 msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl; 7188 msrs->procbased_ctls_high &= 7189 CPU_BASED_INTR_WINDOW_EXITING | 7190 CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | 7191 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 7192 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 7193 CPU_BASED_CR3_STORE_EXITING | 7194 #ifdef CONFIG_X86_64 7195 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 7196 #endif 7197 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 7198 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | 7199 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | 7200 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | 7201 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 7202 /* 7203 * We can allow some features even when not supported by the 7204 * hardware. For example, L1 can specify an MSR bitmap - and we 7205 * can use it to avoid exits to L1 - even when L0 runs L2 7206 * without MSR bitmaps. 7207 */ 7208 msrs->procbased_ctls_high |= 7209 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 7210 CPU_BASED_USE_MSR_BITMAPS; 7211 7212 /* We support free control of CR3 access interception. 
*/ 7213 msrs->procbased_ctls_low &= 7214 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 7215 } 7216 7217 static void nested_vmx_setup_secondary_ctls(u32 ept_caps, 7218 struct vmcs_config *vmcs_conf, 7219 struct nested_vmx_msrs *msrs) 7220 { 7221 msrs->secondary_ctls_low = 0; 7222 7223 msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl; 7224 msrs->secondary_ctls_high &= 7225 SECONDARY_EXEC_DESC | 7226 SECONDARY_EXEC_ENABLE_RDTSCP | 7227 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 7228 SECONDARY_EXEC_WBINVD_EXITING | 7229 SECONDARY_EXEC_APIC_REGISTER_VIRT | 7230 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 7231 SECONDARY_EXEC_RDRAND_EXITING | 7232 SECONDARY_EXEC_ENABLE_INVPCID | 7233 SECONDARY_EXEC_ENABLE_VMFUNC | 7234 SECONDARY_EXEC_RDSEED_EXITING | 7235 SECONDARY_EXEC_ENABLE_XSAVES | 7236 SECONDARY_EXEC_TSC_SCALING | 7237 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; 7238 7239 /* 7240 * We can emulate "VMCS shadowing," even if the hardware 7241 * doesn't support it. 7242 */ 7243 msrs->secondary_ctls_high |= 7244 SECONDARY_EXEC_SHADOW_VMCS; 7245 7246 if (enable_ept) { 7247 /* nested EPT: emulate EPT also to L1 */ 7248 msrs->secondary_ctls_high |= 7249 SECONDARY_EXEC_ENABLE_EPT; 7250 msrs->ept_caps = 7251 VMX_EPT_PAGE_WALK_4_BIT | 7252 VMX_EPT_PAGE_WALK_5_BIT | 7253 VMX_EPTP_WB_BIT | 7254 VMX_EPT_INVEPT_BIT | 7255 VMX_EPT_EXECUTE_ONLY_BIT | 7256 VMX_EPT_ADVANCED_VMEXIT_INFO_BIT; 7257 7258 msrs->ept_caps &= ept_caps; 7259 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 7260 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | 7261 VMX_EPT_1GB_PAGE_BIT; 7262 if (enable_ept_ad_bits) { 7263 msrs->secondary_ctls_high |= 7264 SECONDARY_EXEC_ENABLE_PML; 7265 msrs->ept_caps |= VMX_EPT_AD_BIT; 7266 } 7267 7268 if (enable_mbec) 7269 msrs->secondary_ctls_high |= 7270 SECONDARY_EXEC_MODE_BASED_EPT_EXEC; 7271 /* 7272 * Advertise EPTP switching irrespective of hardware support, 7273 * KVM emulates it in software so long as VMFUNC is supported. 
7274 */ 7275 if (cpu_has_vmx_vmfunc()) 7276 msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING; 7277 } 7278 7279 /* 7280 * Old versions of KVM use the single-context version without 7281 * checking for support, so declare that it is supported even 7282 * though it is treated as global context. The alternative is 7283 * not failing the single-context invvpid, and it is worse. 7284 */ 7285 if (enable_vpid) { 7286 msrs->secondary_ctls_high |= 7287 SECONDARY_EXEC_ENABLE_VPID; 7288 msrs->vpid_caps = VMX_VPID_INVVPID_BIT | 7289 VMX_VPID_EXTENT_SUPPORTED_MASK; 7290 } 7291 7292 if (enable_unrestricted_guest) 7293 msrs->secondary_ctls_high |= 7294 SECONDARY_EXEC_UNRESTRICTED_GUEST; 7295 7296 if (flexpriority_enabled) 7297 msrs->secondary_ctls_high |= 7298 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 7299 7300 if (enable_sgx) 7301 msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; 7302 } 7303 7304 static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf, 7305 struct nested_vmx_msrs *msrs) 7306 { 7307 msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; 7308 msrs->misc_low |= 7309 VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 7310 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 7311 VMX_MISC_ACTIVITY_HLT | 7312 VMX_MISC_ACTIVITY_WAIT_SIPI; 7313 msrs->misc_high = 0; 7314 } 7315 7316 static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) 7317 { 7318 /* 7319 * This MSR reports some information about VMX support. We 7320 * should return information about the VMX we emulate for the 7321 * guest, and the VMCS structure we give it - not about the 7322 * VMX support of the underlying hardware. 
7323 */ 7324 msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE, 7325 X86_MEMTYPE_WB); 7326 7327 msrs->basic |= VMX_BASIC_TRUE_CTLS; 7328 if (cpu_has_vmx_basic_inout()) 7329 msrs->basic |= VMX_BASIC_INOUT; 7330 if (cpu_has_vmx_basic_no_hw_errcode_cc()) 7331 msrs->basic |= VMX_BASIC_NO_HW_ERROR_CODE_CC; 7332 } 7333 7334 static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) 7335 { 7336 /* 7337 * These MSRs specify bits which the guest must keep fixed on 7338 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 7339 * We picked the standard core2 setting. 7340 */ 7341 #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 7342 #define VMXON_CR4_ALWAYSON X86_CR4_VMXE 7343 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; 7344 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; 7345 7346 /* These MSRs specify bits which the guest must keep fixed off. */ 7347 rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); 7348 rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); 7349 7350 if (vmx_umip_emulated()) 7351 msrs->cr4_fixed1 |= X86_CR4_UMIP; 7352 } 7353 7354 /* 7355 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 7356 * returned for the various VMX controls MSRs when nested VMX is enabled. 7357 * The same values should also be used to verify that vmcs12 control fields are 7358 * valid during nested entry from L1 to L2. 7359 * Each of these control msrs has a low and high 32-bit half: A low bit is on 7360 * if the corresponding bit in the (32-bit) control field *must* be on, and a 7361 * bit in the high half is on if the corresponding bit in the control field 7362 * may be on. See also vmx_control_verify(). 
7363 */ 7364 void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) 7365 { 7366 struct nested_vmx_msrs *msrs = &vmcs_conf->nested; 7367 7368 /* 7369 * Note that as a general rule, the high half of the MSRs (bits in 7370 * the control fields which may be 1) should be initialized by the 7371 * intersection of the underlying hardware's MSR (i.e., features which 7372 * can be supported) and the list of features we want to expose - 7373 * because they are known to be properly supported in our code. 7374 * Also, usually, the low half of the MSRs (bits which must be 1) can 7375 * be set to 0, meaning that L1 may turn off any of these bits. The 7376 * reason is that if one of these bits is necessary, it will appear 7377 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control 7378 * fields of vmcs01 and vmcs02, will turn these bits off - and 7379 * nested_vmx_l1_wants_exit() will not pass related exits to L1. 7380 * These rules have exceptions below. 7381 */ 7382 nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs); 7383 7384 nested_vmx_setup_exit_ctls(vmcs_conf, msrs); 7385 7386 nested_vmx_setup_entry_ctls(vmcs_conf, msrs); 7387 7388 nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs); 7389 7390 nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs); 7391 7392 nested_vmx_setup_misc_data(vmcs_conf, msrs); 7393 7394 nested_vmx_setup_basic(msrs); 7395 7396 nested_vmx_setup_cr_fixed(msrs); 7397 7398 msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); 7399 } 7400 7401 void nested_vmx_hardware_unsetup(void) 7402 { 7403 int i; 7404 7405 if (enable_shadow_vmcs) { 7406 for (i = 0; i < VMX_BITMAP_NR; i++) 7407 free_page((unsigned long)vmx_bitmap[i]); 7408 } 7409 } 7410 7411 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) 7412 { 7413 int i; 7414 7415 /* 7416 * Note! The set of supported vmcs12 fields is consumed by both VMX 7417 * MSR and shadow VMCS setup. 
7418 */ 7419 nested_vmx_setup_vmcs12_fields(); 7420 7421 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); 7422 7423 if (!cpu_has_vmx_shadow_vmcs()) 7424 enable_shadow_vmcs = 0; 7425 if (enable_shadow_vmcs) { 7426 for (i = 0; i < VMX_BITMAP_NR; i++) { 7427 /* 7428 * The vmx_bitmap is not tied to a VM and so should 7429 * not be charged to a memcg. 7430 */ 7431 vmx_bitmap[i] = (unsigned long *) 7432 __get_free_page(GFP_KERNEL); 7433 if (!vmx_bitmap[i]) { 7434 nested_vmx_hardware_unsetup(); 7435 return -ENOMEM; 7436 } 7437 } 7438 7439 init_vmcs_shadow_fields(); 7440 } 7441 7442 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; 7443 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; 7444 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; 7445 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; 7446 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; 7447 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; 7448 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; 7449 exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff; 7450 exit_handlers[EXIT_REASON_VMON] = handle_vmxon; 7451 exit_handlers[EXIT_REASON_INVEPT] = handle_invept; 7452 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; 7453 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; 7454 7455 return 0; 7456 } 7457 7458 static gpa_t vmx_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, 7459 u64 access, 7460 struct x86_exception *exception, 7461 u64 pte_access) 7462 { 7463 struct kvm_mmu *mmu = vcpu->arch.mmu; 7464 7465 BUG_ON(!mmu_is_nested(vcpu)); 7466 7467 /* 7468 * MBEC differentiates based on the effective U/S bit of 7469 * the guest page tables; not the processor CPL. 
7470 */ 7471 access &= ~PFERR_USER_MASK; 7472 if ((pte_access & ACC_USER_MASK) && (access & PFERR_GUEST_FINAL_MASK)) 7473 access |= PFERR_USER_MASK; 7474 7475 return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception); 7476 } 7477 7478 struct kvm_x86_nested_ops vmx_nested_ops = { 7479 .leave_nested = vmx_leave_nested, 7480 .translate_nested_gpa = vmx_translate_nested_gpa, 7481 .is_exception_vmexit = nested_vmx_is_exception_vmexit, 7482 .check_events = vmx_check_nested_events, 7483 .has_events = vmx_has_nested_events, 7484 .triple_fault = nested_vmx_triple_fault, 7485 .get_state = vmx_get_nested_state, 7486 .set_state = vmx_set_nested_state, 7487 .get_nested_state_pages = vmx_get_nested_state_pages, 7488 .write_log_dirty = nested_vmx_write_pml_buffer, 7489 #ifdef CONFIG_KVM_HYPERV 7490 .enable_evmcs = nested_enable_evmcs, 7491 .get_evmcs_version = nested_get_evmcs_version, 7492 .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush, 7493 #endif 7494 }; 7495