// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/objtool.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>
#include <asm/msr.h>

#include "x86.h"
#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "posted_intr.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
#include "smm.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap	(vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap	(vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
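		 * When such a field is dropped from the shadow list here, its
		 * bits stay set in the VMREAD/VMWRITE bitmaps, so L1's accesses
		 * to the field still VM-exit and are emulated against the
		 * cached vmcs12 instead of hitting the shadow VMCS.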
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force sync to shadow VMCS because
	 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
	 * fields and thus must be synced.
	 */
	if (nested_vmx_is_evmptr12_set(to_vmx(vcpu)))
		to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;

	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == INVALID_GPA &&
	    !nested_vmx_is_evmptr12_valid(vmx))
		return nested_vmx_failInvalid(vcpu);

	return nested_vmx_failValid(vcpu, vm_instruction_error);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: don't simply reset the guest here.
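	 * Per the SDM, a VMX abort on real hardware shuts down the logical
	 * processor; requesting a triple fault (i.e. resetting the guest) is
	 * only a rough approximation of that behavior, hence the TODO above.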
	 */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_KVM_HYPERV
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map);
	vmx->nested.hv_evmcs = NULL;
	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;

	if (hv_vcpu) {
		hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
		hv_vcpu->nested.vm_id = 0;
		hv_vcpu->nested.vp_id = 0;
	}
#endif
}

static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
{
#ifdef CONFIG_KVM_HYPERV
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	/*
	 * When Enlightened VMEntry is enabled on the calling CPU we treat the
	 * memory area pointed to by vmptr as Enlightened VMCS (as there's no
	 * good way to distinguish it from VMCS12) and we must not corrupt it
	 * by writing to the non-existent 'launch_state' field. The area
	 * doesn't have to be the currently active EVMCS on the calling CPU
	 * and there's nothing KVM has to do to transition it from 'active'
	 * to 'non-active' state. It is possible that the area will stay
	 * mapped as vmx->nested.hv_evmcs but this shouldn't be a problem.
	 */
	if (!guest_cpu_cap_has_evmcs(vcpu) ||
	    !evmptr_is_valid(nested_get_evmptr(vcpu)))
		return false;

	if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr)
		nested_release_evmcs(vcpu);

	return true;
#else
	return false;
#endif
}

static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->vt.guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;

	/*
	 * All lazily updated registers will be reloaded from VMCS12 on both
	 * vmentry and vmexit.
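	 * Clearing regs_dirty prevents stale cached register values from
	 * being written back into the just-loaded VMCS; regs_avail was reset
	 * above, so the lazily-loaded registers are re-read from the new VMCS
	 * on first use.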
	 */
	vcpu->arch.regs_dirty = 0;
}

static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map);
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map);
	vmx->nested.pi_desc = NULL;
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
		vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	vmx->nested.vmxon_ptr = INVALID_GPA;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = INVALID_GPA;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;

	nested_put_vmcs12_pages(vcpu);

	kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vcpu_put(vcpu);
}

#define EPTP_PA_MASK	GENMASK_ULL(51, 12)

static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
{
	return VALID_PAGE(root_hpa) &&
	       ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
}

static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
				       gpa_t addr)
{
	unsigned long roots = 0;
	uint i;
	struct kvm_mmu_root_info *cached_root;

	WARN_ON_ONCE(!mmu_is_nested(vcpu));

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
		cached_root = &vcpu->arch.mmu->prev_roots[i];

		if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
					    eptp))
			roots |= KVM_MMU_ROOT_PREVIOUS(i);
	}
	if (roots)
		kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
}

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long exit_qualification;
	u32 vm_exit_reason;

	if (vmx->nested.pml_full) {
		vm_exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;

		/*
		 * It should be impossible to trigger a nested PML Full VM-Exit
		 * for anything other than an EPT Violation from L2.  KVM *can*
		 * trigger nEPT page fault injection in response to an EPT
		 * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT
		 * tables also changed, but KVM should not treat EPT Misconfig
		 * VM-Exits as writes.
		 */
		WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION);

		/*
		 * PML Full and EPT Violation VM-Exits both use bit 12 to report
		 * "NMI unblocking due to IRET", i.e. the bit can be propagated
		 * as-is from the original EXIT_QUALIFICATION.
		 */
		exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI;
	} else {
		if (fault->error_code & PFERR_RSVD_MASK) {
			vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
			exit_qualification = 0;
		} else {
			exit_qualification = fault->exit_qualification;
			exit_qualification |= vmx_get_exit_qual(vcpu) &
					      (EPT_VIOLATION_GVA_IS_VALID |
					       EPT_VIOLATION_GVA_TRANSLATED);
			vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
		}

		/*
		 * Although the caller (kvm_inject_emulated_page_fault) would
		 * have already synced the faulting address in the shadow EPT
		 * tables for the current EPTP12, we also need to sync it for
		 * any other cached EPTP02s based on the same EP4TA, since the
		 * TLB associates mappings to the EP4TA rather than the full EPTP.
		 */
		nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
					   fault->address);
	}

	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
	int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);

	kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_eptp(vcpu));
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	nested_ept_new_eptp(vcpu);
	vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}

static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
					   u32 error_code)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	/*
	 * Drop bits 31:16 of the error code when performing the #PF mask+match
	 * check.  All VMCS fields involved are 32 bits, but Intel CPUs never
	 * set bits 31:16 and VMX disallows setting bits 31:16 in the injected
	 * error code.  Including the to-be-dropped bits in the check might
	 * result in an "impossible" or missed exit from L1's perspective.
	 */
	if (vector == PF_VECTOR)
		return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);

	return (vmcs12->exception_bitmap & (1u << vector));
}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	return 0;
}

/*
 * For x2APIC MSRs, ignore the vmcs01 bitmap.  L1 can enable x2APIC without L1
 * itself utilizing x2APIC.  All MSRs were previously set to be intercepted,
 * so only the "disable intercept" case needs to be handled.
 */
static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
							unsigned long *msr_bitmap_l0,
							u32 msr, int type)
{
	if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);

	if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
}

static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}

#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw)					\
static inline									\
void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx,			\
					 unsigned long *msr_bitmap_l1,		\
					 unsigned long *msr_bitmap_l0, u32 msr)	\
{										\
	if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) ||		\
	    vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr))			\
		vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
	else									\
		vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
}
BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
BUILD_NVMX_MSR_INTERCEPT_HELPER(write)

static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
						    unsigned long *msr_bitmap_l1,
						    unsigned long *msr_bitmap_l0,
						    u32 msr, int types)
{
	if (types & MSR_TYPE_R)
		nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
						  msr_bitmap_l0, msr);
	if (types & MSR_TYPE_W)
		nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
						   msr_bitmap_l0, msr);
}

/*
 * Merge L0's and L1's MSR bitmaps; return false to indicate that we do not
 * use the hardware bitmap.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
	struct kvm_host_map map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	/*
	 * MSR bitmap update can be skipped when:
	 * - MSR bitmap for L1 hasn't changed.
	 * - Nested hypervisor (L1) is attempting to launch the same L2 as
	 *   before.
	 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
	 *   and tells KVM (L0) there were no changes in MSR bitmap for L2.
	 */
	if (!vmx->nested.force_msr_bitmap_recalc) {
		struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);

		if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap &&
		    evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
			return true;
	}

	if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map.hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively toggle those relevant to L2.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff; it just lets the processor take the value
			 * from the virtual-APIC page.  Take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_x2apic_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/*
	 * Always check vmcs01's bitmap to honor userspace MSR filters and any
	 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
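	 * The nested_vmx_set_intercept_for_msr() calls below merge the two
	 * bitmaps: an MSR access is intercepted in vmcs02 if either vmcs01
	 * (L0) or vmcs12 (L1) intercepts it, and is passed through only when
	 * both allow direct access.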
	 */
#ifdef CONFIG_X86_64
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
#endif
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PRED_CMD, MSR_TYPE_W);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_FLUSH_CMD, MSR_TYPE_W);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_APERF, MSR_TYPE_R);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_MPERF, MSR_TYPE_R);

	kvm_vcpu_unmap(vcpu, &map);

	vmx->nested.force_msr_bitmap_recalc = false;

	return true;
}

static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			      VMCS12_SIZE);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			       VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
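	 * (Per the SDM's checks on VM-execution control fields, the
	 * "external-interrupt exiting" control must be 1 whenever
	 * "virtual-interrupt delivery" is 1.)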
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has already been checked
	 * in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	    (CC(!nested_cpu_has_vid(vmcs12)) ||
	     CC(!nested_exit_intr_ack_set(vcpu)) ||
	     CC((vmcs12->posted_intr_nv & 0xff00)) ||
	     CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}

static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	if (count == 0)
		return 0;

	/*
	 * Exceeding the limit results in architecturally _undefined_ behavior,
	 * i.e. KVM is allowed to do literally anything in response to a bad
	 * limit.  Immediately generate a consistency check so that code that
	 * consumes the count doesn't need to worry about extreme edge cases.
	 */
	if (count > nested_vmx_max_atomic_switch_msrs(vcpu))
		return -EINVAL;

	if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
	    !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							  struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

/*
 * Load guest's/host's MSRs at nested entry/exit.  Return 0 for success,
 * entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity.  To maintain compatibility with hardware as
 * much as possible, process all valid entries before failing rather than
 * prechecking for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (WARN_ON_ONCE(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
	return i + 1;
}

static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
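	 * MSR_IA32_TSC only shows up in the vmcs02 MSR-store area when L1 has
	 * it in vmcs12's VM-exit MSR-store list; see
	 * prepare_vmx_msr_autostore_list().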
	 */
	if (msr_index == MSR_IA32_TSC) {
		int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
						    MSR_IA32_TSC);

		if (i >= 0) {
			u64 val = vmx->msr_autostore.guest.val[i].value;

			*data = kvm_read_l1_tsc(vcpu, val);
			return true;
		}
	}

	if (kvm_get_msr_with_filter(vcpu, msr_index, data)) {
		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
				     msr_index);
		return false;
	}
	return true;
}

static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
				     struct vmx_msr_entry *e)
{
	if (kvm_vcpu_read_guest(vcpu,
				gpa + i * sizeof(*e),
				e, 2 * sizeof(u32))) {
		pr_debug_ratelimited(
			"%s cannot read MSR entry (%u, 0x%08llx)\n",
			__func__, i, gpa + i * sizeof(*e));
		return false;
	}
	if (nested_vmx_store_msr_check(vcpu, e)) {
		pr_debug_ratelimited(
			"%s check failed (%u, 0x%x, 0x%x)\n",
			__func__, i, e->index, e->reserved);
		return false;
	}
	return true;
}

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (WARN_ON_ONCE(i >= max_msr_list_size))
			return -EINVAL;

		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return -EINVAL;

		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
			return -EINVAL;

		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
						offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}

static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 count = vmcs12->vm_exit_msr_store_count;
	u64 gpa = vmcs12->vm_exit_msr_store_addr;
	struct vmx_msr_entry e;
	u32 i;

	for (i = 0; i < count; i++) {
		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return false;

		if (e.index == msr_index)
			return true;
	}
	return false;
}

static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
					   u32 msr_index)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
	bool in_vmcs12_store_list;
	int msr_autostore_slot;
	bool in_autostore_list;
	int last;

	msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
	in_autostore_list = msr_autostore_slot >= 0;
	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);

	if (in_vmcs12_store_list && !in_autostore_list) {
		if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
			/*
			 * Emulated VMEntry does not fail here.  Instead a less
			 * accurate value will be returned by
			 * nested_vmx_get_vmexit_msr_value() by reading KVM's
			 * internal MSR state instead of reading the value from
			 * the vmcs02 VMExit MSR-store area.
			 */
			pr_warn_ratelimited(
				"Not enough msr entries in msr_autostore.  Can't add msr %x\n",
				msr_index);
			return;
		}
		last = autostore->nr++;
		autostore->val[last].index = msr_index;
	} else if (!in_vmcs12_store_list && in_autostore_list) {
		last = --autostore->nr;
		autostore->val[msr_autostore_slot] = autostore->val[last];
	}
}

/*
 * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we are
 * emulating VM-Entry into a guest with EPT enabled.  On failure, the expected
 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
 * @entry_failure_code.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_ept, bool reload_pdptrs,
			       enum vm_entry_failure_code *entry_failure_code)
{
	if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/*
	 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
	 * must not be dereferenced.
	 */
	if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
	    CC(!load_pdptrs(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_PDPTE;
		return -EINVAL;
	}

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);

	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
	kvm_init_mmu(vcpu);

	if (!nested_ept)
		kvm_mmu_new_pgd(vcpu, cr3);

	return 0;
}

/*
 * Returns true if KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated by L1.
 *
 * If L0 uses EPT, L1 and L2 run with different EPTP because guest_mode
 * is part of kvm_mmu_page_role.  Thus, TLB entries are tagged with
 * different EPTP.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPID (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return enable_ept ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
					    struct vmcs12 *vmcs12,
					    bool is_vmenter)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Handle pending Hyper-V TLB flush requests */
	kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);

	/*
	 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
	 * same VPID as the host, and so architecturally, linear and combined
	 * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit.  KVM
	 * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2,
	 * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01.  This
	 * is required if VPID is disabled in KVM, as a TLB flush (there are no
	 * VPIDs) still occurs from L1's perspective, and KVM may need to
	 * synchronize the MMU in response to the guest TLB flush.
	 *
	 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
	 * EPT is a special snowflake, as guest-physical mappings aren't
	 * flushed on VPID invalidations, including VM-Enter or VM-Exit with
	 * VPID disabled.  As a result, KVM _never_ needs to sync nEPT
	 * entries on VM-Enter because L1 can't rely on VM-Enter to flush
	 * those mappings.
	 */
	if (!nested_cpu_has_vpid(vmcs12)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/* L2 should never have a VPID if VPID is disabled. */
	WARN_ON(!enable_vpid);

	/*
	 * VPID is enabled and in use by vmcs12.  If vpid12 is changing, then
	 * emulate a guest TLB flush as KVM does not track vpid12 history nor
	 * is the VPID incorporated into the MMU context.  I.e. KVM must assume
	 * that the new vpid12 has never been used and thus represents a new
	 * guest ASID that cannot have entries in the TLB.
	 */
	if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/*
	 * If VPID is enabled, used by vmcs12, and vpid12 is not changing but
	 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
	 * KVM was unable to allocate a VPID for L2, flush the current context
	 * as the effective ASID is common to both L1 and L2.
	 */
	if (!nested_has_guest_tlb_tag(vcpu))
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}

static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
				 VMX_BASIC_INOUT |
				 VMX_BASIC_TRUE_CTLS;

	const u64 reserved_bits = GENMASK_ULL(63, 56) |
				  GENMASK_ULL(47, 45) |
				  BIT_ULL(31);

	u64 vmx_basic = vmcs_config.nested.basic;

	BUILD_BUG_ON(feature_bits & reserved_bits);

	/*
	 * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has
	 * inverted polarity), the incoming value must not set feature bits or
	 * reserved bits that aren't allowed/supported by KVM.  Fields, i.e.
	 * multi-bit values, are explicitly checked below.
	 */
	if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}

static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
				u32 **low, u32 **high)
{
	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		*low = &msrs->pinbased_ctls_low;
		*high = &msrs->pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		*low = &msrs->procbased_ctls_low;
		*high = &msrs->procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		*low = &msrs->exit_ctls_low;
		*high = &msrs->exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		*low = &msrs->entry_ctls_low;
		*high = &msrs->entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*low = &msrs->secondary_ctls_low;
		*high = &msrs->secondary_ctls_high;
		break;
	default:
		BUG();
	}
}

static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u32 *lowp, *highp;
	u64 supported;

	vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
	*lowp = data;
	*highp = data >> 32;
	return 0;
}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA |
				 VMX_MISC_ACTIVITY_HLT |
				 VMX_MISC_ACTIVITY_SHUTDOWN |
				 VMX_MISC_ACTIVITY_WAIT_SIPI |
				 VMX_MISC_INTEL_PT |
				 VMX_MISC_RDMSR_IN_SMM |
				 VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
				 VMX_MISC_VMXOFF_BLOCK_SMI |
				 VMX_MISC_ZERO_LEN_INS;

	const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9);

	u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
				       vmcs_config.nested.misc_high);

	BUILD_BUG_ON(feature_bits & reserved_bits);

	/*
	 * The incoming value must not set feature bits or reserved bits that
	 * aren't allowed/supported by KVM.  Fields, i.e. multi-bit values, are
	 * explicitly checked below.
	 */
	if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits))
		return -EINVAL;

	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
					       vmcs_config.nested.vpid_caps);

	/* Every bit is either reserved or a feature bit. */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
{
	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		return &msrs->cr0_fixed0;
	case MSR_IA32_VMX_CR4_FIXED0:
		return &msrs->cr4_fixed0;
	default:
		BUG();
	}
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);

	/*
	 * 1 bits (which indicate bits that must be 1 during VMX operation)
	 * must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Don't allow changes to the VMX capability MSRs while the vCPU
	 * is in VMX operation.
	 */
	if (vmx->nested.vmxon)
		return -EBUSY;

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		/*
		 * The "non-true" VMX capability MSRs are generated from the
		 * "true" MSRs, so we do not support restoring them directly.
		 *
		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
		 * should restore the "true" MSRs with the must-be-1 bits
		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
		 * DEFAULT SETTINGS".
		 */
		return -EINVAL;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return vmx_restore_control_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_MISC:
		return vmx_restore_vmx_misc(vmx, data);
	case MSR_IA32_VMX_CR0_FIXED0:
	case MSR_IA32_VMX_CR4_FIXED0:
		return vmx_restore_fixed0_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_CR0_FIXED1:
	case MSR_IA32_VMX_CR4_FIXED1:
		/*
		 * These MSRs are generated based on the vCPU's CPUID, so we
		 * do not support restoring them directly.
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.msrs.vmcs_enum = data;
		return 0;
	case MSR_IA32_VMX_VMFUNC:
		if (data & ~vmcs_config.nested.vmfunc_controls)
			return -EINVAL;
		vmx->nested.msrs.vmfunc_controls = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/* Returns 0 on success, non-0 otherwise. */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = msrs->basic;
		break;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = msrs->cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = msrs->cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = msrs->cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = msrs->cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = msrs->vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		*pdata = msrs->ept_caps |
			((u64)msrs->vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
 * been modified by the L1 guest.  Note, "writable" in this context means
 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
 * VM-exit information fields (which are actually writable if the vCPU is
 * configured to support "VMWRITE to any supported field in the VMCS").
 */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i;

	if (WARN_ON(!shadow_vmcs))
		return;

	preempt_disable();

	vmcs_load(shadow_vmcs);

	for (i = 0; i < max_shadow_read_write_fields; i++) {
		field = shadow_read_write_fields[i];
		val = __vmcs_readl(field.encoding);
		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);

	preempt_enable();
}

static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
	const struct shadow_vmcs_field *fields[] = {
		shadow_read_write_fields,
		shadow_read_only_fields
	};
	const int max_fields[] = {
		max_shadow_read_write_fields,
		max_shadow_read_only_fields
	};
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i, q;

	if (WARN_ON(!shadow_vmcs))
		return;

	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			val = vmcs12_read_any(vmcs12, field.encoding,
					      field.offset);
			__vmcs_writel(field.encoding, val);
		}
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
}

static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
{
#ifdef CONFIG_KVM_HYPERV
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);

	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
		hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
		hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
		hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->guest_ssp = evmcs->guest_ssp;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
		vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
		 * vmcs12->host_ssp = evmcs->host_ssp;
		 * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
evmcs->guest_es_ar_bytes; 1796 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; 1797 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; 1798 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; 1799 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; 1800 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; 1801 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; 1802 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; 1803 vmcs12->guest_es_selector = evmcs->guest_es_selector; 1804 vmcs12->guest_cs_selector = evmcs->guest_cs_selector; 1805 vmcs12->guest_ss_selector = evmcs->guest_ss_selector; 1806 vmcs12->guest_ds_selector = evmcs->guest_ds_selector; 1807 vmcs12->guest_fs_selector = evmcs->guest_fs_selector; 1808 vmcs12->guest_gs_selector = evmcs->guest_gs_selector; 1809 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; 1810 vmcs12->guest_tr_selector = evmcs->guest_tr_selector; 1811 } 1812 1813 if (unlikely(!(hv_clean_fields & 1814 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { 1815 vmcs12->tsc_offset = evmcs->tsc_offset; 1816 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; 1817 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; 1818 vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap; 1819 vmcs12->tsc_multiplier = evmcs->tsc_multiplier; 1820 } 1821 1822 if (unlikely(!(hv_clean_fields & 1823 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { 1824 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; 1825 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; 1826 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; 1827 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; 1828 vmcs12->guest_cr0 = evmcs->guest_cr0; 1829 vmcs12->guest_cr3 = evmcs->guest_cr3; 1830 vmcs12->guest_cr4 = evmcs->guest_cr4; 1831 vmcs12->guest_dr7 = evmcs->guest_dr7; 1832 } 1833 1834 if (unlikely(!(hv_clean_fields & 1835 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { 1836 vmcs12->host_fs_base = evmcs->host_fs_base; 1837 vmcs12->host_gs_base = evmcs->host_gs_base; 1838 vmcs12->host_tr_base = evmcs->host_tr_base; 1839 vmcs12->host_gdtr_base = evmcs->host_gdtr_base; 1840 vmcs12->host_idtr_base = evmcs->host_idtr_base; 1841 vmcs12->host_rsp = evmcs->host_rsp; 1842 } 1843 1844 if (unlikely(!(hv_clean_fields & 1845 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { 1846 vmcs12->ept_pointer = evmcs->ept_pointer; 1847 vmcs12->virtual_processor_id = evmcs->virtual_processor_id; 1848 } 1849 1850 if (unlikely(!(hv_clean_fields & 1851 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { 1852 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; 1853 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; 1854 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; 1855 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; 1856 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; 1857 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; 1858 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; 1859 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; 1860 vmcs12->guest_pending_dbg_exceptions = 1861 evmcs->guest_pending_dbg_exceptions; 1862 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; 1863 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; 1864 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; 1865 vmcs12->guest_activity_state = evmcs->guest_activity_state; 1866 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; 1867 vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl; 1868 /* 1869 * Not present in struct vmcs12: 1870 * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet; 1871 * 
vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; 1872 * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; 1873 */ 1874 } 1875 1876 /* 1877 * Not used? 1878 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1879 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1880 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1881 * vmcs12->page_fault_error_code_mask = 1882 * evmcs->page_fault_error_code_mask; 1883 * vmcs12->page_fault_error_code_match = 1884 * evmcs->page_fault_error_code_match; 1885 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1886 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1887 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1888 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1889 */ 1890 1891 /* 1892 * Read only fields: 1893 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1894 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1895 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1896 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1897 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1898 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1899 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1900 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1901 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1902 * vmcs12->exit_qualification = evmcs->exit_qualification; 1903 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1904 * 1905 * Not present in struct vmcs12: 1906 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; 1907 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; 1908 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; 1909 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; 1910 */ 1911 1912 return; 1913 #else /* CONFIG_KVM_HYPERV */ 1914 KVM_BUG_ON(1, vmx->vcpu.kvm); 1915 #endif /* CONFIG_KVM_HYPERV */ 1916 } 1917 1918 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) 1919 { 1920 #ifdef CONFIG_KVM_HYPERV 1921 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1922 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 1923 1924 /* 1925 * Should not be changed by KVM: 1926 * 1927 * evmcs->host_es_selector = vmcs12->host_es_selector; 1928 * evmcs->host_cs_selector = vmcs12->host_cs_selector; 1929 * evmcs->host_ss_selector = vmcs12->host_ss_selector; 1930 * evmcs->host_ds_selector = vmcs12->host_ds_selector; 1931 * evmcs->host_fs_selector = vmcs12->host_fs_selector; 1932 * evmcs->host_gs_selector = vmcs12->host_gs_selector; 1933 * evmcs->host_tr_selector = vmcs12->host_tr_selector; 1934 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; 1935 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; 1936 * evmcs->host_cr0 = vmcs12->host_cr0; 1937 * evmcs->host_cr3 = vmcs12->host_cr3; 1938 * evmcs->host_cr4 = vmcs12->host_cr4; 1939 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; 1940 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; 1941 * evmcs->host_rip = vmcs12->host_rip; 1942 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; 1943 * evmcs->host_fs_base = vmcs12->host_fs_base; 1944 * evmcs->host_gs_base = vmcs12->host_gs_base; 1945 * evmcs->host_tr_base = vmcs12->host_tr_base; 1946 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; 1947 * evmcs->host_idtr_base = 
vmcs12->host_idtr_base; 1948 * evmcs->host_rsp = vmcs12->host_rsp; 1949 * sync_vmcs02_to_vmcs12() doesn't read these: 1950 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; 1951 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; 1952 * evmcs->msr_bitmap = vmcs12->msr_bitmap; 1953 * evmcs->ept_pointer = vmcs12->ept_pointer; 1954 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1955 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1956 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1957 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1958 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1959 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1960 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1961 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1962 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1963 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1964 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1965 * evmcs->page_fault_error_code_mask = 1966 * vmcs12->page_fault_error_code_mask; 1967 * evmcs->page_fault_error_code_match = 1968 * vmcs12->page_fault_error_code_match; 1969 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1970 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1971 * evmcs->tsc_offset = vmcs12->tsc_offset; 1972 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1973 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1974 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1975 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1976 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1977 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1978 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1979 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1980 * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl; 1981 * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl; 1982 * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap; 1983 * evmcs->tsc_multiplier = vmcs12->tsc_multiplier; 1984 * 1985 * Not present in struct vmcs12: 1986 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1987 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1988 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1989 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1990 * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet; 1991 * evmcs->host_ssp = vmcs12->host_ssp; 1992 * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr; 1993 * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet; 1994 * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl; 1995 * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr; 1996 * evmcs->guest_ssp = vmcs12->guest_ssp; 1997 */ 1998 1999 evmcs->guest_es_selector = vmcs12->guest_es_selector; 2000 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 2001 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 2002 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 2003 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 2004 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 2005 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 2006 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 2007 2008 evmcs->guest_es_limit = vmcs12->guest_es_limit; 2009 evmcs->guest_cs_limit = 
vmcs12->guest_cs_limit; 2010 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 2011 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 2012 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 2013 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 2014 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 2015 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 2016 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 2017 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 2018 2019 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 2020 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 2021 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 2022 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 2023 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 2024 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 2025 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 2026 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 2027 2028 evmcs->guest_es_base = vmcs12->guest_es_base; 2029 evmcs->guest_cs_base = vmcs12->guest_cs_base; 2030 evmcs->guest_ss_base = vmcs12->guest_ss_base; 2031 evmcs->guest_ds_base = vmcs12->guest_ds_base; 2032 evmcs->guest_fs_base = vmcs12->guest_fs_base; 2033 evmcs->guest_gs_base = vmcs12->guest_gs_base; 2034 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 2035 evmcs->guest_tr_base = vmcs12->guest_tr_base; 2036 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 2037 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 2038 2039 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 2040 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 2041 2042 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 2043 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 2044 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 2045 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 2046 2047 evmcs->guest_pending_dbg_exceptions = 2048 vmcs12->guest_pending_dbg_exceptions; 2049 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 2050 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 2051 2052 evmcs->guest_activity_state = vmcs12->guest_activity_state; 2053 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 2054 2055 evmcs->guest_cr0 = vmcs12->guest_cr0; 2056 evmcs->guest_cr3 = vmcs12->guest_cr3; 2057 evmcs->guest_cr4 = vmcs12->guest_cr4; 2058 evmcs->guest_dr7 = vmcs12->guest_dr7; 2059 2060 evmcs->guest_physical_address = vmcs12->guest_physical_address; 2061 2062 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 2063 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 2064 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 2065 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 2066 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 2067 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 2068 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 2069 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 2070 2071 evmcs->exit_qualification = vmcs12->exit_qualification; 2072 2073 evmcs->guest_linear_address = vmcs12->guest_linear_address; 2074 evmcs->guest_rsp = vmcs12->guest_rsp; 2075 evmcs->guest_rflags = vmcs12->guest_rflags; 2076 2077 evmcs->guest_interruptibility_info = 2078 vmcs12->guest_interruptibility_info; 2079 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 2080 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 2081 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 2082 evmcs->vm_entry_exception_error_code = 2083 vmcs12->vm_entry_exception_error_code; 2084 
evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; 2085 2086 evmcs->guest_rip = vmcs12->guest_rip; 2087 2088 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; 2089 2090 return; 2091 #else /* CONFIG_KVM_HYPERV */ 2092 KVM_BUG_ON(1, vmx->vcpu.kvm); 2093 #endif /* CONFIG_KVM_HYPERV */ 2094 } 2095 2096 /* 2097 * This is the equivalent of the nested hypervisor executing the vmptrld 2098 * instruction. 2099 */ 2100 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( 2101 struct kvm_vcpu *vcpu, bool from_launch) 2102 { 2103 #ifdef CONFIG_KVM_HYPERV 2104 struct vcpu_vmx *vmx = to_vmx(vcpu); 2105 bool evmcs_gpa_changed = false; 2106 u64 evmcs_gpa; 2107 2108 if (likely(!guest_cpu_cap_has_evmcs(vcpu))) 2109 return EVMPTRLD_DISABLED; 2110 2111 evmcs_gpa = nested_get_evmptr(vcpu); 2112 if (!evmptr_is_valid(evmcs_gpa)) { 2113 nested_release_evmcs(vcpu); 2114 return EVMPTRLD_DISABLED; 2115 } 2116 2117 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { 2118 vmx->nested.current_vmptr = INVALID_GPA; 2119 2120 nested_release_evmcs(vcpu); 2121 2122 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), 2123 &vmx->nested.hv_evmcs_map)) 2124 return EVMPTRLD_ERROR; 2125 2126 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; 2127 2128 /* 2129 * Currently, KVM only supports eVMCS version 1 2130 * (== KVM_EVMCS_VERSION) and thus expects the guest to set this 2131 * value in the first u32 field of the eVMCS, which specifies the 2132 * eVMCS VersionNumber. 2133 * 2134 * The guest can discover the eVMCS versions supported by the host 2135 * by examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM 2136 * is expected to set this CPUID leaf according to the value 2137 * returned in vmcs_version from nested_enable_evmcs(). 2138 * 2139 * However, it turns out that Microsoft Hyper-V fails to comply 2140 * with its own invented interface: when Hyper-V uses eVMCS, it 2141 * sets the first u32 field of the eVMCS to the revision_id 2142 * specified in MSR_IA32_VMX_BASIC, instead of the eVMCS version 2143 * number, i.e. one of the supported versions specified in 2144 * CPUID.0x4000000A.EAX[0:15]. 2145 * 2146 * To work around this Hyper-V bug, accept either a supported 2147 * eVMCS version or the VMCS12 revision_id as valid values for the 2148 * first u32 field of the eVMCS. 2149 */ 2150 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && 2151 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { 2152 nested_release_evmcs(vcpu); 2153 return EVMPTRLD_VMFAIL; 2154 } 2155 2156 vmx->nested.hv_evmcs_vmptr = evmcs_gpa; 2157 2158 evmcs_gpa_changed = true; 2159 /* 2160 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully 2161 * reloaded from guest memory (read-only fields, fields not 2162 * present in struct hv_enlightened_vmcs, ...). Make sure there 2163 * are no leftovers. 2164 */ 2165 if (from_launch) { 2166 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2167 memset(vmcs12, 0, sizeof(*vmcs12)); 2168 vmcs12->hdr.revision_id = VMCS12_REVISION; 2169 } 2170 2171 } 2172 2173 /* 2174 * Clean fields data can't be used on VMLAUNCH or when we switch 2175 * between different L2 guests, as KVM keeps a single VMCS12 per L1.
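* In either case the cached vmcs12 no longer matches the state the clean fields were computed against, so all fields must be treated as dirty.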
2176 */ 2177 if (from_launch || evmcs_gpa_changed) { 2178 vmx->nested.hv_evmcs->hv_clean_fields &= 2179 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2180 2181 vmx->nested.force_msr_bitmap_recalc = true; 2182 } 2183 2184 return EVMPTRLD_SUCCEEDED; 2185 #else 2186 return EVMPTRLD_DISABLED; 2187 #endif 2188 } 2189 2190 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2191 { 2192 struct vcpu_vmx *vmx = to_vmx(vcpu); 2193 2194 if (nested_vmx_is_evmptr12_valid(vmx)) 2195 copy_vmcs12_to_enlightened(vmx); 2196 else 2197 copy_vmcs12_to_shadow(vmx); 2198 2199 vmx->nested.need_vmcs12_to_shadow_sync = false; 2200 } 2201 2202 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2203 { 2204 struct vcpu_vmx *vmx = 2205 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2206 2207 vmx->nested.preemption_timer_expired = true; 2208 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2209 kvm_vcpu_kick(&vmx->vcpu); 2210 2211 return HRTIMER_NORESTART; 2212 } 2213 2214 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2215 { 2216 struct vcpu_vmx *vmx = to_vmx(vcpu); 2217 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2218 2219 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2220 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2221 2222 if (!vmx->nested.has_preemption_timer_deadline) { 2223 vmx->nested.preemption_timer_deadline = 2224 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2225 vmx->nested.has_preemption_timer_deadline = true; 2226 } 2227 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2228 } 2229 2230 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2231 u64 preemption_timeout) 2232 { 2233 struct vcpu_vmx *vmx = to_vmx(vcpu); 2234 2235 /* 2236 * A timer value of zero is architecturally guaranteed to cause 2237 * a VMExit prior to executing any instructions in the guest. 2238 */ 2239 if (preemption_timeout == 0) { 2240 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2241 return; 2242 } 2243 2244 if (vcpu->arch.virtual_tsc_khz == 0) 2245 return; 2246 2247 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2248 preemption_timeout *= 1000000; 2249 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2250 hrtimer_start(&vmx->nested.preemption_timer, 2251 ktime_add_ns(ktime_get(), preemption_timeout), 2252 HRTIMER_MODE_ABS_PINNED); 2253 } 2254 2255 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2256 { 2257 if (vmx->nested.nested_run_pending && 2258 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2259 return vmcs12->guest_ia32_efer; 2260 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2261 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2262 else 2263 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2264 } 2265 2266 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2267 { 2268 struct kvm *kvm = vmx->vcpu.kvm; 2269 2270 /* 2271 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2272 * according to L0's settings (vmcs12 is irrelevant here). Host 2273 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2274 * will be set as needed prior to VMLAUNCH/VMRESUME. 2275 */ 2276 if (vmx->nested.vmcs02_initialized) 2277 return; 2278 vmx->nested.vmcs02_initialized = true; 2279 2280 /* 2281 * We don't care what the EPTP value is we just need to guarantee 2282 * it's valid so we don't get a false positive when doing early 2283 * consistency checks. 
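* Note, this is only a placeholder to satisfy the early checks; the real EPTP for L2 is installed when its MMU is loaded, before L2 actually runs.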
2284 */ 2285 if (enable_ept && nested_early_check) 2286 vmcs_write64(EPT_POINTER, 2287 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); 2288 2289 if (vmx->ve_info) 2290 vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); 2291 2292 /* All VMFUNCs are currently emulated through L0 vmexits. */ 2293 if (cpu_has_vmx_vmfunc()) 2294 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2295 2296 if (cpu_has_vmx_posted_intr()) 2297 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2298 2299 if (cpu_has_vmx_msr_bitmap()) 2300 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2301 2302 /* 2303 * PML is emulated for L2, but never enabled in hardware as the MMU 2304 * handles A/D emulation. Disabling PML for L2 also avoids having to 2305 * deal with filtering out L2 GPAs from the buffer. 2306 */ 2307 if (enable_pml) { 2308 vmcs_write64(PML_ADDRESS, 0); 2309 vmcs_write16(GUEST_PML_INDEX, -1); 2310 } 2311 2312 if (cpu_has_vmx_encls_vmexit()) 2313 vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); 2314 2315 if (kvm_notify_vmexit_enabled(kvm)) 2316 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 2317 2318 /* 2319 * Set the MSR load/store lists to match L0's settings. Only the 2320 * addresses are constant (for vmcs02), the counts can change based 2321 * on L2's behavior, e.g. switching to/from long mode. 2322 */ 2323 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2324 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2325 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2326 2327 vmx_set_constant_host_state(vmx); 2328 } 2329 2330 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2331 struct vmcs12 *vmcs12) 2332 { 2333 prepare_vmcs02_constant_state(vmx); 2334 2335 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 2336 2337 /* 2338 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the 2339 * same VPID as the host. Emulate this behavior by using vpid01 for L2 2340 * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter 2341 * and VM-Exit are architecturally required to flush VPID=0, but *only* 2342 * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the 2343 * required flushes), but doing so would cause KVM to over-flush. E.g. 2344 * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled, 2345 * and then runs L2 X again, then KVM can and should retain TLB entries 2346 * for VPID12=1. 2347 */ 2348 if (enable_vpid) { 2349 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2350 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2351 else 2352 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2353 } 2354 } 2355 2356 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, 2357 struct vmcs12 *vmcs12) 2358 { 2359 u32 exec_control; 2360 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2361 2362 if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) 2363 prepare_vmcs02_early_rare(vmx, vmcs12); 2364 2365 /* 2366 * PIN CONTROLS 2367 */ 2368 exec_control = __pin_controls_get(vmcs01); 2369 exec_control |= (vmcs12->pin_based_vm_exec_control & 2370 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2371 2372 /* Posted interrupts setting is only taken from vmcs12. 
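If L1 hasn't enabled posted interrupts for L2, the control inherited from vmcs01 is cleared even if L0 itself uses posted interrupts.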
*/ 2373 vmx->nested.pi_pending = false; 2374 if (nested_cpu_has_posted_intr(vmcs12)) { 2375 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2376 } else { 2377 vmx->nested.posted_intr_nv = -1; 2378 exec_control &= ~PIN_BASED_POSTED_INTR; 2379 } 2380 pin_controls_set(vmx, exec_control); 2381 2382 /* 2383 * EXEC CONTROLS 2384 */ 2385 exec_control = __exec_controls_get(vmcs01); /* L0's desires */ 2386 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2387 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2388 exec_control &= ~CPU_BASED_TPR_SHADOW; 2389 exec_control |= vmcs12->cpu_based_vm_exec_control; 2390 2391 vmx->nested.l1_tpr_threshold = -1; 2392 if (exec_control & CPU_BASED_TPR_SHADOW) 2393 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2394 #ifdef CONFIG_X86_64 2395 else 2396 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2397 CPU_BASED_CR8_STORE_EXITING; 2398 #endif 2399 2400 /* 2401 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2402 * for I/O port accesses. 2403 */ 2404 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2405 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2406 2407 /* 2408 * This bit will be computed in nested_get_vmcs12_pages, because 2409 * we do not have access to L1's MSR bitmap yet. For now, keep 2410 * the same bit as before, hoping to avoid multiple VMWRITEs that 2411 * only set/clear this bit. 2412 */ 2413 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2414 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2415 2416 exec_controls_set(vmx, exec_control); 2417 2418 /* 2419 * SECONDARY EXEC CONTROLS 2420 */ 2421 if (cpu_has_secondary_exec_ctrls()) { 2422 exec_control = __secondary_exec_controls_get(vmcs01); 2423 2424 /* Take the following fields only from vmcs12 */ 2425 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2426 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2427 SECONDARY_EXEC_ENABLE_INVPCID | 2428 SECONDARY_EXEC_ENABLE_RDTSCP | 2429 SECONDARY_EXEC_ENABLE_XSAVES | 2430 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2431 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2432 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2433 SECONDARY_EXEC_ENABLE_VMFUNC | 2434 SECONDARY_EXEC_DESC); 2435 2436 if (nested_cpu_has(vmcs12, 2437 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2438 exec_control |= vmcs12->secondary_vm_exec_control; 2439 2440 /* PML is emulated and never enabled in hardware for L2. */ 2441 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2442 2443 /* VMCS shadowing for L2 is emulated for now */ 2444 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2445 2446 /* 2447 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2448 * will not have to rewrite the controls just for this bit. 2449 */ 2450 if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2451 exec_control |= SECONDARY_EXEC_DESC; 2452 2453 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2454 vmcs_write16(GUEST_INTR_STATUS, 2455 vmcs12->guest_intr_status); 2456 2457 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2458 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2459 2460 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2461 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2462 2463 secondary_exec_controls_set(vmx, exec_control); 2464 } 2465 2466 /* 2467 * ENTRY CONTROLS 2468 * 2469 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2470 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2471 * on the related bits (if supported by the CPU) in the hope that 2472 * we can avoid VMWrites during vmx_set_efer(). 
2473 * 2474 * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is 2475 * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to 2476 * do the same for L2. 2477 */ 2478 exec_control = __vm_entry_controls_get(vmcs01); 2479 exec_control |= (vmcs12->vm_entry_controls & 2480 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 2481 exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); 2482 if (cpu_has_load_ia32_efer()) { 2483 if (guest_efer & EFER_LMA) 2484 exec_control |= VM_ENTRY_IA32E_MODE; 2485 if (guest_efer != kvm_host.efer) 2486 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2487 } 2488 vm_entry_controls_set(vmx, exec_control); 2489 2490 /* 2491 * EXIT CONTROLS 2492 * 2493 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2494 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2495 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 2496 */ 2497 exec_control = __vm_exit_controls_get(vmcs01); 2498 if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer) 2499 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2500 else 2501 exec_control &= ~VM_EXIT_LOAD_IA32_EFER; 2502 vm_exit_controls_set(vmx, exec_control); 2503 2504 /* 2505 * Interrupt/Exception Fields 2506 */ 2507 if (vmx->nested.nested_run_pending) { 2508 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2509 vmcs12->vm_entry_intr_info_field); 2510 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2511 vmcs12->vm_entry_exception_error_code); 2512 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2513 vmcs12->vm_entry_instruction_len); 2514 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2515 vmcs12->guest_interruptibility_info); 2516 vmx->loaded_vmcs->nmi_known_unmasked = 2517 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2518 } else { 2519 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2520 } 2521 } 2522 2523 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2524 { 2525 struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); 2526 2527 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2528 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2529 2530 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2531 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2532 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2533 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2534 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2535 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2536 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2537 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2538 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2539 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2540 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2541 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2542 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2543 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2544 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2545 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2546 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2547 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2548 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2549 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2550 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2551 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2552 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2553 
vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2554 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2555 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2556 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2557 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2558 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2559 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2560 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2561 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2562 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2563 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2564 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2565 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2566 2567 vmx_segment_cache_clear(vmx); 2568 } 2569 2570 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2571 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 2572 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2573 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 2574 vmcs12->guest_pending_dbg_exceptions); 2575 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2576 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2577 2578 /* 2579 * L1 may access L2's PDPTRs, so save them in order to construct 2580 * vmcs12. 2581 */ 2582 if (enable_ept) { 2583 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2584 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2585 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2586 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2587 } 2588 2589 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2590 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2591 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2592 } 2593 2594 if (nested_cpu_has_xsaves(vmcs12)) 2595 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2596 2597 /* 2598 * Whether page-faults are trapped is determined by a combination of 2599 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 2600 * doesn't care about page faults then we should set all of these to 2601 * L1's desires. However, if L0 does care about (some) page faults, it 2602 * is not easy (if at all possible?) to merge L0's and L1's desires, so 2603 * we simply ask to exit on each and every L2 page fault. This is done 2604 * by setting MASK=MATCH=0 and (see below) EB.PF=1. 2605 * Note that below we don't need special code to set EB.PF beyond the 2606 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2607 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2608 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2609 */ 2610 if (vmx_need_pf_intercept(&vmx->vcpu)) { 2611 /* 2612 * TODO: if both L0 and L1 need the same MASK and MATCH, 2613 * go ahead and use it? 2614 */ 2615 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 2616 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 2617 } else { 2618 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); 2619 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); 2620 } 2621 2622 if (cpu_has_vmx_apicv()) { 2623 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2624 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2625 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2626 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2627 } 2628 2629 /* 2630 * Make sure the msr_autostore list is up to date before we set the 2631 * count in the vmcs02.
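* prepare_vmx_msr_autostore_list() only adds MSR_IA32_TSC to the autostore list if L1 wants the TSC stored on VM-Exit, so the store count below can change across nested entries.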
2632 */ 2633 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); 2634 2635 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); 2636 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2637 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2638 2639 set_cr4_guest_host_mask(vmx); 2640 } 2641 2642 /* 2643 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 2644 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 2645 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 2646 * guest in a way that is appropriate to both L1's requests and our own 2647 * needs. In addition to modifying the active vmcs (which is vmcs02), this 2648 * function also has necessary side effects, such as setting various 2649 * vcpu->arch fields. 2650 * Returns 0 on success and -EINVAL on failure, in which case the VM-Entry 2651 * failure code is assigned to *entry_failure_code. 2652 */ 2653 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 2654 bool from_vmentry, 2655 enum vm_entry_failure_code *entry_failure_code) 2656 { 2657 struct vcpu_vmx *vmx = to_vmx(vcpu); 2658 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 2659 bool load_guest_pdptrs_vmcs12 = false; 2660 2661 if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) { 2662 prepare_vmcs02_rare(vmx, vmcs12); 2663 vmx->nested.dirty_vmcs12 = false; 2664 2665 load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) || 2666 !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); 2667 } 2668 2669 if (vmx->nested.nested_run_pending && 2670 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 2671 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 2672 vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl & 2673 vmx_get_supported_debugctl(vcpu, false)); 2674 } else { 2675 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2676 vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl); 2677 } 2678 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2679 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 2680 vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); 2681 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 2682 2683 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 2684 * bitwise-or of what L1 wants to trap for L2, and what we want to 2685 * trap. Note that CR0.TS also needs updating - we do this later.
2686 */ 2687 vmx_update_exception_bitmap(vcpu); 2688 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 2689 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2690 2691 if (vmx->nested.nested_run_pending && 2692 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { 2693 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 2694 vcpu->arch.pat = vmcs12->guest_ia32_pat; 2695 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2696 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 2697 } 2698 2699 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( 2700 vcpu->arch.l1_tsc_offset, 2701 vmx_get_l2_tsc_offset(vcpu), 2702 vmx_get_l2_tsc_multiplier(vcpu)); 2703 2704 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( 2705 vcpu->arch.l1_tsc_scaling_ratio, 2706 vmx_get_l2_tsc_multiplier(vcpu)); 2707 2708 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2709 if (kvm_caps.has_tsc_control) 2710 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 2711 2712 nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); 2713 2714 if (nested_cpu_has_ept(vmcs12)) 2715 nested_ept_init_mmu_context(vcpu); 2716 2717 /* 2718 * Override the CR0/CR4 read shadows after setting the effective guest 2719 * CR0/CR4. The common helpers also set the shadows, but they don't 2720 * account for vmcs12's cr0/4_guest_host_mask. 2721 */ 2722 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 2723 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2724 2725 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 2726 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 2727 2728 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); 2729 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 2730 vmx_set_efer(vcpu, vcpu->arch.efer); 2731 2732 /* 2733 * Guest state is invalid and unrestricted guest is disabled, 2734 * which means L1 attempted VMEntry to L2 with invalid state. 2735 * Fail the VMEntry. 2736 * 2737 * However when force loading the guest state (SMM exit or 2738 * loading nested state after migration, it is possible to 2739 * have invalid guest state now, which will be later fixed by 2740 * restoring L2 register state 2741 */ 2742 if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) { 2743 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2744 return -EINVAL; 2745 } 2746 2747 /* Shadow page tables on either EPT or shadow page tables. */ 2748 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), 2749 from_vmentry, entry_failure_code)) 2750 return -EINVAL; 2751 2752 /* 2753 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 2754 * on nested VM-Exit, which can occur without actually running L2 and 2755 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with 2756 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the 2757 * transition to HLT instead of running L2. 2758 */ 2759 if (enable_ept) 2760 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); 2761 2762 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. 
*/ 2763 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2764 is_pae_paging(vcpu)) { 2765 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2766 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2767 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2768 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2769 } 2770 2771 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2772 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && 2773 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2774 vmcs12->guest_ia32_perf_global_ctrl))) { 2775 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2776 return -EINVAL; 2777 } 2778 2779 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2780 kvm_rip_write(vcpu, vmcs12->guest_rip); 2781 2782 /* 2783 * It was observed that genuine Hyper-V running in L1 doesn't reset 2784 * 'hv_clean_fields' by itself, it only sets the corresponding dirty 2785 * bits when it changes a field in eVMCS. Mark all fields as clean 2786 * here. 2787 */ 2788 if (nested_vmx_is_evmptr12_valid(vmx)) 2789 evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2790 2791 return 0; 2792 } 2793 2794 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2795 { 2796 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2797 nested_cpu_has_virtual_nmis(vmcs12))) 2798 return -EINVAL; 2799 2800 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2801 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2802 return -EINVAL; 2803 2804 return 0; 2805 } 2806 2807 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) 2808 { 2809 struct vcpu_vmx *vmx = to_vmx(vcpu); 2810 2811 /* Check for memory type validity */ 2812 switch (new_eptp & VMX_EPTP_MT_MASK) { 2813 case VMX_EPTP_MT_UC: 2814 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2815 return false; 2816 break; 2817 case VMX_EPTP_MT_WB: 2818 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2819 return false; 2820 break; 2821 default: 2822 return false; 2823 } 2824 2825 /* Page-walk levels validity. 
*/ 2826 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2827 case VMX_EPTP_PWL_5: 2828 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2829 return false; 2830 break; 2831 case VMX_EPTP_PWL_4: 2832 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2833 return false; 2834 break; 2835 default: 2836 return false; 2837 } 2838 2839 /* Reserved bits should not be set */ 2840 if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2841 return false; 2842 2843 /* AD, if set, should be supported */ 2844 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2845 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2846 return false; 2847 } 2848 2849 return true; 2850 } 2851 2852 /* 2853 * Checks related to VM-Execution Control Fields 2854 */ 2855 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2856 struct vmcs12 *vmcs12) 2857 { 2858 struct vcpu_vmx *vmx = to_vmx(vcpu); 2859 2860 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2861 vmx->nested.msrs.pinbased_ctls_low, 2862 vmx->nested.msrs.pinbased_ctls_high)) || 2863 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2864 vmx->nested.msrs.procbased_ctls_low, 2865 vmx->nested.msrs.procbased_ctls_high))) 2866 return -EINVAL; 2867 2868 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2869 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2870 vmx->nested.msrs.secondary_ctls_low, 2871 vmx->nested.msrs.secondary_ctls_high))) 2872 return -EINVAL; 2873 2874 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2875 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2876 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2877 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2878 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2879 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2880 nested_vmx_check_nmi_controls(vmcs12) || 2881 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2882 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2883 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2884 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2885 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2886 return -EINVAL; 2887 2888 if (!nested_cpu_has_preemption_timer(vmcs12) && 2889 nested_cpu_has_save_preemption_timer(vmcs12)) 2890 return -EINVAL; 2891 2892 if (nested_cpu_has_ept(vmcs12) && 2893 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2894 return -EINVAL; 2895 2896 if (nested_cpu_has_vmfunc(vmcs12)) { 2897 if (CC(vmcs12->vm_function_control & 2898 ~vmx->nested.msrs.vmfunc_controls)) 2899 return -EINVAL; 2900 2901 if (nested_cpu_has_eptp_switching(vmcs12)) { 2902 if (CC(!nested_cpu_has_ept(vmcs12)) || 2903 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2904 return -EINVAL; 2905 } 2906 } 2907 2908 return 0; 2909 } 2910 2911 /* 2912 * Checks related to VM-Exit Control Fields 2913 */ 2914 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2915 struct vmcs12 *vmcs12) 2916 { 2917 struct vcpu_vmx *vmx = to_vmx(vcpu); 2918 2919 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2920 vmx->nested.msrs.exit_ctls_low, 2921 vmx->nested.msrs.exit_ctls_high)) || 2922 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2923 return -EINVAL; 2924 2925 return 0; 2926 } 2927 2928 /* 2929 * Checks related to VM-Entry Control Fields 2930 */ 2931 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2932 struct vmcs12 *vmcs12) 2933 { 2934 struct 
vcpu_vmx *vmx = to_vmx(vcpu); 2935 2936 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2937 vmx->nested.msrs.entry_ctls_low, 2938 vmx->nested.msrs.entry_ctls_high))) 2939 return -EINVAL; 2940 2941 /* 2942 * From the Intel SDM, volume 3: 2943 * Fields relevant to VM-entry event injection must be set properly. 2944 * These fields are the VM-entry interruption-information field, the 2945 * VM-entry exception error code, and the VM-entry instruction length. 2946 */ 2947 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2948 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2949 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2950 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2951 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2952 bool should_have_error_code; 2953 bool urg = nested_cpu_has2(vmcs12, 2954 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2955 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2956 2957 /* VM-entry interruption-info field: interruption type */ 2958 if (CC(intr_type == INTR_TYPE_RESERVED) || 2959 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2960 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2961 return -EINVAL; 2962 2963 /* VM-entry interruption-info field: vector */ 2964 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2965 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2966 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2967 return -EINVAL; 2968 2969 /* VM-entry interruption-info field: deliver error code */ 2970 should_have_error_code = 2971 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2972 x86_exception_has_error_code(vector); 2973 if (CC(has_error_code != should_have_error_code)) 2974 return -EINVAL; 2975 2976 /* VM-entry exception error code */ 2977 if (CC(has_error_code && 2978 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2979 return -EINVAL; 2980 2981 /* VM-entry interruption-info field: reserved bits */ 2982 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2983 return -EINVAL; 2984 2985 /* VM-entry instruction length */ 2986 switch (intr_type) { 2987 case INTR_TYPE_SOFT_EXCEPTION: 2988 case INTR_TYPE_SOFT_INTR: 2989 case INTR_TYPE_PRIV_SW_EXCEPTION: 2990 if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) || 2991 CC(vmcs12->vm_entry_instruction_len == 0 && 2992 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2993 return -EINVAL; 2994 } 2995 } 2996 2997 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2998 return -EINVAL; 2999 3000 return 0; 3001 } 3002 3003 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 3004 struct vmcs12 *vmcs12) 3005 { 3006 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 3007 nested_check_vm_exit_controls(vcpu, vmcs12) || 3008 nested_check_vm_entry_controls(vcpu, vmcs12)) 3009 return -EINVAL; 3010 3011 #ifdef CONFIG_KVM_HYPERV 3012 if (guest_cpu_cap_has_evmcs(vcpu)) 3013 return nested_evmcs_check_controls(vmcs12); 3014 #endif 3015 3016 return 0; 3017 } 3018 3019 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, 3020 struct vmcs12 *vmcs12) 3021 { 3022 #ifdef CONFIG_X86_64 3023 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != 3024 !!(vcpu->arch.efer & EFER_LMA))) 3025 return -EINVAL; 3026 #endif 3027 return 0; 3028 } 3029 3030 static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12) 3031 { 3032 /* 3033 * Check that the given linear address is canonical after a VM exit 3034 * from L2, based on HOST_CR4.LA57 value that will be loaded 
for L1. 3035 */ 3036 u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 57 : 48; 3037 3038 return !__is_canonical_address(la, l1_address_bits_on_exit); 3039 } 3040 3041 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 3042 struct vmcs12 *vmcs12) 3043 { 3044 bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); 3045 3046 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 3047 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 3048 CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) 3049 return -EINVAL; 3050 3051 if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 3052 CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 3053 return -EINVAL; 3054 3055 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 3056 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 3057 return -EINVAL; 3058 3059 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 3060 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3061 vmcs12->host_ia32_perf_global_ctrl))) 3062 return -EINVAL; 3063 3064 if (ia32e) { 3065 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 3066 return -EINVAL; 3067 } else { 3068 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 3069 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 3070 CC((vmcs12->host_rip) >> 32)) 3071 return -EINVAL; 3072 } 3073 3074 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3075 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3076 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3077 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3078 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3079 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3080 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3081 CC(vmcs12->host_cs_selector == 0) || 3082 CC(vmcs12->host_tr_selector == 0) || 3083 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 3084 return -EINVAL; 3085 3086 if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) || 3087 CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) || 3088 CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) || 3089 CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) || 3090 CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) || 3091 CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12))) 3092 return -EINVAL; 3093 3094 /* 3095 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 3096 * IA32_EFER MSR must be 0 in the field for that register. In addition, 3097 * the values of the LMA and LME bits in the field must each be that of 3098 * the host address-space size VM-exit control. 
3099 */ 3100 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 3101 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 3102 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 3103 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 3104 return -EINVAL; 3105 } 3106 3107 return 0; 3108 } 3109 3110 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 3111 struct vmcs12 *vmcs12) 3112 { 3113 struct vcpu_vmx *vmx = to_vmx(vcpu); 3114 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 3115 struct vmcs_hdr hdr; 3116 3117 if (vmcs12->vmcs_link_pointer == INVALID_GPA) 3118 return 0; 3119 3120 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 3121 return -EINVAL; 3122 3123 if (ghc->gpa != vmcs12->vmcs_link_pointer && 3124 CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 3125 vmcs12->vmcs_link_pointer, VMCS12_SIZE))) 3126 return -EINVAL; 3127 3128 if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 3129 offsetof(struct vmcs12, hdr), 3130 sizeof(hdr)))) 3131 return -EINVAL; 3132 3133 if (CC(hdr.revision_id != VMCS12_REVISION) || 3134 CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 3135 return -EINVAL; 3136 3137 return 0; 3138 } 3139 3140 /* 3141 * Checks related to Guest Non-register State 3142 */ 3143 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 3144 { 3145 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 3146 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 3147 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 3148 return -EINVAL; 3149 3150 return 0; 3151 } 3152 3153 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 3154 struct vmcs12 *vmcs12, 3155 enum vm_entry_failure_code *entry_failure_code) 3156 { 3157 bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); 3158 3159 *entry_failure_code = ENTRY_FAIL_DEFAULT; 3160 3161 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 3162 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 3163 return -EINVAL; 3164 3165 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 3166 (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) || 3167 CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false)))) 3168 return -EINVAL; 3169 3170 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 3171 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 3172 return -EINVAL; 3173 3174 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 3175 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 3176 return -EINVAL; 3177 } 3178 3179 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 3180 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3181 vmcs12->guest_ia32_perf_global_ctrl))) 3182 return -EINVAL; 3183 3184 if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) 3185 return -EINVAL; 3186 3187 if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || 3188 CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) 3189 return -EINVAL; 3190 3191 /* 3192 * If the load IA32_EFER VM-entry control is 1, the following checks 3193 * are performed on the field for the IA32_EFER MSR: 3194 * - Bits reserved in the IA32_EFER MSR must be 0. 3195 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 3196 * the IA-32e mode guest VM-exit control. It must also be identical 3197 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 3198 * CR0.PG) is 1. 
3199 */ 3200 if (to_vmx(vcpu)->nested.nested_run_pending && 3201 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 3202 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 3203 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 3204 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 3205 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 3206 return -EINVAL; 3207 } 3208 3209 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 3210 (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3211 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3212 return -EINVAL; 3213 3214 if (nested_check_guest_non_reg_state(vmcs12)) 3215 return -EINVAL; 3216 3217 return 0; 3218 } 3219 3220 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 3221 { 3222 struct vcpu_vmx *vmx = to_vmx(vcpu); 3223 unsigned long cr3, cr4; 3224 bool vm_fail; 3225 3226 if (!nested_early_check) 3227 return 0; 3228 3229 if (vmx->msr_autoload.host.nr) 3230 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3231 if (vmx->msr_autoload.guest.nr) 3232 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3233 3234 preempt_disable(); 3235 3236 vmx_prepare_switch_to_guest(vcpu); 3237 3238 /* 3239 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3240 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3241 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3242 * there is no need to preserve other bits or save/restore the field. 3243 */ 3244 vmcs_writel(GUEST_RFLAGS, 0); 3245 3246 cr3 = __get_current_cr3_fast(); 3247 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3248 vmcs_writel(HOST_CR3, cr3); 3249 vmx->loaded_vmcs->host_state.cr3 = cr3; 3250 } 3251 3252 cr4 = cr4_read_shadow(); 3253 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3254 vmcs_writel(HOST_CR4, cr4); 3255 vmx->loaded_vmcs->host_state.cr4 = cr4; 3256 } 3257 3258 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 3259 __vmx_vcpu_run_flags(vmx)); 3260 3261 if (vmx->msr_autoload.host.nr) 3262 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3263 if (vmx->msr_autoload.guest.nr) 3264 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3265 3266 if (vm_fail) { 3267 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3268 3269 preempt_enable(); 3270 3271 trace_kvm_nested_vmenter_failed( 3272 "early hardware check VM-instruction error: ", error); 3273 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3274 return 1; 3275 } 3276 3277 /* 3278 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3279 */ 3280 if (hw_breakpoint_active()) 3281 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3282 local_irq_enable(); 3283 preempt_enable(); 3284 3285 /* 3286 * A non-failing VMEntry means we somehow entered guest mode with 3287 * an illegal RIP, and that's just the tip of the iceberg. There 3288 * is no telling what memory has been modified or what state has 3289 * been exposed to unknown code. Hitting this all but guarantees 3290 * a (very critical) hardware issue. 3291 */ 3292 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3293 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3294 3295 return 0; 3296 } 3297 3298 #ifdef CONFIG_KVM_HYPERV 3299 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3300 { 3301 struct vcpu_vmx *vmx = to_vmx(vcpu); 3302 3303 /* 3304 * hv_evmcs may end up being not mapped after migration (when 3305 * L2 was running), map it here to make sure vmcs12 changes are 3306 * properly reflected. 
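* The map can't be re-established during KVM_SET_NESTED_STATE itself, as the guest memory map may not be finalized at that point.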
3307 */ 3308 if (guest_cpu_cap_has_evmcs(vcpu) && 3309 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { 3310 enum nested_evmptrld_status evmptrld_status = 3311 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3312 3313 if (evmptrld_status == EVMPTRLD_VMFAIL || 3314 evmptrld_status == EVMPTRLD_ERROR) 3315 return false; 3316 3317 /* 3318 * Post migration VMCS12 always provides the most actual 3319 * information, copy it to eVMCS upon entry. 3320 */ 3321 vmx->nested.need_vmcs12_to_shadow_sync = true; 3322 } 3323 3324 return true; 3325 } 3326 #endif 3327 3328 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3329 { 3330 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3331 struct vcpu_vmx *vmx = to_vmx(vcpu); 3332 struct kvm_host_map *map; 3333 3334 if (!vcpu->arch.pdptrs_from_userspace && 3335 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3336 /* 3337 * Reload the guest's PDPTRs since after a migration 3338 * the guest CR3 might be restored prior to setting the nested 3339 * state which can lead to a load of wrong PDPTRs. 3340 */ 3341 if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) 3342 return false; 3343 } 3344 3345 3346 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3347 map = &vmx->nested.apic_access_page_map; 3348 3349 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { 3350 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); 3351 } else { 3352 pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", 3353 __func__); 3354 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3355 vcpu->run->internal.suberror = 3356 KVM_INTERNAL_ERROR_EMULATION; 3357 vcpu->run->internal.ndata = 0; 3358 return false; 3359 } 3360 } 3361 3362 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3363 map = &vmx->nested.virtual_apic_map; 3364 3365 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3366 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3367 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3368 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3369 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3370 /* 3371 * The processor will never use the TPR shadow, simply 3372 * clear the bit from the execution control. Such a 3373 * configuration is useless, but it happens in tests. 3374 * For any other configuration, failing the vm entry is 3375 * _not_ what the processor does but it's basically the 3376 * only possibility we have. 3377 */ 3378 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3379 } else { 3380 /* 3381 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3382 * force VM-Entry to fail. 3383 */ 3384 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); 3385 } 3386 } 3387 3388 if (nested_cpu_has_posted_intr(vmcs12)) { 3389 map = &vmx->nested.pi_desc_map; 3390 3391 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3392 vmx->nested.pi_desc = 3393 (struct pi_desc *)(((void *)map->hva) + 3394 offset_in_page(vmcs12->posted_intr_desc_addr)); 3395 vmcs_write64(POSTED_INTR_DESC_ADDR, 3396 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3397 } else { 3398 /* 3399 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3400 * access the contents of the VMCS12 posted interrupt 3401 * descriptor. (Note that KVM may do this when it 3402 * should not, per the architectural specification.) 
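 *
 * Concretely: pi_desc is left NULL and the posted-interrupt pin
 * control is cleared below, so the deferred error only surfaces if
 * vmx_complete_nested_posted_interrupt() actually needs the
 * descriptor.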
3403 */ 3404 vmx->nested.pi_desc = NULL; 3405 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3406 } 3407 } 3408 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3409 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3410 else 3411 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3412 3413 return true; 3414 } 3415 3416 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3417 { 3418 #ifdef CONFIG_KVM_HYPERV 3419 /* 3420 * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy 3421 * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory 3422 * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post 3423 * migration. 3424 */ 3425 if (!nested_get_evmcs_page(vcpu)) { 3426 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3427 __func__); 3428 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3429 vcpu->run->internal.suberror = 3430 KVM_INTERNAL_ERROR_EMULATION; 3431 vcpu->run->internal.ndata = 0; 3432 3433 return false; 3434 } 3435 #endif 3436 3437 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3438 return false; 3439 3440 return true; 3441 } 3442 3443 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3444 { 3445 struct vmcs12 *vmcs12; 3446 struct vcpu_vmx *vmx = to_vmx(vcpu); 3447 gpa_t dst; 3448 3449 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3450 return 0; 3451 3452 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3453 return 1; 3454 3455 /* 3456 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3457 * set is already checked as part of A/D emulation. 3458 */ 3459 vmcs12 = get_vmcs12(vcpu); 3460 if (!nested_cpu_has_pml(vmcs12)) 3461 return 0; 3462 3463 if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) { 3464 vmx->nested.pml_full = true; 3465 return 1; 3466 } 3467 3468 gpa &= ~0xFFFull; 3469 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3470 3471 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3472 offset_in_page(dst), sizeof(gpa))) 3473 return 0; 3474 3475 vmcs12->guest_pml_index--; 3476 3477 return 0; 3478 } 3479 3480 /* 3481 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3482 * for running VMX instructions (except VMXON, whose prerequisites are 3483 * slightly different). It also specifies what exception to inject otherwise. 3484 * Note that many of these exceptions have priority over VM exits, so they 3485 * don't have to be checked again here. 3486 */ 3487 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3488 { 3489 if (!to_vmx(vcpu)->nested.vmxon) { 3490 kvm_queue_exception(vcpu, UD_VECTOR); 3491 return 0; 3492 } 3493 3494 if (vmx_get_cpl(vcpu)) { 3495 kvm_inject_gp(vcpu, 0); 3496 return 0; 3497 } 3498 3499 return 1; 3500 } 3501 3502 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3503 struct vmcs12 *vmcs12); 3504 3505 /* 3506 * If from_vmentry is false, this is being called from state restore (either RSM 3507 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 
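 *
 * In the from_vmentry=false case the vmcs12 pages are not mapped
 * here; that work is deferred via KVM_REQ_GET_NESTED_STATE_PAGES
 * (see the !from_vmentry branch below).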
3508 * 3509 * Returns: 3510 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3511 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3512 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3513 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3514 */ 3515 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3516 bool from_vmentry) 3517 { 3518 struct vcpu_vmx *vmx = to_vmx(vcpu); 3519 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3520 enum vm_entry_failure_code entry_failure_code; 3521 union vmx_exit_reason exit_reason = { 3522 .basic = EXIT_REASON_INVALID_STATE, 3523 .failed_vmentry = 1, 3524 }; 3525 u32 failed_index; 3526 3527 trace_kvm_nested_vmenter(kvm_rip_read(vcpu), 3528 vmx->nested.current_vmptr, 3529 vmcs12->guest_rip, 3530 vmcs12->guest_intr_status, 3531 vmcs12->vm_entry_intr_info_field, 3532 vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, 3533 vmcs12->ept_pointer, 3534 vmcs12->guest_cr3, 3535 KVM_ISA_VMX); 3536 3537 kvm_service_local_tlb_flush_requests(vcpu); 3538 3539 if (!vmx->nested.nested_run_pending || 3540 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3541 vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read(); 3542 if (kvm_mpx_supported() && 3543 (!vmx->nested.nested_run_pending || 3544 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 3545 vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3546 3547 /* 3548 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3549 * nested early checks are disabled. In the event of a "late" VM-Fail, 3550 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3551 * software model to the pre-VMEntry host state. When EPT is disabled, 3552 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3553 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3554 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3555 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3556 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3557 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3558 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3559 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3560 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3561 * path would need to manually save/restore vmcs01.GUEST_CR3. 
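 *
 * In short: stuffing L1's CR3 into vmcs01.GUEST_CR3 here lets a late
 * VM-Fail unwind via nested_vmx_restore_host_state() without
 * clobbering vcpu->arch.cr3 with a shadow CR3.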
3562 */ 3563 if (!enable_ept && !nested_early_check) 3564 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3565 3566 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3567 3568 prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); 3569 3570 if (from_vmentry) { 3571 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3572 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3573 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3574 } 3575 3576 if (nested_vmx_check_vmentry_hw(vcpu)) { 3577 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3578 return NVMX_VMENTRY_VMFAIL; 3579 } 3580 3581 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3582 &entry_failure_code)) { 3583 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3584 vmcs12->exit_qualification = entry_failure_code; 3585 goto vmentry_fail_vmexit; 3586 } 3587 } 3588 3589 enter_guest_mode(vcpu); 3590 3591 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { 3592 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3593 vmcs12->exit_qualification = entry_failure_code; 3594 goto vmentry_fail_vmexit_guest_mode; 3595 } 3596 3597 if (from_vmentry) { 3598 failed_index = nested_vmx_load_msr(vcpu, 3599 vmcs12->vm_entry_msr_load_addr, 3600 vmcs12->vm_entry_msr_load_count); 3601 if (failed_index) { 3602 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3603 vmcs12->exit_qualification = failed_index; 3604 goto vmentry_fail_vmexit_guest_mode; 3605 } 3606 } else { 3607 /* 3608 * The MMU is not initialized to point at the right entities yet and 3609 * "get pages" would need to read data from the guest (i.e. we will 3610 * need to perform gpa to hpa translation). Request a call 3611 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3612 * have already been set at vmentry time and should not be reset. 3613 */ 3614 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3615 } 3616 3617 /* 3618 * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI 3619 * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can 3620 * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit 3621 * unconditionally. Take care to pull data from vmcs01 as appropriate, 3622 * e.g. when checking for interrupt windows, as vmcs02 is now loaded. 3623 */ 3624 if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING | 3625 CPU_BASED_NMI_WINDOW_EXITING)) || 3626 kvm_apic_has_pending_init_or_sipi(vcpu) || 3627 kvm_apic_has_interrupt(vcpu)) 3628 kvm_make_request(KVM_REQ_EVENT, vcpu); 3629 3630 /* 3631 * Do not start the preemption timer hrtimer until after we know 3632 * we are successful, so that only nested_vmx_vmexit needs to cancel 3633 * the timer. 3634 */ 3635 vmx->nested.preemption_timer_expired = false; 3636 if (nested_cpu_has_preemption_timer(vmcs12)) { 3637 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3638 vmx_start_preemption_timer(vcpu, timer_value); 3639 } 3640 3641 /* 3642 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3643 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3644 * returned as far as L1 is concerned. It will only return (and set 3645 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3646 */ 3647 return NVMX_VMENTRY_SUCCESS; 3648 3649 /* 3650 * A failed consistency check that leads to a VMExit during L1's 3651 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3652 * 26.7 "VM-entry failures during or after loading guest state". 
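 *
 * vmentry_fail_vmexit_guest_mode additionally unwinds the TSC offset
 * and leaves guest mode before falling through to the common
 * vmentry_fail_vmexit handling.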
3653 */ 3654 vmentry_fail_vmexit_guest_mode: 3655 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3656 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3657 leave_guest_mode(vcpu); 3658 3659 vmentry_fail_vmexit: 3660 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3661 3662 if (!from_vmentry) 3663 return NVMX_VMENTRY_VMEXIT; 3664 3665 load_vmcs12_host_state(vcpu, vmcs12); 3666 vmcs12->vm_exit_reason = exit_reason.full; 3667 if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)) 3668 vmx->nested.need_vmcs12_to_shadow_sync = true; 3669 return NVMX_VMENTRY_VMEXIT; 3670 } 3671 3672 /* 3673 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3674 * for running an L2 nested guest. 3675 */ 3676 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3677 { 3678 struct vmcs12 *vmcs12; 3679 enum nvmx_vmentry_status status; 3680 struct vcpu_vmx *vmx = to_vmx(vcpu); 3681 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3682 enum nested_evmptrld_status evmptrld_status; 3683 3684 if (!nested_vmx_check_permission(vcpu)) 3685 return 1; 3686 3687 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3688 if (evmptrld_status == EVMPTRLD_ERROR) { 3689 kvm_queue_exception(vcpu, UD_VECTOR); 3690 return 1; 3691 } 3692 3693 kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); 3694 3695 if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) 3696 return nested_vmx_failInvalid(vcpu); 3697 3698 if (CC(!nested_vmx_is_evmptr12_valid(vmx) && 3699 vmx->nested.current_vmptr == INVALID_GPA)) 3700 return nested_vmx_failInvalid(vcpu); 3701 3702 vmcs12 = get_vmcs12(vcpu); 3703 3704 /* 3705 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3706 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3707 * rather than RFLAGS.ZF, and no error number is stored to the 3708 * VM-instruction error field. 3709 */ 3710 if (CC(vmcs12->hdr.shadow_vmcs)) 3711 return nested_vmx_failInvalid(vcpu); 3712 3713 if (nested_vmx_is_evmptr12_valid(vmx)) { 3714 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 3715 3716 copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields); 3717 /* Enlightened VMCS doesn't have launch state */ 3718 vmcs12->launch_state = !launch; 3719 } else if (enable_shadow_vmcs) { 3720 copy_shadow_to_vmcs12(vmx); 3721 } 3722 3723 /* 3724 * The nested entry process starts with enforcing various prerequisites 3725 * on vmcs12 as required by the Intel SDM, and act appropriately when 3726 * they fail: As the SDM explains, some conditions should cause the 3727 * instruction to fail, while others will cause the instruction to seem 3728 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3729 * To speed up the normal (success) code path, we should avoid checking 3730 * for misconfigurations which will anyway be caught by the processor 3731 * when using the merged vmcs02. 3732 */ 3733 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) 3734 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3735 3736 if (CC(vmcs12->launch_state == launch)) 3737 return nested_vmx_fail(vcpu, 3738 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS 3739 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3740 3741 if (nested_vmx_check_controls(vcpu, vmcs12)) 3742 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3743 3744 if (nested_vmx_check_address_space_size(vcpu, vmcs12)) 3745 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3746 3747 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3748 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3749 3750 /* 3751 * We're finally done with prerequisite checking, and can start with 3752 * the nested entry. 3753 */ 3754 vmx->nested.nested_run_pending = 1; 3755 vmx->nested.has_preemption_timer_deadline = false; 3756 status = nested_vmx_enter_non_root_mode(vcpu, true); 3757 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3758 goto vmentry_failed; 3759 3760 /* Hide L1D cache contents from the nested guest. */ 3761 vmx->vcpu.arch.l1tf_flush_l1d = true; 3762 3763 /* 3764 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3765 * also be used as part of restoring nVMX state for 3766 * snapshot restore (migration). 3767 * 3768 * In this flow, it is assumed that vmcs12 cache was 3769 * transferred as part of captured nVMX state and should 3770 * therefore not be read from guest memory (which may not 3771 * exist on destination host yet). 3772 */ 3773 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3774 3775 switch (vmcs12->guest_activity_state) { 3776 case GUEST_ACTIVITY_HLT: 3777 /* 3778 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3779 * awakened by event injection or by an NMI-window VM-exit or 3780 * by an interrupt-window VM-exit, halt the vcpu. 3781 */ 3782 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3783 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3784 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3785 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3786 vmx->nested.nested_run_pending = 0; 3787 return kvm_emulate_halt_noskip(vcpu); 3788 } 3789 break; 3790 case GUEST_ACTIVITY_WAIT_SIPI: 3791 vmx->nested.nested_run_pending = 0; 3792 kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); 3793 break; 3794 default: 3795 break; 3796 } 3797 3798 return 1; 3799 3800 vmentry_failed: 3801 vmx->nested.nested_run_pending = 0; 3802 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3803 return 0; 3804 if (status == NVMX_VMENTRY_VMEXIT) 3805 return 1; 3806 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3807 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3808 } 3809 3810 /* 3811 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3812 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3813 * This function returns the new value we should put in vmcs12.guest_cr0. 3814 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3815 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3816 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3817 * didn't trap the bit, because if L1 did, so would L0). 3818 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3819 * been modified by L2, and L1 knows it. So just leave the old value of 3820 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 3821 * isn't relevant, because if L0 traps this bit it can set it to anything. 3822 * 3. Bits that L1 didn't trap, but L0 did. 
L1 believes the guest could have 3823 * changed these bits, and therefore they need to be updated, but L0 3824 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3825 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3826 */ 3827 static inline unsigned long 3828 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3829 { 3830 return 3831 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3832 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3833 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3834 vcpu->arch.cr0_guest_owned_bits)); 3835 } 3836 3837 static inline unsigned long 3838 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3839 { 3840 return 3841 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3842 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3843 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3844 vcpu->arch.cr4_guest_owned_bits)); 3845 } 3846 3847 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3848 struct vmcs12 *vmcs12, 3849 u32 vm_exit_reason, u32 exit_intr_info) 3850 { 3851 u32 idt_vectoring; 3852 unsigned int nr; 3853 3854 /* 3855 * Per the SDM, VM-Exits due to double and triple faults are never 3856 * considered to occur during event delivery, even if the double/triple 3857 * fault is the result of an escalating vectoring issue. 3858 * 3859 * Note, the SDM qualifies the double fault behavior with "The original 3860 * event results in a double-fault exception". It's unclear why the 3861 * qualification exists since exits due to double fault can occur only 3862 * while vectoring a different exception (injected events are never 3863 * subject to interception), i.e. there's _always_ an original event. 3864 * 3865 * The SDM also uses NMI as a confusing example for the "original event 3866 * causes the VM exit directly" clause. NMI isn't special in any way, 3867 * the same rule applies to all events that cause an exit directly. 3868 * NMI is an odd choice for the example because NMIs can only occur on 3869 * instruction boundaries, i.e. they _can't_ occur during vectoring. 
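 *
 * The net behavior below: triple-fault exits and #DF exception exits
 * leave idt_vectoring_info_field cleared; otherwise the injected
 * exception, NMI, or interrupt is transcribed into the vmcs12
 * IDT-vectoring fields.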
3870 */ 3871 if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || 3872 ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && 3873 is_double_fault(exit_intr_info))) { 3874 vmcs12->idt_vectoring_info_field = 0; 3875 } else if (vcpu->arch.exception.injected) { 3876 nr = vcpu->arch.exception.vector; 3877 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3878 3879 if (kvm_exception_is_soft(nr)) { 3880 vmcs12->vm_exit_instruction_len = 3881 vcpu->arch.event_exit_inst_len; 3882 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3883 } else 3884 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3885 3886 if (vcpu->arch.exception.has_error_code) { 3887 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3888 vmcs12->idt_vectoring_error_code = 3889 vcpu->arch.exception.error_code; 3890 } 3891 3892 vmcs12->idt_vectoring_info_field = idt_vectoring; 3893 } else if (vcpu->arch.nmi_injected) { 3894 vmcs12->idt_vectoring_info_field = 3895 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3896 } else if (vcpu->arch.interrupt.injected) { 3897 nr = vcpu->arch.interrupt.nr; 3898 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3899 3900 if (vcpu->arch.interrupt.soft) { 3901 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3902 vmcs12->vm_entry_instruction_len = 3903 vcpu->arch.event_exit_inst_len; 3904 } else 3905 idt_vectoring |= INTR_TYPE_EXT_INTR; 3906 3907 vmcs12->idt_vectoring_info_field = idt_vectoring; 3908 } else { 3909 vmcs12->idt_vectoring_info_field = 0; 3910 } 3911 } 3912 3913 3914 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3915 { 3916 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3917 gfn_t gfn; 3918 3919 /* 3920 * Don't need to mark the APIC access page dirty; it is never 3921 * written to by the CPU during APIC virtualization. 3922 */ 3923 3924 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3925 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3926 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3927 } 3928 3929 if (nested_cpu_has_posted_intr(vmcs12)) { 3930 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3931 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3932 } 3933 } 3934 3935 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3936 { 3937 struct vcpu_vmx *vmx = to_vmx(vcpu); 3938 int max_irr; 3939 void *vapic_page; 3940 u16 status; 3941 3942 if (!vmx->nested.pi_pending) 3943 return 0; 3944 3945 if (!vmx->nested.pi_desc) 3946 goto mmio_needed; 3947 3948 vmx->nested.pi_pending = false; 3949 3950 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3951 return 0; 3952 3953 max_irr = pi_find_highest_vector(vmx->nested.pi_desc); 3954 if (max_irr > 0) { 3955 vapic_page = vmx->nested.virtual_apic_map.hva; 3956 if (!vapic_page) 3957 goto mmio_needed; 3958 3959 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3960 vapic_page, &max_irr); 3961 status = vmcs_read16(GUEST_INTR_STATUS); 3962 if ((u8)max_irr > ((u8)status & 0xff)) { 3963 status &= ~0xff; 3964 status |= (u8)max_irr; 3965 vmcs_write16(GUEST_INTR_STATUS, status); 3966 } 3967 } 3968 3969 nested_mark_vmcs12_pages_dirty(vcpu); 3970 return 0; 3971 3972 mmio_needed: 3973 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 3974 return -ENXIO; 3975 } 3976 3977 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) 3978 { 3979 struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; 3980 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 3981 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3982 unsigned long exit_qual; 3983 3984 if (ex->has_payload) { 3985 exit_qual = ex->payload; 3986 } else if (ex->vector == PF_VECTOR) { 3987 
exit_qual = vcpu->arch.cr2; 3988 } else if (ex->vector == DB_VECTOR) { 3989 exit_qual = vcpu->arch.dr6; 3990 exit_qual &= ~DR6_BT; 3991 exit_qual ^= DR6_ACTIVE_LOW; 3992 } else { 3993 exit_qual = 0; 3994 } 3995 3996 /* 3997 * Unlike AMD's Paged Real Mode, which reports an error code on #PF 3998 * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the 3999 * "has error code" flags on VM-Exit if the CPU is in Real Mode. 4000 */ 4001 if (ex->has_error_code && is_protmode(vcpu)) { 4002 /* 4003 * Intel CPUs do not generate error codes with bits 31:16 set, 4004 * and more importantly VMX disallows setting bits 31:16 in the 4005 * injected error code for VM-Entry. Drop the bits to mimic 4006 * hardware and avoid inducing failure on nested VM-Entry if L1 4007 * chooses to inject the exception back to L2. AMD CPUs _do_ 4008 * generate "full" 32-bit error codes, so KVM allows userspace 4009 * to inject exception error codes with bits 31:16 set. 4010 */ 4011 vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; 4012 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 4013 } 4014 4015 if (kvm_exception_is_soft(ex->vector)) 4016 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 4017 else 4018 intr_info |= INTR_TYPE_HARD_EXCEPTION; 4019 4020 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 4021 vmx_get_nmi_mask(vcpu)) 4022 intr_info |= INTR_INFO_UNBLOCK_NMI; 4023 4024 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 4025 } 4026 4027 /* 4028 * Returns true if a debug trap is (likely) pending delivery. Infer the class 4029 * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). 4030 * Using the payload is flawed because code breakpoints (fault-like) and data 4031 * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. 4032 * this will return false positives if a to-be-injected code breakpoint #DB is 4033 * pending (from KVM's perspective, but not "pending" across an instruction 4034 * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it 4035 * too is trap-like. 4036 * 4037 * KVM "works" despite these flaws as ICEBP isn't currently supported by the 4038 * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the 4039 * #DB has already happened), and MTF isn't marked pending on code breakpoints 4040 * from the emulator (because such #DBs are fault-like and thus don't trigger 4041 * actions that fire on instruction retire). 4042 */ 4043 static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) 4044 { 4045 if (!ex->pending || ex->vector != DB_VECTOR) 4046 return 0; 4047 4048 /* General Detect #DBs are always fault-like. */ 4049 return ex->payload & ~DR6_BD; 4050 } 4051 4052 /* 4053 * Returns true if there's a pending #DB exception that is lower priority than 4054 * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by 4055 * KVM, but could theoretically be injected by userspace. Note, this code is 4056 * imperfect, see above. 4057 */ 4058 static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) 4059 { 4060 return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; 4061 } 4062 4063 /* 4064 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 4065 * recognized #DB (data or single-step) that has yet to be delivered. 
Since KVM 4066 * represents these debug traps with a payload that is said to be compatible 4067 * with the 'pending debug exceptions' field, write the payload to the VMCS 4068 * field if a VM-exit is delivered before the debug trap. 4069 */ 4070 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 4071 { 4072 unsigned long pending_dbg; 4073 4074 pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); 4075 if (pending_dbg) 4076 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); 4077 } 4078 4079 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 4080 { 4081 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 4082 to_vmx(vcpu)->nested.preemption_timer_expired; 4083 } 4084 4085 static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection) 4086 { 4087 struct vcpu_vmx *vmx = to_vmx(vcpu); 4088 void *vapic = vmx->nested.virtual_apic_map.hva; 4089 int max_irr, vppr; 4090 4091 if (nested_vmx_preemption_timer_pending(vcpu) || 4092 vmx->nested.mtf_pending) 4093 return true; 4094 4095 /* 4096 * Virtual Interrupt Delivery doesn't require manual injection. Either 4097 * the interrupt is already in GUEST_RVI and will be recognized by CPU 4098 * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move 4099 * the interrupt from the PIR to RVI prior to entering the guest. 4100 */ 4101 if (for_injection) 4102 return false; 4103 4104 if (!nested_cpu_has_vid(get_vmcs12(vcpu)) || 4105 __vmx_interrupt_blocked(vcpu)) 4106 return false; 4107 4108 if (!vapic) 4109 return false; 4110 4111 vppr = *((u32 *)(vapic + APIC_PROCPRI)); 4112 4113 max_irr = vmx_get_rvi(); 4114 if ((max_irr & 0xf0) > (vppr & 0xf0)) 4115 return true; 4116 4117 if (vmx->nested.pi_pending && vmx->nested.pi_desc && 4118 pi_test_on(vmx->nested.pi_desc)) { 4119 max_irr = pi_find_highest_vector(vmx->nested.pi_desc); 4120 if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0)) 4121 return true; 4122 } 4123 4124 return false; 4125 } 4126 4127 /* 4128 * Per the Intel SDM's table "Priority Among Concurrent Events", with minor 4129 * edits to fill in missing examples, e.g. #DB due to split-lock accesses, 4130 * and less minor edits to splice in the priority of VMX Non-Root specific 4131 * events, e.g. MTF and NMI/INTR-window exiting. 
4132 * 4133 * 1 Hardware Reset and Machine Checks 4134 * - RESET 4135 * - Machine Check 4136 * 4137 * 2 Trap on Task Switch 4138 * - T flag in TSS is set (on task switch) 4139 * 4140 * 3 External Hardware Interventions 4141 * - FLUSH 4142 * - STOPCLK 4143 * - SMI 4144 * - INIT 4145 * 4146 * 3.5 Monitor Trap Flag (MTF) VM-exit[1] 4147 * 4148 * 4 Traps on Previous Instruction 4149 * - Breakpoints 4150 * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O 4151 * breakpoint, or #DB due to a split-lock access) 4152 * 4153 * 4.3 VMX-preemption timer expired VM-exit 4154 * 4155 * 4.6 NMI-window exiting VM-exit[2] 4156 * 4157 * 5 Nonmaskable Interrupts (NMI) 4158 * 4159 * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery 4160 * 4161 * 6 Maskable Hardware Interrupts 4162 * 4163 * 7 Code Breakpoint Fault 4164 * 4165 * 8 Faults from Fetching Next Instruction 4166 * - Code-Segment Limit Violation 4167 * - Code Page Fault 4168 * - Control protection exception (missing ENDBRANCH at target of indirect 4169 * call or jump) 4170 * 4171 * 9 Faults from Decoding Next Instruction 4172 * - Instruction length > 15 bytes 4173 * - Invalid Opcode 4174 * - Coprocessor Not Available 4175 * 4176 *10 Faults on Executing Instruction 4177 * - Overflow 4178 * - Bound error 4179 * - Invalid TSS 4180 * - Segment Not Present 4181 * - Stack fault 4182 * - General Protection 4183 * - Data Page Fault 4184 * - Alignment Check 4185 * - x86 FPU Floating-point exception 4186 * - SIMD floating-point exception 4187 * - Virtualization exception 4188 * - Control protection exception 4189 * 4190 * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs), 4191 * INIT signals, and higher priority events take priority over MTF VM exits. 4192 * MTF VM exits take priority over debug-trap exceptions and lower priority 4193 * events. 4194 * 4195 * [2] Debug-trap exceptions and higher priority events take priority over VM exits 4196 * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption 4197 * timer take priority over VM exits caused by the "NMI-window exiting" 4198 * VM-execution control and lower priority events. 4199 * 4200 * [3] Debug-trap exceptions and higher priority events take priority over VM exits 4201 * caused by "NMI-window exiting". VM exits caused by this control take 4202 * priority over non-maskable interrupts (NMIs) and lower priority events. 4203 * 4204 * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to 4205 * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus, 4206 * non-maskable interrupts (NMIs) and higher priority events take priority over 4207 * delivery of a virtual interrupt; delivery of a virtual interrupt takes 4208 * priority over external interrupts and lower priority events. 4209 */ 4210 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 4211 { 4212 struct kvm_lapic *apic = vcpu->arch.apic; 4213 struct vcpu_vmx *vmx = to_vmx(vcpu); 4214 /* 4215 * Only a pending nested run blocks a pending exception. If there is a 4216 * previously injected event, the pending exception occurred while said 4217 * event was being delivered and thus needs to be handled. 4218 */ 4219 bool block_nested_exceptions = vmx->nested.nested_run_pending; 4220 /* 4221 * Events that don't require injection, i.e. that are virtualized by 4222 * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need 4223 * to regain control in order to deliver the event, and hardware will 4224 * handle event ordering, e.g. 
with respect to injected exceptions. 4225 * 4226 * But, new events (not exceptions) are only recognized at instruction 4227 * boundaries. If an event needs reinjection, then KVM is handling a 4228 * VM-Exit that occurred _during_ instruction execution; new events, 4229 * irrespective of whether or not they're injected, are blocked until 4230 * the instruction completes. 4231 */ 4232 bool block_non_injected_events = kvm_event_needs_reinjection(vcpu); 4233 /* 4234 * Inject events are blocked by nested VM-Enter, as KVM is responsible 4235 * for managing priority between concurrent events, i.e. KVM needs to 4236 * wait until after VM-Enter completes to deliver injected events. 4237 */ 4238 bool block_nested_events = block_nested_exceptions || 4239 block_non_injected_events; 4240 4241 if (lapic_in_kernel(vcpu) && 4242 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 4243 if (block_nested_events) 4244 return -EBUSY; 4245 nested_vmx_update_pending_dbg(vcpu); 4246 clear_bit(KVM_APIC_INIT, &apic->pending_events); 4247 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 4248 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 4249 4250 /* MTF is discarded if the vCPU is in WFS. */ 4251 vmx->nested.mtf_pending = false; 4252 return 0; 4253 } 4254 4255 if (lapic_in_kernel(vcpu) && 4256 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 4257 if (block_nested_events) 4258 return -EBUSY; 4259 4260 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 4261 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 4262 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 4263 apic->sipi_vector & 0xFFUL); 4264 return 0; 4265 } 4266 /* Fallthrough, the SIPI is completely ignored. */ 4267 } 4268 4269 /* 4270 * Process exceptions that are higher priority than Monitor Trap Flag: 4271 * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but 4272 * could theoretically come in from userspace), and ICEBP (INT1). 4273 * 4274 * TODO: SMIs have higher priority than MTF and trap-like #DBs (except 4275 * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF 4276 * across SMI/RSM as it should; that needs to be addressed in order to 4277 * prioritize SMI over MTF and trap-like #DBs. 
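 *
 * The checks below therefore run in (approximate) architectural
 * priority order: INIT/SIPI, higher-priority exception VM-exits and
 * exceptions, MTF, remaining exception VM-exits and exceptions, the
 * preemption timer, SMI, NMI, and finally external interrupts.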
4278 */ 4279 if (vcpu->arch.exception_vmexit.pending && 4280 !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { 4281 if (block_nested_exceptions) 4282 return -EBUSY; 4283 4284 nested_vmx_inject_exception_vmexit(vcpu); 4285 return 0; 4286 } 4287 4288 if (vcpu->arch.exception.pending && 4289 !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) { 4290 if (block_nested_exceptions) 4291 return -EBUSY; 4292 goto no_vmexit; 4293 } 4294 4295 if (vmx->nested.mtf_pending) { 4296 if (block_nested_events) 4297 return -EBUSY; 4298 nested_vmx_update_pending_dbg(vcpu); 4299 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 4300 return 0; 4301 } 4302 4303 if (vcpu->arch.exception_vmexit.pending) { 4304 if (block_nested_exceptions) 4305 return -EBUSY; 4306 4307 nested_vmx_inject_exception_vmexit(vcpu); 4308 return 0; 4309 } 4310 4311 if (vcpu->arch.exception.pending) { 4312 if (block_nested_exceptions) 4313 return -EBUSY; 4314 goto no_vmexit; 4315 } 4316 4317 if (nested_vmx_preemption_timer_pending(vcpu)) { 4318 if (block_nested_events) 4319 return -EBUSY; 4320 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 4321 return 0; 4322 } 4323 4324 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 4325 if (block_nested_events) 4326 return -EBUSY; 4327 goto no_vmexit; 4328 } 4329 4330 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 4331 if (block_nested_events) 4332 return -EBUSY; 4333 if (!nested_exit_on_nmi(vcpu)) 4334 goto no_vmexit; 4335 4336 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 4337 NMI_VECTOR | INTR_TYPE_NMI_INTR | 4338 INTR_INFO_VALID_MASK, 0); 4339 /* 4340 * The NMI-triggered VM exit counts as injection: 4341 * clear this one and block further NMIs. 4342 */ 4343 vcpu->arch.nmi_pending = 0; 4344 vmx_set_nmi_mask(vcpu, true); 4345 return 0; 4346 } 4347 4348 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 4349 int irq; 4350 4351 if (!nested_exit_on_intr(vcpu)) { 4352 if (block_nested_events) 4353 return -EBUSY; 4354 4355 goto no_vmexit; 4356 } 4357 4358 if (!nested_exit_intr_ack_set(vcpu)) { 4359 if (block_nested_events) 4360 return -EBUSY; 4361 4362 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 4363 return 0; 4364 } 4365 4366 irq = kvm_cpu_get_extint(vcpu); 4367 if (irq != -1) { 4368 if (block_nested_events) 4369 return -EBUSY; 4370 4371 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 4372 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); 4373 return 0; 4374 } 4375 4376 irq = kvm_apic_has_interrupt(vcpu); 4377 if (WARN_ON_ONCE(irq < 0)) 4378 goto no_vmexit; 4379 4380 /* 4381 * If the IRQ is L2's PI notification vector, process posted 4382 * interrupts for L2 instead of injecting VM-Exit, as the 4383 * detection/morphing architecturally occurs when the IRQ is 4384 * delivered to the CPU. Note, only interrupts that are routed 4385 * through the local APIC trigger posted interrupt processing, 4386 * and enabling posted interrupts requires ACK-on-exit. 4387 */ 4388 if (irq == vmx->nested.posted_intr_nv) { 4389 /* 4390 * Nested posted interrupts are delivered via RVI, i.e. 4391 * aren't injected by KVM, and so can be queued even if 4392 * manual event injection is disallowed. 
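 *
 * They are still deferred when an event needs re-injection, as new
 * events aren't recognized until the current instruction completes
 * (see block_non_injected_events above).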
4393 */ 4394 if (block_non_injected_events) 4395 return -EBUSY; 4396 4397 vmx->nested.pi_pending = true; 4398 kvm_apic_clear_irr(vcpu, irq); 4399 goto no_vmexit; 4400 } 4401 4402 if (block_nested_events) 4403 return -EBUSY; 4404 4405 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 4406 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); 4407 4408 /* 4409 * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must 4410 * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI 4411 * if APICv is active. 4412 */ 4413 kvm_apic_ack_interrupt(vcpu, irq); 4414 return 0; 4415 } 4416 4417 no_vmexit: 4418 return vmx_complete_nested_posted_interrupt(vcpu); 4419 } 4420 4421 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 4422 { 4423 ktime_t remaining = 4424 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 4425 u64 value; 4426 4427 if (ktime_to_ns(remaining) <= 0) 4428 return 0; 4429 4430 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 4431 do_div(value, 1000000); 4432 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 4433 } 4434 4435 static bool is_vmcs12_ext_field(unsigned long field) 4436 { 4437 switch (field) { 4438 case GUEST_ES_SELECTOR: 4439 case GUEST_CS_SELECTOR: 4440 case GUEST_SS_SELECTOR: 4441 case GUEST_DS_SELECTOR: 4442 case GUEST_FS_SELECTOR: 4443 case GUEST_GS_SELECTOR: 4444 case GUEST_LDTR_SELECTOR: 4445 case GUEST_TR_SELECTOR: 4446 case GUEST_ES_LIMIT: 4447 case GUEST_CS_LIMIT: 4448 case GUEST_SS_LIMIT: 4449 case GUEST_DS_LIMIT: 4450 case GUEST_FS_LIMIT: 4451 case GUEST_GS_LIMIT: 4452 case GUEST_LDTR_LIMIT: 4453 case GUEST_TR_LIMIT: 4454 case GUEST_GDTR_LIMIT: 4455 case GUEST_IDTR_LIMIT: 4456 case GUEST_ES_AR_BYTES: 4457 case GUEST_DS_AR_BYTES: 4458 case GUEST_FS_AR_BYTES: 4459 case GUEST_GS_AR_BYTES: 4460 case GUEST_LDTR_AR_BYTES: 4461 case GUEST_TR_AR_BYTES: 4462 case GUEST_ES_BASE: 4463 case GUEST_CS_BASE: 4464 case GUEST_SS_BASE: 4465 case GUEST_DS_BASE: 4466 case GUEST_FS_BASE: 4467 case GUEST_GS_BASE: 4468 case GUEST_LDTR_BASE: 4469 case GUEST_TR_BASE: 4470 case GUEST_GDTR_BASE: 4471 case GUEST_IDTR_BASE: 4472 case GUEST_PENDING_DBG_EXCEPTIONS: 4473 case GUEST_BNDCFGS: 4474 return true; 4475 default: 4476 break; 4477 } 4478 4479 return false; 4480 } 4481 4482 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4483 struct vmcs12 *vmcs12) 4484 { 4485 struct vcpu_vmx *vmx = to_vmx(vcpu); 4486 4487 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 4488 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 4489 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 4490 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 4491 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 4492 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 4493 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 4494 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 4495 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 4496 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 4497 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 4498 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 4499 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 4500 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 4501 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 4502 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 4503 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 4504 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 4505 
vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 4506 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 4507 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 4508 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 4509 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 4510 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 4511 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 4512 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 4513 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 4514 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 4515 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 4516 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 4517 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 4518 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 4519 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 4520 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 4521 vmcs12->guest_pending_dbg_exceptions = 4522 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 4523 4524 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4525 } 4526 4527 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4528 struct vmcs12 *vmcs12) 4529 { 4530 struct vcpu_vmx *vmx = to_vmx(vcpu); 4531 int cpu; 4532 4533 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4534 return; 4535 4536 4537 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4538 4539 cpu = get_cpu(); 4540 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4541 vmx_vcpu_load_vmcs(vcpu, cpu); 4542 4543 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4544 4545 vmx->loaded_vmcs = &vmx->vmcs01; 4546 vmx_vcpu_load_vmcs(vcpu, cpu); 4547 put_cpu(); 4548 } 4549 4550 /* 4551 * Update the guest state fields of vmcs12 to reflect changes that 4552 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4553 * VM-entry controls is also updated, since this is really a guest 4554 * state bit.) 4555 */ 4556 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4557 { 4558 struct vcpu_vmx *vmx = to_vmx(vcpu); 4559 4560 if (nested_vmx_is_evmptr12_valid(vmx)) 4561 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4562 4563 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4564 !nested_vmx_is_evmptr12_valid(vmx); 4565 4566 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4567 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4568 4569 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4570 vmcs12->guest_rip = kvm_rip_read(vcpu); 4571 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4572 4573 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4574 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4575 4576 vmcs12->guest_interruptibility_info = 4577 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4578 4579 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4580 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4581 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4582 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4583 else 4584 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4585 4586 if (nested_cpu_has_preemption_timer(vmcs12) && 4587 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4588 !vmx->nested.nested_run_pending) 4589 vmcs12->vmx_preemption_timer_value = 4590 vmx_get_preemption_timer_value(vcpu); 4591 4592 /* 4593 * In some cases (usually, nested EPT), L2 is allowed to change its 4594 * own CR3 without exiting. If it has changed it, we must keep it. 
4595 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4596 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4597 * 4598 * Additionally, restore L2's PDPTR to vmcs12. 4599 */ 4600 if (enable_ept) { 4601 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4602 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4603 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4604 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4605 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4606 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4607 } 4608 } 4609 4610 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4611 4612 if (nested_cpu_has_vid(vmcs12)) 4613 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4614 4615 vmcs12->vm_entry_controls = 4616 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4617 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4618 4619 /* 4620 * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02. 4621 * Writes to DEBUGCTL that aren't intercepted by L1 are immediately 4622 * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into 4623 * vmcs02 doesn't strictly track vmcs12. 4624 */ 4625 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4626 vmcs12->guest_dr7 = vcpu->arch.dr7; 4627 4628 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4629 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4630 } 4631 4632 /* 4633 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4634 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4635 * and this function updates it to reflect the changes to the guest state while 4636 * L2 was running (and perhaps made some exits which were handled directly by L0 4637 * without going back to L1), and to reflect the exit reason. 4638 * Note that we do not have to copy here all VMCS fields, just those that 4639 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4640 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4641 * which already writes to vmcs12 directly. 4642 */ 4643 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4644 u32 vm_exit_reason, u32 exit_intr_info, 4645 unsigned long exit_qualification, u32 exit_insn_len) 4646 { 4647 /* update exit information fields: */ 4648 vmcs12->vm_exit_reason = vm_exit_reason; 4649 if (vmx_get_exit_reason(vcpu).enclave_mode) 4650 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4651 vmcs12->exit_qualification = exit_qualification; 4652 4653 /* 4654 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched 4655 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other 4656 * exit info fields are unmodified. 4657 */ 4658 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4659 vmcs12->launch_state = 1; 4660 4661 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4662 * instead of reading the real value. */ 4663 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4664 4665 /* 4666 * Transfer the event that L0 or L1 may wanted to inject into 4667 * L2 to IDT_VECTORING_INFO_FIELD. 
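 *
 * The event that caused this exit, by contrast, is reported via
 * vm_exit_intr_info, which is written just below.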
4668 */ 4669 vmcs12_save_pending_event(vcpu, vmcs12, 4670 vm_exit_reason, exit_intr_info); 4671 4672 vmcs12->vm_exit_intr_info = exit_intr_info; 4673 vmcs12->vm_exit_instruction_len = exit_insn_len; 4674 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4675 4676 /* 4677 * According to spec, there's no need to store the guest's 4678 * MSRs if the exit is due to a VM-entry failure that occurs 4679 * during or after loading the guest state. Since this exit 4680 * does not fall in that category, we need to save the MSRs. 4681 */ 4682 if (nested_vmx_store_msr(vcpu, 4683 vmcs12->vm_exit_msr_store_addr, 4684 vmcs12->vm_exit_msr_store_count)) 4685 nested_vmx_abort(vcpu, 4686 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4687 } 4688 } 4689 4690 /* 4691 * A part of what we need to when the nested L2 guest exits and we want to 4692 * run its L1 parent, is to reset L1's guest state to the host state specified 4693 * in vmcs12. 4694 * This function is to be called not only on normal nested exit, but also on 4695 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4696 * Failures During or After Loading Guest State"). 4697 * This function should be called when the active VMCS is L1's (vmcs01). 4698 */ 4699 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4700 struct vmcs12 *vmcs12) 4701 { 4702 enum vm_entry_failure_code ignored; 4703 struct kvm_segment seg; 4704 4705 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4706 vcpu->arch.efer = vmcs12->host_ia32_efer; 4707 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4708 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4709 else 4710 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4711 vmx_set_efer(vcpu, vcpu->arch.efer); 4712 4713 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4714 kvm_rip_write(vcpu, vmcs12->host_rip); 4715 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4716 vmx_set_interrupt_shadow(vcpu, 0); 4717 4718 /* 4719 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4720 * actually changed, because vmx_set_cr0 refers to efer set above. 4721 * 4722 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4723 * (KVM doesn't change it); 4724 */ 4725 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4726 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4727 4728 /* Same as above - no reason to call set_cr4_guest_host_mask(). */ 4729 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4730 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4731 4732 nested_ept_uninit_mmu_context(vcpu); 4733 4734 /* 4735 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4736 * couldn't have changed. 4737 */ 4738 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4739 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4740 4741 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4742 4743 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4744 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4745 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4746 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4747 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4748 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4749 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4750 4751 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. 
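 *
 * i.e. GUEST_BNDCFGS is deliberately left untouched in that case, so
 * that L1 observes whatever value L2 last loaded.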
*/ 4752 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4753 vmcs_write64(GUEST_BNDCFGS, 0); 4754 4755 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4756 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4757 vcpu->arch.pat = vmcs12->host_ia32_pat; 4758 } 4759 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 4760 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) 4761 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4762 vmcs12->host_ia32_perf_global_ctrl)); 4763 4764 /* Set L1 segment info according to Intel SDM 4765 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4766 seg = (struct kvm_segment) { 4767 .base = 0, 4768 .limit = 0xFFFFFFFF, 4769 .selector = vmcs12->host_cs_selector, 4770 .type = 11, 4771 .present = 1, 4772 .s = 1, 4773 .g = 1 4774 }; 4775 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4776 seg.l = 1; 4777 else 4778 seg.db = 1; 4779 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4780 seg = (struct kvm_segment) { 4781 .base = 0, 4782 .limit = 0xFFFFFFFF, 4783 .type = 3, 4784 .present = 1, 4785 .s = 1, 4786 .db = 1, 4787 .g = 1 4788 }; 4789 seg.selector = vmcs12->host_ds_selector; 4790 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4791 seg.selector = vmcs12->host_es_selector; 4792 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4793 seg.selector = vmcs12->host_ss_selector; 4794 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4795 seg.selector = vmcs12->host_fs_selector; 4796 seg.base = vmcs12->host_fs_base; 4797 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4798 seg.selector = vmcs12->host_gs_selector; 4799 seg.base = vmcs12->host_gs_base; 4800 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4801 seg = (struct kvm_segment) { 4802 .base = vmcs12->host_tr_base, 4803 .limit = 0x67, 4804 .selector = vmcs12->host_tr_selector, 4805 .type = 11, 4806 .present = 1 4807 }; 4808 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4809 4810 memset(&seg, 0, sizeof(seg)); 4811 seg.unusable = 1; 4812 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4813 4814 kvm_set_dr(vcpu, 7, 0x400); 4815 vmx_guest_debugctl_write(vcpu, 0); 4816 4817 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4818 vmcs12->vm_exit_msr_load_count)) 4819 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4820 4821 to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4822 } 4823 4824 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4825 { 4826 struct vmx_uret_msr *efer_msr; 4827 unsigned int i; 4828 4829 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4830 return vmcs_read64(GUEST_IA32_EFER); 4831 4832 if (cpu_has_load_ia32_efer()) 4833 return kvm_host.efer; 4834 4835 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4836 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4837 return vmx->msr_autoload.guest.val[i].value; 4838 } 4839 4840 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4841 if (efer_msr) 4842 return efer_msr->data; 4843 4844 return kvm_host.efer; 4845 } 4846 4847 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4848 { 4849 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4850 struct vcpu_vmx *vmx = to_vmx(vcpu); 4851 struct vmx_msr_entry g, h; 4852 gpa_t gpa; 4853 u32 i, j; 4854 4855 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4856 4857 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4858 /* 4859 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4860 * as vmcs01.GUEST_DR7 contains a userspace defined value 4861 * and vcpu->arch.dr7 is not squirreled away 
before the 4862 * nested VMENTER (not worth adding a variable in nested_vmx). 4863 */ 4864 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4865 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4866 else 4867 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4868 } 4869 4870 /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */ 4871 vmx_reload_guest_debugctl(vcpu); 4872 4873 /* 4874 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4875 * handle a variety of side effects to KVM's software model. 4876 */ 4877 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4878 4879 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4880 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4881 4882 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4883 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4884 4885 nested_ept_uninit_mmu_context(vcpu); 4886 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4887 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4888 4889 /* 4890 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4891 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4892 * VMFail, like everything else we just need to ensure our 4893 * software model is up-to-date. 4894 */ 4895 if (enable_ept && is_pae_paging(vcpu)) 4896 ept_save_pdptrs(vcpu); 4897 4898 kvm_mmu_reset_context(vcpu); 4899 4900 /* 4901 * This nasty bit of open coding is a compromise between blindly 4902 * loading L1's MSRs using the exit load lists (incorrect emulation 4903 * of VMFail), leaving the nested VM's MSRs in the software model 4904 * (incorrect behavior) and snapshotting the modified MSRs (too 4905 * expensive since the lists are unbound by hardware). For each 4906 * MSR that was (prematurely) loaded from the nested VMEntry load 4907 * list, reload it from the exit load list if it exists and differs 4908 * from the guest value. The intent is to stuff host state as 4909 * silently as possible, not to fully process the exit load list. 4910 */ 4911 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4912 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4913 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4914 pr_debug_ratelimited( 4915 "%s read MSR index failed (%u, 0x%08llx)\n", 4916 __func__, i, gpa); 4917 goto vmabort; 4918 } 4919 4920 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4921 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4922 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4923 pr_debug_ratelimited( 4924 "%s read MSR failed (%u, 0x%08llx)\n", 4925 __func__, j, gpa); 4926 goto vmabort; 4927 } 4928 if (h.index != g.index) 4929 continue; 4930 if (h.value == g.value) 4931 break; 4932 4933 if (nested_vmx_load_msr_check(vcpu, &h)) { 4934 pr_debug_ratelimited( 4935 "%s check failed (%u, 0x%x, 0x%x)\n", 4936 __func__, j, h.index, h.reserved); 4937 goto vmabort; 4938 } 4939 4940 if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) { 4941 pr_debug_ratelimited( 4942 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4943 __func__, j, h.index, h.value); 4944 goto vmabort; 4945 } 4946 } 4947 } 4948 4949 return; 4950 4951 vmabort: 4952 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4953 } 4954 4955 /* 4956 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4957 * and modify vmcs12 to make it see what it would expect to see there if 4958 * L2 was its real guest. 
Must only be called when in L2 (is_guest_mode()) 4959 */ 4960 void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 4961 u32 exit_intr_info, unsigned long exit_qualification, 4962 u32 exit_insn_len) 4963 { 4964 struct vcpu_vmx *vmx = to_vmx(vcpu); 4965 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4966 4967 /* Pending MTF traps are discarded on VM-Exit. */ 4968 vmx->nested.mtf_pending = false; 4969 4970 /* trying to cancel vmlaunch/vmresume is a bug */ 4971 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4972 4973 #ifdef CONFIG_KVM_HYPERV 4974 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 4975 /* 4976 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 4977 * Enlightened VMCS after migration and we still need to 4978 * do that when something is forcing L2->L1 exit prior to 4979 * the first L2 run. 4980 */ 4981 (void)nested_get_evmcs_page(vcpu); 4982 } 4983 #endif 4984 4985 /* Service pending TLB flush requests for L2 before switching to L1. */ 4986 kvm_service_local_tlb_flush_requests(vcpu); 4987 4988 /* 4989 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 4990 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 4991 * up-to-date before switching to L1. 4992 */ 4993 if (enable_ept && is_pae_paging(vcpu)) 4994 vmx_ept_load_pdptrs(vcpu); 4995 4996 leave_guest_mode(vcpu); 4997 4998 if (nested_cpu_has_preemption_timer(vmcs12)) 4999 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 5000 5001 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 5002 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 5003 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 5004 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 5005 } 5006 5007 if (likely(!vmx->fail)) { 5008 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 5009 5010 if (vm_exit_reason != -1) 5011 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 5012 exit_intr_info, exit_qualification, 5013 exit_insn_len); 5014 5015 /* 5016 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 5017 * also be used to capture vmcs12 cache as part of 5018 * capturing nVMX state for snapshot (migration). 5019 * 5020 * Otherwise, this flush will dirty guest memory at a 5021 * point it is already assumed by user-space to be 5022 * immutable. 5023 */ 5024 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 5025 } else { 5026 /* 5027 * The only expected VM-instruction error is "VM entry with 5028 * invalid control field(s)." Anything else indicates a 5029 * problem with L0. And we should never get here with a 5030 * VMFail of any type if early consistency checks are enabled. 5031 */ 5032 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 5033 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 5034 WARN_ON_ONCE(nested_early_check); 5035 } 5036 5037 /* 5038 * Drop events/exceptions that were queued for re-injection to L2 5039 * (picked up via vmx_complete_interrupts()), as well as exceptions 5040 * that were pending for L2. Note, this must NOT be hoisted above 5041 * prepare_vmcs12(), events/exceptions queued for re-injection need to 5042 * be captured in vmcs12 (see vmcs12_save_pending_event()). 
5043 */ 5044 vcpu->arch.nmi_injected = false; 5045 kvm_clear_exception_queue(vcpu); 5046 kvm_clear_interrupt_queue(vcpu); 5047 5048 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 5049 5050 kvm_nested_vmexit_handle_ibrs(vcpu); 5051 5052 /* Update any VMCS fields that might have changed while L2 ran */ 5053 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 5054 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 5055 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 5056 if (kvm_caps.has_tsc_control) 5057 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 5058 5059 if (vmx->nested.l1_tpr_threshold != -1) 5060 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 5061 5062 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 5063 vmx->nested.change_vmcs01_virtual_apic_mode = false; 5064 vmx_set_virtual_apic_mode(vcpu); 5065 } 5066 5067 if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 5068 vmx->nested.update_vmcs01_cpu_dirty_logging = false; 5069 vmx_update_cpu_dirty_logging(vcpu); 5070 } 5071 5072 nested_put_vmcs12_pages(vcpu); 5073 5074 if (vmx->nested.reload_vmcs01_apic_access_page) { 5075 vmx->nested.reload_vmcs01_apic_access_page = false; 5076 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 5077 } 5078 5079 if (vmx->nested.update_vmcs01_apicv_status) { 5080 vmx->nested.update_vmcs01_apicv_status = false; 5081 kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); 5082 } 5083 5084 if (vmx->nested.update_vmcs01_hwapic_isr) { 5085 vmx->nested.update_vmcs01_hwapic_isr = false; 5086 kvm_apic_update_hwapic_isr(vcpu); 5087 } 5088 5089 if ((vm_exit_reason != -1) && 5090 (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) 5091 vmx->nested.need_vmcs12_to_shadow_sync = true; 5092 5093 /* in case we halted in L2 */ 5094 kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); 5095 5096 if (likely(!vmx->fail)) { 5097 if (vm_exit_reason != -1) 5098 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 5099 vmcs12->exit_qualification, 5100 vmcs12->idt_vectoring_info_field, 5101 vmcs12->vm_exit_intr_info, 5102 vmcs12->vm_exit_intr_error_code, 5103 KVM_ISA_VMX); 5104 5105 load_vmcs12_host_state(vcpu, vmcs12); 5106 5107 /* 5108 * Process events if an injectable IRQ or NMI is pending, even 5109 * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). 5110 * If an event became pending while L2 was active, KVM needs to 5111 * either inject the event or request an IRQ/NMI window. SMIs 5112 * don't need to be processed as SMM is mutually exclusive with 5113 * non-root mode. INIT/SIPI don't need to be checked as INIT 5114 * is blocked post-VMXON, and SIPIs are ignored. 5115 */ 5116 if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending) 5117 kvm_make_request(KVM_REQ_EVENT, vcpu); 5118 return; 5119 } 5120 5121 /* 5122 * After an early L2 VM-entry failure, we're now back 5123 * in L1 which thinks it just finished a VMLAUNCH or 5124 * VMRESUME instruction, so we need to set the failure 5125 * flag and the VM-instruction error field of the VMCS 5126 * accordingly, and skip the emulated instruction. 5127 */ 5128 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 5129 5130 /* 5131 * Restore L1's host state to KVM's software model. We're here 5132 * because a consistency check was caught by hardware, which 5133 * means some amount of guest state has been propagated to KVM's 5134 * model and needs to be unwound to the host's state. 
5135 */ 5136 nested_vmx_restore_host_state(vcpu); 5137 5138 vmx->fail = 0; 5139 } 5140 5141 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 5142 { 5143 kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); 5144 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 5145 } 5146 5147 /* 5148 * Decode the memory-address operand of a vmx instruction, as recorded on an 5149 * exit caused by such an instruction (run by a guest hypervisor). 5150 * On success, returns 0. When the operand is invalid, returns 1 and throws 5151 * #UD, #GP, or #SS. 5152 */ 5153 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 5154 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 5155 { 5156 gva_t off; 5157 bool exn; 5158 struct kvm_segment s; 5159 5160 /* 5161 * According to Vol. 3B, "Information for VM Exits Due to Instruction 5162 * Execution", on an exit, vmx_instruction_info holds most of the 5163 * addressing components of the operand. Only the displacement part 5164 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 5165 * For how an actual address is calculated from all these components, 5166 * refer to Vol. 1, "Operand Addressing". 5167 */ 5168 int scaling = vmx_instruction_info & 3; 5169 int addr_size = (vmx_instruction_info >> 7) & 7; 5170 bool is_reg = vmx_instruction_info & (1u << 10); 5171 int seg_reg = (vmx_instruction_info >> 15) & 7; 5172 int index_reg = (vmx_instruction_info >> 18) & 0xf; 5173 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 5174 int base_reg = (vmx_instruction_info >> 23) & 0xf; 5175 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 5176 5177 if (is_reg) { 5178 kvm_queue_exception(vcpu, UD_VECTOR); 5179 return 1; 5180 } 5181 5182 /* Addr = segment_base + offset */ 5183 /* offset = base + [index * scale] + displacement */ 5184 off = exit_qualification; /* holds the displacement */ 5185 if (addr_size == 1) 5186 off = (gva_t)sign_extend64(off, 31); 5187 else if (addr_size == 0) 5188 off = (gva_t)sign_extend64(off, 15); 5189 if (base_is_valid) 5190 off += kvm_register_read(vcpu, base_reg); 5191 if (index_is_valid) 5192 off += kvm_register_read(vcpu, index_reg) << scaling; 5193 vmx_get_segment(vcpu, &s, seg_reg); 5194 5195 /* 5196 * The effective address, i.e. @off, of a memory operand is truncated 5197 * based on the address size of the instruction. Note that this is 5198 * the *effective address*, i.e. the address prior to accounting for 5199 * the segment's base. 5200 */ 5201 if (addr_size == 1) /* 32 bit */ 5202 off &= 0xffffffff; 5203 else if (addr_size == 0) /* 16 bit */ 5204 off &= 0xffff; 5205 5206 /* Checks for #GP/#SS exceptions. */ 5207 exn = false; 5208 if (is_long_mode(vcpu)) { 5209 /* 5210 * The virtual/linear address is never truncated in 64-bit 5211 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 5212 * address when using FS/GS with a non-zero base. 5213 */ 5214 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 5215 *ret = s.base + off; 5216 else 5217 *ret = off; 5218 5219 *ret = vmx_get_untagged_addr(vcpu, *ret, 0); 5220 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 5221 * non-canonical form. This is the only check on the memory 5222 * destination for long mode! 5223 */ 5224 exn = is_noncanonical_address(*ret, vcpu, 0); 5225 } else { 5226 /* 5227 * When not in long mode, the virtual/linear address is 5228 * unconditionally truncated to 32 bits regardless of the 5229 * address size. 
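 * As a minimal worked example with hypothetical values (not taken from the SDM): for a 16-bit address size with s.base == 0x12340 and a raw offset of 0x1fffe, the masking above reduces off to 0xfffe and the statement below computes *ret = (0x12340 + 0xfffe) & 0xffffffff = 0x2233e.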
5230 */ 5231 *ret = (s.base + off) & 0xffffffff; 5232 5233 /* Protected mode: apply checks for segment validity in the 5234 * following order: 5235 * - segment type check (#GP(0) may be thrown) 5236 * - usability check (#GP(0)/#SS(0)) 5237 * - limit check (#GP(0)/#SS(0)) 5238 */ 5239 if (wr) 5240 /* #GP(0) if the destination operand is located in a 5241 * read-only data segment or any code segment. 5242 */ 5243 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 5244 else 5245 /* #GP(0) if the source operand is located in an 5246 * execute-only code segment 5247 */ 5248 exn = ((s.type & 0xa) == 8); 5249 if (exn) { 5250 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 5251 return 1; 5252 } 5253 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 5254 */ 5255 exn = (s.unusable != 0); 5256 5257 /* 5258 * Protected mode: #GP(0)/#SS(0) if the memory operand is 5259 * outside the segment limit. All CPUs that support VMX ignore 5260 * limit checks for flat segments, i.e. segments with base==0, 5261 * limit==0xffffffff and of type expand-up data or code. 5262 */ 5263 if (!(s.base == 0 && s.limit == 0xffffffff && 5264 ((s.type & 8) || !(s.type & 4)))) 5265 exn = exn || ((u64)off + len - 1 > s.limit); 5266 } 5267 if (exn) { 5268 kvm_queue_exception_e(vcpu, 5269 seg_reg == VCPU_SREG_SS ? 5270 SS_VECTOR : GP_VECTOR, 5271 0); 5272 return 1; 5273 } 5274 5275 return 0; 5276 } 5277 5278 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 5279 int *ret) 5280 { 5281 gva_t gva; 5282 struct x86_exception e; 5283 int r; 5284 5285 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5286 vmcs_read32(VMX_INSTRUCTION_INFO), false, 5287 sizeof(*vmpointer), &gva)) { 5288 *ret = 1; 5289 return -EINVAL; 5290 } 5291 5292 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 5293 if (r != X86EMUL_CONTINUE) { 5294 *ret = kvm_handle_memory_failure(vcpu, r, &e); 5295 return -EINVAL; 5296 } 5297 5298 return 0; 5299 } 5300 5301 /* 5302 * Allocate a shadow VMCS and associate it with the currently loaded 5303 * VMCS, unless such a shadow VMCS already exists. The newly allocated 5304 * VMCS is also VMCLEARed, so that it is ready for use. 5305 */ 5306 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 5307 { 5308 struct vcpu_vmx *vmx = to_vmx(vcpu); 5309 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 5310 5311 /* 5312 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it 5313 * when L1 executes VMXOFF or the vCPU is forced out of nested 5314 * operation. VMXON faults if the CPU is already post-VMXON, so it 5315 * should be impossible to already have an allocated shadow VMCS. KVM 5316 * doesn't support virtualization of VMCS shadowing, so vmcs01 should 5317 * always be the loaded VMCS. 
5318 */ 5319 if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) 5320 return loaded_vmcs->shadow_vmcs; 5321 5322 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 5323 if (loaded_vmcs->shadow_vmcs) 5324 vmcs_clear(loaded_vmcs->shadow_vmcs); 5325 5326 return loaded_vmcs->shadow_vmcs; 5327 } 5328 5329 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 5330 { 5331 struct vcpu_vmx *vmx = to_vmx(vcpu); 5332 int r; 5333 5334 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 5335 if (r < 0) 5336 goto out_vmcs02; 5337 5338 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5339 if (!vmx->nested.cached_vmcs12) 5340 goto out_cached_vmcs12; 5341 5342 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; 5343 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5344 if (!vmx->nested.cached_shadow_vmcs12) 5345 goto out_cached_shadow_vmcs12; 5346 5347 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 5348 goto out_shadow_vmcs; 5349 5350 hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC, 5351 HRTIMER_MODE_ABS_PINNED); 5352 5353 vmx->nested.vpid02 = allocate_vpid(); 5354 5355 vmx->nested.vmcs02_initialized = false; 5356 vmx->nested.vmxon = true; 5357 5358 if (vmx_pt_mode_is_host_guest()) { 5359 vmx->pt_desc.guest.ctl = 0; 5360 pt_update_intercept_for_msr(vcpu); 5361 } 5362 5363 return 0; 5364 5365 out_shadow_vmcs: 5366 kfree(vmx->nested.cached_shadow_vmcs12); 5367 5368 out_cached_shadow_vmcs12: 5369 kfree(vmx->nested.cached_vmcs12); 5370 5371 out_cached_vmcs12: 5372 free_loaded_vmcs(&vmx->nested.vmcs02); 5373 5374 out_vmcs02: 5375 return -ENOMEM; 5376 } 5377 5378 /* Emulate the VMXON instruction. */ 5379 static int handle_vmxon(struct kvm_vcpu *vcpu) 5380 { 5381 int ret; 5382 gpa_t vmptr; 5383 uint32_t revision; 5384 struct vcpu_vmx *vmx = to_vmx(vcpu); 5385 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 5386 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 5387 5388 /* 5389 * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter 5390 * the guest and so cannot rely on hardware to perform the check, 5391 * which has higher priority than VM-Exit (see Intel SDM's pseudocode 5392 * for VMXON). 5393 * 5394 * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 5395 * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't 5396 * force any of the relevant guest state. For a restricted guest, KVM 5397 * does force CR0.PE=1, but only to also force VM86 in order to emulate 5398 * Real Mode, and so there's no need to check CR0.PE manually. 5399 */ 5400 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { 5401 kvm_queue_exception(vcpu, UD_VECTOR); 5402 return 1; 5403 } 5404 5405 /* 5406 * The CPL is checked for "not in VMX operation" and for "in VMX root", 5407 * and has higher priority than the VM-Fail due to being post-VMXON, 5408 * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, 5409 * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits 5410 * from L2 to L1, i.e. there's no need to check for the vCPU being in 5411 * VMX non-root. 5412 * 5413 * Forwarding the VM-Exit unconditionally, i.e. without performing the 5414 * #UD checks (see above), is functionally ok because KVM doesn't allow 5415 * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's 5416 * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are 5417 * missed by hardware due to shadowing CR0 and/or CR4. 
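 * In short, the order emulated below is: #GP(0) if CPL != 0, then VM-Fail if the vCPU is already post-VMXON, then #GP(0) for invalid CR0/CR4 or for an IA32_FEATURE_CONTROL value that doesn't enable VMX, and finally VMfailInvalid for a bad VMXON pointer or a mismatched revision ID.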
5418 */ 5419 if (vmx_get_cpl(vcpu)) { 5420 kvm_inject_gp(vcpu, 0); 5421 return 1; 5422 } 5423 5424 if (vmx->nested.vmxon) 5425 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 5426 5427 /* 5428 * Invalid CR0/CR4 generates #GP. These checks are performed if and 5429 * only if the vCPU isn't already in VMX operation, i.e. effectively 5430 * have lower priority than the VM-Fail above. 5431 */ 5432 if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || 5433 !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { 5434 kvm_inject_gp(vcpu, 0); 5435 return 1; 5436 } 5437 5438 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 5439 != VMXON_NEEDED_FEATURES) { 5440 kvm_inject_gp(vcpu, 0); 5441 return 1; 5442 } 5443 5444 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 5445 return ret; 5446 5447 /* 5448 * SDM 3: 24.11.5 5449 * The first 4 bytes of VMXON region contain the supported 5450 * VMCS revision identifier 5451 * 5452 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 5453 * which replaces physical address width with 32 5454 */ 5455 if (!page_address_valid(vcpu, vmptr)) 5456 return nested_vmx_failInvalid(vcpu); 5457 5458 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 5459 revision != VMCS12_REVISION) 5460 return nested_vmx_failInvalid(vcpu); 5461 5462 vmx->nested.vmxon_ptr = vmptr; 5463 ret = enter_vmx_operation(vcpu); 5464 if (ret) 5465 return ret; 5466 5467 return nested_vmx_succeed(vcpu); 5468 } 5469 5470 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 5471 { 5472 struct vcpu_vmx *vmx = to_vmx(vcpu); 5473 5474 if (vmx->nested.current_vmptr == INVALID_GPA) 5475 return; 5476 5477 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 5478 5479 if (enable_shadow_vmcs) { 5480 /* copy to memory all shadowed fields in case 5481 they were modified */ 5482 copy_shadow_to_vmcs12(vmx); 5483 vmx_disable_shadow_vmcs(vmx); 5484 } 5485 vmx->nested.posted_intr_nv = -1; 5486 5487 /* Flush VMCS12 to guest memory */ 5488 kvm_vcpu_write_guest_page(vcpu, 5489 vmx->nested.current_vmptr >> PAGE_SHIFT, 5490 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 5491 5492 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 5493 5494 vmx->nested.current_vmptr = INVALID_GPA; 5495 } 5496 5497 /* Emulate the VMXOFF instruction */ 5498 static int handle_vmxoff(struct kvm_vcpu *vcpu) 5499 { 5500 if (!nested_vmx_check_permission(vcpu)) 5501 return 1; 5502 5503 free_nested(vcpu); 5504 5505 if (kvm_apic_has_pending_init_or_sipi(vcpu)) 5506 kvm_make_request(KVM_REQ_EVENT, vcpu); 5507 5508 return nested_vmx_succeed(vcpu); 5509 } 5510 5511 /* Emulate the VMCLEAR instruction */ 5512 static int handle_vmclear(struct kvm_vcpu *vcpu) 5513 { 5514 struct vcpu_vmx *vmx = to_vmx(vcpu); 5515 u32 zero = 0; 5516 gpa_t vmptr; 5517 int r; 5518 5519 if (!nested_vmx_check_permission(vcpu)) 5520 return 1; 5521 5522 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5523 return r; 5524 5525 if (!page_address_valid(vcpu, vmptr)) 5526 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 5527 5528 if (vmptr == vmx->nested.vmxon_ptr) 5529 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 5530 5531 if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { 5532 if (vmptr == vmx->nested.current_vmptr) 5533 nested_release_vmcs12(vcpu); 5534 5535 /* 5536 * Silently ignore memory errors on VMCLEAR, Intel's pseudocode 5537 * for VMCLEAR includes a "ensure that data for VMCS referenced 5538 * by the operand is in memory" clause that guards writes to 5539 * memory, 
i.e. doing nothing for I/O is architecturally valid. 5540 * 5541 * FIXME: Suppress failures if and only if no memslot is found, 5542 * i.e. exit to userspace if __copy_to_user() fails. 5543 */ 5544 (void)kvm_vcpu_write_guest(vcpu, 5545 vmptr + offsetof(struct vmcs12, 5546 launch_state), 5547 &zero, sizeof(zero)); 5548 } 5549 5550 return nested_vmx_succeed(vcpu); 5551 } 5552 5553 /* Emulate the VMLAUNCH instruction */ 5554 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 5555 { 5556 return nested_vmx_run(vcpu, true); 5557 } 5558 5559 /* Emulate the VMRESUME instruction */ 5560 static int handle_vmresume(struct kvm_vcpu *vcpu) 5561 { 5562 5563 return nested_vmx_run(vcpu, false); 5564 } 5565 5566 static int handle_vmread(struct kvm_vcpu *vcpu) 5567 { 5568 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5569 : get_vmcs12(vcpu); 5570 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5571 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5572 struct vcpu_vmx *vmx = to_vmx(vcpu); 5573 struct x86_exception e; 5574 unsigned long field; 5575 u64 value; 5576 gva_t gva = 0; 5577 short offset; 5578 int len, r; 5579 5580 if (!nested_vmx_check_permission(vcpu)) 5581 return 1; 5582 5583 /* Decode instruction info and find the field to read */ 5584 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5585 5586 if (!nested_vmx_is_evmptr12_valid(vmx)) { 5587 /* 5588 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5589 * any VMREAD sets the ALU flags for VMfailInvalid. 5590 */ 5591 if (vmx->nested.current_vmptr == INVALID_GPA || 5592 (is_guest_mode(vcpu) && 5593 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5594 return nested_vmx_failInvalid(vcpu); 5595 5596 offset = get_vmcs12_field_offset(field); 5597 if (offset < 0) 5598 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5599 5600 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5601 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5602 5603 /* Read the field, zero-extended to a u64 value */ 5604 value = vmcs12_read_any(vmcs12, field, offset); 5605 } else { 5606 /* 5607 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an 5608 * enlightened VMCS is active VMREAD/VMWRITE instructions are 5609 * unsupported. Unfortunately, certain versions of Windows 11 5610 * don't comply with this requirement which is not enforced in 5611 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a 5612 * workaround, as misbehaving guests will panic on VM-Fail. 5613 * Note, enlightened VMCS is incompatible with shadow VMCS so 5614 * all VMREADs from L2 should go to L1. 5615 */ 5616 if (WARN_ON_ONCE(is_guest_mode(vcpu))) 5617 return nested_vmx_failInvalid(vcpu); 5618 5619 offset = evmcs_field_offset(field, NULL); 5620 if (offset < 0) 5621 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5622 5623 /* Read the field, zero-extended to a u64 value */ 5624 value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset); 5625 } 5626 5627 /* 5628 * Now copy part of this value to register or memory, as requested. 5629 * Note that the number of bits actually copied is 32 or 64 depending 5630 * on the guest's mode (32 or 64 bit), not on the given field's length. 5631 */ 5632 if (instr_info & BIT(10)) { 5633 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5634 } else { 5635 len = is_64_bit_mode(vcpu) ? 
8 : 4; 5636 if (get_vmx_mem_address(vcpu, exit_qualification, 5637 instr_info, true, len, &gva)) 5638 return 1; 5639 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5640 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5641 if (r != X86EMUL_CONTINUE) 5642 return kvm_handle_memory_failure(vcpu, r, &e); 5643 } 5644 5645 return nested_vmx_succeed(vcpu); 5646 } 5647 5648 static bool is_shadow_field_rw(unsigned long field) 5649 { 5650 switch (field) { 5651 #define SHADOW_FIELD_RW(x, y) case x: 5652 #include "vmcs_shadow_fields.h" 5653 return true; 5654 default: 5655 break; 5656 } 5657 return false; 5658 } 5659 5660 static bool is_shadow_field_ro(unsigned long field) 5661 { 5662 switch (field) { 5663 #define SHADOW_FIELD_RO(x, y) case x: 5664 #include "vmcs_shadow_fields.h" 5665 return true; 5666 default: 5667 break; 5668 } 5669 return false; 5670 } 5671 5672 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5673 { 5674 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5675 : get_vmcs12(vcpu); 5676 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5677 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5678 struct vcpu_vmx *vmx = to_vmx(vcpu); 5679 struct x86_exception e; 5680 unsigned long field; 5681 short offset; 5682 gva_t gva; 5683 int len, r; 5684 5685 /* 5686 * The value to write might be 32 or 64 bits, depending on L1's long 5687 * mode, and eventually we need to write that into a field of several 5688 * possible lengths. The code below first zero-extends the value to 64 5689 * bit (value), and then copies only the appropriate number of 5690 * bits into the vmcs12 field. 5691 */ 5692 u64 value = 0; 5693 5694 if (!nested_vmx_check_permission(vcpu)) 5695 return 1; 5696 5697 /* 5698 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5699 * any VMWRITE sets the ALU flags for VMfailInvalid. 5700 */ 5701 if (vmx->nested.current_vmptr == INVALID_GPA || 5702 (is_guest_mode(vcpu) && 5703 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5704 return nested_vmx_failInvalid(vcpu); 5705 5706 if (instr_info & BIT(10)) 5707 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5708 else { 5709 len = is_64_bit_mode(vcpu) ? 8 : 4; 5710 if (get_vmx_mem_address(vcpu, exit_qualification, 5711 instr_info, false, len, &gva)) 5712 return 1; 5713 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5714 if (r != X86EMUL_CONTINUE) 5715 return kvm_handle_memory_failure(vcpu, r, &e); 5716 } 5717 5718 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5719 5720 offset = get_vmcs12_field_offset(field); 5721 if (offset < 0) 5722 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5723 5724 /* 5725 * If the vCPU supports "VMWRITE to any supported field in the 5726 * VMCS," then the "read-only" fields are actually read/write. 5727 */ 5728 if (vmcs_field_readonly(field) && 5729 !nested_cpu_has_vmwrite_any_field(vcpu)) 5730 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5731 5732 /* 5733 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5734 * vmcs12, else we may crush a field or consume a stale value. 5735 */ 5736 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5737 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5738 5739 /* 5740 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5741 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5742 * behavior regardless of the underlying hardware, e.g. 
if an AR_BYTE 5743 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5744 * from L1 will return a different value than VMREAD from L2 (L1 sees 5745 * the stripped down value, L2 sees the full value as stored by KVM). 5746 */ 5747 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5748 value &= 0x1f0ff; 5749 5750 vmcs12_write_any(vmcs12, field, offset, value); 5751 5752 /* 5753 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5754 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5755 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5756 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5757 */ 5758 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5759 /* 5760 * L1 can read these fields without exiting, ensure the 5761 * shadow VMCS is up-to-date. 5762 */ 5763 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5764 preempt_disable(); 5765 vmcs_load(vmx->vmcs01.shadow_vmcs); 5766 5767 __vmcs_writel(field, value); 5768 5769 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5770 vmcs_load(vmx->loaded_vmcs->vmcs); 5771 preempt_enable(); 5772 } 5773 vmx->nested.dirty_vmcs12 = true; 5774 } 5775 5776 return nested_vmx_succeed(vcpu); 5777 } 5778 5779 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5780 { 5781 vmx->nested.current_vmptr = vmptr; 5782 if (enable_shadow_vmcs) { 5783 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5784 vmcs_write64(VMCS_LINK_POINTER, 5785 __pa(vmx->vmcs01.shadow_vmcs)); 5786 vmx->nested.need_vmcs12_to_shadow_sync = true; 5787 } 5788 vmx->nested.dirty_vmcs12 = true; 5789 vmx->nested.force_msr_bitmap_recalc = true; 5790 } 5791 5792 /* Emulate the VMPTRLD instruction */ 5793 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5794 { 5795 struct vcpu_vmx *vmx = to_vmx(vcpu); 5796 gpa_t vmptr; 5797 int r; 5798 5799 if (!nested_vmx_check_permission(vcpu)) 5800 return 1; 5801 5802 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5803 return r; 5804 5805 if (!page_address_valid(vcpu, vmptr)) 5806 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5807 5808 if (vmptr == vmx->nested.vmxon_ptr) 5809 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); 5810 5811 /* Forbid normal VMPTRLD if Enlightened version was used */ 5812 if (nested_vmx_is_evmptr12_valid(vmx)) 5813 return 1; 5814 5815 if (vmx->nested.current_vmptr != vmptr) { 5816 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; 5817 struct vmcs_hdr hdr; 5818 5819 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { 5820 /* 5821 * Reads from an unbacked page return all 1s, 5822 * which means that the 32 bits located at the 5823 * given physical address won't match the required 5824 * VMCS12_REVISION identifier. 5825 */ 5826 return nested_vmx_fail(vcpu, 5827 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5828 } 5829 5830 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 5831 offsetof(struct vmcs12, hdr), 5832 sizeof(hdr))) { 5833 return nested_vmx_fail(vcpu, 5834 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5835 } 5836 5837 if (hdr.revision_id != VMCS12_REVISION || 5838 (hdr.shadow_vmcs && 5839 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5840 return nested_vmx_fail(vcpu, 5841 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5842 } 5843 5844 nested_release_vmcs12(vcpu); 5845 5846 /* 5847 * Load VMCS12 from guest memory since it is not already 5848 * cached. 
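 * If the cached read below fails, the VMPTRLD is failed with the same incorrect-revision error that is used above for the unbacked-page case.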
5849 */ 5850 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5851 VMCS12_SIZE)) { 5852 return nested_vmx_fail(vcpu, 5853 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5854 } 5855 5856 set_current_vmptr(vmx, vmptr); 5857 } 5858 5859 return nested_vmx_succeed(vcpu); 5860 } 5861 5862 /* Emulate the VMPTRST instruction */ 5863 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5864 { 5865 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5866 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5867 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5868 struct x86_exception e; 5869 gva_t gva; 5870 int r; 5871 5872 if (!nested_vmx_check_permission(vcpu)) 5873 return 1; 5874 5875 if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) 5876 return 1; 5877 5878 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5879 true, sizeof(gpa_t), &gva)) 5880 return 1; 5881 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5882 r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr, 5883 sizeof(gpa_t), &e); 5884 if (r != X86EMUL_CONTINUE) 5885 return kvm_handle_memory_failure(vcpu, r, &e); 5886 5887 return nested_vmx_succeed(vcpu); 5888 } 5889 5890 /* Emulate the INVEPT instruction */ 5891 static int handle_invept(struct kvm_vcpu *vcpu) 5892 { 5893 struct vcpu_vmx *vmx = to_vmx(vcpu); 5894 u32 vmx_instruction_info, types; 5895 unsigned long type, roots_to_free; 5896 struct kvm_mmu *mmu; 5897 gva_t gva; 5898 struct x86_exception e; 5899 struct { 5900 u64 eptp, gpa; 5901 } operand; 5902 int i, r, gpr_index; 5903 5904 if (!(vmx->nested.msrs.secondary_ctls_high & 5905 SECONDARY_EXEC_ENABLE_EPT) || 5906 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5907 kvm_queue_exception(vcpu, UD_VECTOR); 5908 return 1; 5909 } 5910 5911 if (!nested_vmx_check_permission(vcpu)) 5912 return 1; 5913 5914 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5915 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5916 type = kvm_register_read(vcpu, gpr_index); 5917 5918 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5919 5920 if (type >= 32 || !(types & (1 << type))) 5921 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5922 5923 /* According to the Intel VMX instruction reference, the memory 5924 * operand is read even if it isn't needed (e.g., for type==global) 5925 */ 5926 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5927 vmx_instruction_info, false, sizeof(operand), &gva)) 5928 return 1; 5929 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5930 if (r != X86EMUL_CONTINUE) 5931 return kvm_handle_memory_failure(vcpu, r, &e); 5932 5933 /* 5934 * Nested EPT roots are always held through guest_mmu, 5935 * not root_mmu.
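 * Single-context invalidation below frees only the roots whose EPTP matches the operand (the current root plus any matching cached previous roots), whereas global invalidation frees them all.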
5936 */ 5937 mmu = &vcpu->arch.guest_mmu; 5938 5939 switch (type) { 5940 case VMX_EPT_EXTENT_CONTEXT: 5941 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5942 return nested_vmx_fail(vcpu, 5943 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5944 5945 roots_to_free = 0; 5946 if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, 5947 operand.eptp)) 5948 roots_to_free |= KVM_MMU_ROOT_CURRENT; 5949 5950 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5951 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 5952 mmu->prev_roots[i].pgd, 5953 operand.eptp)) 5954 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5955 } 5956 break; 5957 case VMX_EPT_EXTENT_GLOBAL: 5958 roots_to_free = KVM_MMU_ROOTS_ALL; 5959 break; 5960 default: 5961 BUG(); 5962 break; 5963 } 5964 5965 if (roots_to_free) 5966 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); 5967 5968 return nested_vmx_succeed(vcpu); 5969 } 5970 5971 static int handle_invvpid(struct kvm_vcpu *vcpu) 5972 { 5973 struct vcpu_vmx *vmx = to_vmx(vcpu); 5974 u32 vmx_instruction_info; 5975 unsigned long type, types; 5976 gva_t gva; 5977 struct x86_exception e; 5978 struct { 5979 u64 vpid; 5980 u64 gla; 5981 } operand; 5982 u16 vpid02; 5983 int r, gpr_index; 5984 5985 if (!(vmx->nested.msrs.secondary_ctls_high & 5986 SECONDARY_EXEC_ENABLE_VPID) || 5987 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 5988 kvm_queue_exception(vcpu, UD_VECTOR); 5989 return 1; 5990 } 5991 5992 if (!nested_vmx_check_permission(vcpu)) 5993 return 1; 5994 5995 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5996 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5997 type = kvm_register_read(vcpu, gpr_index); 5998 5999 types = (vmx->nested.msrs.vpid_caps & 6000 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 6001 6002 if (type >= 32 || !(types & (1 << type))) 6003 return nested_vmx_fail(vcpu, 6004 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6005 6006 /* according to the intel vmx instruction reference, the memory 6007 * operand is read even if it isn't needed (e.g., for type==global) 6008 */ 6009 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 6010 vmx_instruction_info, false, sizeof(operand), &gva)) 6011 return 1; 6012 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 6013 if (r != X86EMUL_CONTINUE) 6014 return kvm_handle_memory_failure(vcpu, r, &e); 6015 6016 if (operand.vpid >> 16) 6017 return nested_vmx_fail(vcpu, 6018 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6019 6020 /* 6021 * Always flush the effective vpid02, i.e. never flush the current VPID 6022 * and never explicitly flush vpid01. INVVPID targets a VPID, not a 6023 * VMCS, and so whether or not the current vmcs12 has VPID enabled is 6024 * irrelevant (and there may not be a loaded vmcs12). 6025 */ 6026 vpid02 = nested_get_vpid02(vcpu); 6027 switch (type) { 6028 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 6029 /* 6030 * LAM doesn't apply to addresses that are inputs to TLB 6031 * invalidation. 
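 * This is presumably why the check below uses the INVLPG-specific non-canonical helper rather than a plain canonical-address check.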
6032 */ 6033 if (!operand.vpid || 6034 is_noncanonical_invlpg_address(operand.gla, vcpu)) 6035 return nested_vmx_fail(vcpu, 6036 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6037 vpid_sync_vcpu_addr(vpid02, operand.gla); 6038 break; 6039 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 6040 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 6041 if (!operand.vpid) 6042 return nested_vmx_fail(vcpu, 6043 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6044 vpid_sync_context(vpid02); 6045 break; 6046 case VMX_VPID_EXTENT_ALL_CONTEXT: 6047 vpid_sync_context(vpid02); 6048 break; 6049 default: 6050 WARN_ON_ONCE(1); 6051 return kvm_skip_emulated_instruction(vcpu); 6052 } 6053 6054 /* 6055 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 6056 * linear mappings for L2 (tagged with L2's VPID). Free all guest 6057 * roots as VPIDs are not tracked in the MMU role. 6058 * 6059 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 6060 * an MMU when EPT is disabled. 6061 * 6062 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 6063 */ 6064 if (!enable_ept) 6065 kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); 6066 6067 return nested_vmx_succeed(vcpu); 6068 } 6069 6070 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 6071 struct vmcs12 *vmcs12) 6072 { 6073 u32 index = kvm_rcx_read(vcpu); 6074 u64 new_eptp; 6075 6076 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 6077 return 1; 6078 if (index >= VMFUNC_EPTP_ENTRIES) 6079 return 1; 6080 6081 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 6082 &new_eptp, index * 8, 8)) 6083 return 1; 6084 6085 /* 6086 * If the (L2) guest does a vmfunc to the currently 6087 * active ept pointer, we don't have to do anything else 6088 */ 6089 if (vmcs12->ept_pointer != new_eptp) { 6090 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 6091 return 1; 6092 6093 vmcs12->ept_pointer = new_eptp; 6094 nested_ept_new_eptp(vcpu); 6095 6096 if (!nested_cpu_has_vpid(vmcs12)) 6097 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 6098 } 6099 6100 return 0; 6101 } 6102 6103 static int handle_vmfunc(struct kvm_vcpu *vcpu) 6104 { 6105 struct vcpu_vmx *vmx = to_vmx(vcpu); 6106 struct vmcs12 *vmcs12; 6107 u32 function = kvm_rax_read(vcpu); 6108 6109 /* 6110 * VMFUNC should never execute cleanly while L1 is active; KVM supports 6111 * VMFUNC for nested VMs, but not for L1. 6112 */ 6113 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { 6114 kvm_queue_exception(vcpu, UD_VECTOR); 6115 return 1; 6116 } 6117 6118 vmcs12 = get_vmcs12(vcpu); 6119 6120 /* 6121 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 6122 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 6123 */ 6124 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 6125 kvm_queue_exception(vcpu, UD_VECTOR); 6126 return 1; 6127 } 6128 6129 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 6130 goto fail; 6131 6132 switch (function) { 6133 case 0: 6134 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 6135 goto fail; 6136 break; 6137 default: 6138 goto fail; 6139 } 6140 return kvm_skip_emulated_instruction(vcpu); 6141 6142 fail: 6143 /* 6144 * This is effectively a reflected VM-Exit, as opposed to a synthesized 6145 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 6146 * EXIT_REASON_VMFUNC as the exit reason. 
6147 */ 6148 nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full, 6149 vmx_get_intr_info(vcpu), 6150 vmx_get_exit_qual(vcpu)); 6151 return 1; 6152 } 6153 6154 /* 6155 * Return true if an IO instruction with the specified port and size should cause 6156 * a VM-exit into L1. 6157 */ 6158 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 6159 int size) 6160 { 6161 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6162 gpa_t bitmap, last_bitmap; 6163 u8 b; 6164 6165 last_bitmap = INVALID_GPA; 6166 b = -1; 6167 6168 while (size > 0) { 6169 if (port < 0x8000) 6170 bitmap = vmcs12->io_bitmap_a; 6171 else if (port < 0x10000) 6172 bitmap = vmcs12->io_bitmap_b; 6173 else 6174 return true; 6175 bitmap += (port & 0x7fff) / 8; 6176 6177 if (last_bitmap != bitmap) 6178 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 6179 return true; 6180 if (b & (1 << (port & 7))) 6181 return true; 6182 6183 port++; 6184 size--; 6185 last_bitmap = bitmap; 6186 } 6187 6188 return false; 6189 } 6190 6191 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 6192 struct vmcs12 *vmcs12) 6193 { 6194 unsigned long exit_qualification; 6195 unsigned short port; 6196 int size; 6197 6198 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 6199 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 6200 6201 exit_qualification = vmx_get_exit_qual(vcpu); 6202 6203 port = exit_qualification >> 16; 6204 size = (exit_qualification & 7) + 1; 6205 6206 return nested_vmx_check_io_bitmaps(vcpu, port, size); 6207 } 6208 6209 /* 6210 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 6211 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 6212 * disinterest in the current event (read or write a specific MSR) by using an 6213 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 6214 */ 6215 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 6216 struct vmcs12 *vmcs12, 6217 union vmx_exit_reason exit_reason) 6218 { 6219 u32 msr_index = kvm_rcx_read(vcpu); 6220 gpa_t bitmap; 6221 6222 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 6223 return true; 6224 6225 /* 6226 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 6227 * for the four combinations of read/write and low/high MSR numbers. 6228 * First we need to figure out which of the four to use: 6229 */ 6230 bitmap = vmcs12->msr_bitmap; 6231 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 6232 bitmap += 2048; 6233 if (msr_index >= 0xc0000000) { 6234 msr_index -= 0xc0000000; 6235 bitmap += 1024; 6236 } 6237 6238 /* Then read the msr_index'th bit from this bitmap: */ 6239 if (msr_index < 1024*8) { 6240 unsigned char b; 6241 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 6242 return true; 6243 return 1 & (b >> (msr_index & 7)); 6244 } else 6245 return true; /* let L1 handle the wrong parameter */ 6246 } 6247 6248 /* 6249 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 6250 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 6251 * intercept (via guest_host_mask etc.) the current event. 
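 * An illustrative case: if vmcs12->cr0_guest_host_mask has only X86_CR0_TS set and cr0_read_shadow has TS clear, a MOV-to-CR0 from L2 that sets TS differs from the read shadow in a bit L1 owns and must be reflected to L1, while a write that leaves TS clear agrees with the shadow in every masked bit and is handled in L0.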
6252 */ 6253 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 6254 struct vmcs12 *vmcs12) 6255 { 6256 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 6257 int cr = exit_qualification & 15; 6258 int reg; 6259 unsigned long val; 6260 6261 switch ((exit_qualification >> 4) & 3) { 6262 case 0: /* mov to cr */ 6263 reg = (exit_qualification >> 8) & 15; 6264 val = kvm_register_read(vcpu, reg); 6265 switch (cr) { 6266 case 0: 6267 if (vmcs12->cr0_guest_host_mask & 6268 (val ^ vmcs12->cr0_read_shadow)) 6269 return true; 6270 break; 6271 case 3: 6272 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 6273 return true; 6274 break; 6275 case 4: 6276 if (vmcs12->cr4_guest_host_mask & 6277 (vmcs12->cr4_read_shadow ^ val)) 6278 return true; 6279 break; 6280 case 8: 6281 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 6282 return true; 6283 break; 6284 } 6285 break; 6286 case 2: /* clts */ 6287 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 6288 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 6289 return true; 6290 break; 6291 case 1: /* mov from cr */ 6292 switch (cr) { 6293 case 3: 6294 if (vmcs12->cpu_based_vm_exec_control & 6295 CPU_BASED_CR3_STORE_EXITING) 6296 return true; 6297 break; 6298 case 8: 6299 if (vmcs12->cpu_based_vm_exec_control & 6300 CPU_BASED_CR8_STORE_EXITING) 6301 return true; 6302 break; 6303 } 6304 break; 6305 case 3: /* lmsw */ 6306 /* 6307 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 6308 * cr0. Other attempted changes are ignored, with no exit. 6309 */ 6310 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 6311 if (vmcs12->cr0_guest_host_mask & 0xe & 6312 (val ^ vmcs12->cr0_read_shadow)) 6313 return true; 6314 if ((vmcs12->cr0_guest_host_mask & 0x1) && 6315 !(vmcs12->cr0_read_shadow & 0x1) && 6316 (val & 0x1)) 6317 return true; 6318 break; 6319 } 6320 return false; 6321 } 6322 6323 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 6324 struct vmcs12 *vmcs12) 6325 { 6326 u32 encls_leaf; 6327 6328 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) || 6329 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 6330 return false; 6331 6332 encls_leaf = kvm_rax_read(vcpu); 6333 if (encls_leaf > 62) 6334 encls_leaf = 63; 6335 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 6336 } 6337 6338 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 6339 struct vmcs12 *vmcs12, gpa_t bitmap) 6340 { 6341 u32 vmx_instruction_info; 6342 unsigned long field; 6343 u8 b; 6344 6345 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 6346 return true; 6347 6348 /* Decode instruction info and find the field to access */ 6349 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6350 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 6351 6352 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 6353 if (field >> 15) 6354 return true; 6355 6356 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 6357 return true; 6358 6359 return 1 & (b >> (field & 7)); 6360 } 6361 6362 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 6363 { 6364 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 6365 6366 if (nested_cpu_has_mtf(vmcs12)) 6367 return true; 6368 6369 /* 6370 * An MTF VM-exit may be injected into the guest by setting the 6371 * interruption-type to 7 (other event) and the vector field to 0. Such 6372 * is the case regardless of the 'monitor trap flag' VM-execution 6373 * control. 
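 * The comparison below matches exactly that encoding: the valid bit plus interruption type 7 (INTR_TYPE_OTHER_EVENT) and an all-zero vector field, with no other bits set.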
6374 */ 6375 return entry_intr_info == (INTR_INFO_VALID_MASK 6376 | INTR_TYPE_OTHER_EVENT); 6377 } 6378 6379 /* 6380 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 6381 * L1 wants the exit. Only call this when in is_guest_mode (L2). 6382 */ 6383 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 6384 union vmx_exit_reason exit_reason) 6385 { 6386 u32 intr_info; 6387 6388 switch ((u16)exit_reason.basic) { 6389 case EXIT_REASON_EXCEPTION_NMI: 6390 intr_info = vmx_get_intr_info(vcpu); 6391 if (is_nmi(intr_info)) 6392 return true; 6393 else if (is_page_fault(intr_info)) 6394 return vcpu->arch.apf.host_apf_flags || 6395 vmx_need_pf_intercept(vcpu); 6396 else if (is_debug(intr_info) && 6397 vcpu->guest_debug & 6398 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 6399 return true; 6400 else if (is_breakpoint(intr_info) && 6401 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 6402 return true; 6403 else if (is_alignment_check(intr_info) && 6404 !vmx_guest_inject_ac(vcpu)) 6405 return true; 6406 else if (is_ve_fault(intr_info)) 6407 return true; 6408 return false; 6409 case EXIT_REASON_EXTERNAL_INTERRUPT: 6410 return true; 6411 case EXIT_REASON_MCE_DURING_VMENTRY: 6412 return true; 6413 case EXIT_REASON_EPT_VIOLATION: 6414 /* 6415 * L0 always deals with the EPT violation. If nested EPT is 6416 * used, and the nested mmu code discovers that the address is 6417 * missing in the guest EPT table (EPT12), the EPT violation 6418 * will be injected with nested_ept_inject_page_fault() 6419 */ 6420 return true; 6421 case EXIT_REASON_EPT_MISCONFIG: 6422 /* 6423 * L2 never uses directly L1's EPT, but rather L0's own EPT 6424 * table (shadow on EPT) or a merged EPT table that L0 built 6425 * (EPT on EPT). So any problems with the structure of the 6426 * table is L0's fault. 6427 */ 6428 return true; 6429 case EXIT_REASON_PREEMPTION_TIMER: 6430 return true; 6431 case EXIT_REASON_PML_FULL: 6432 /* 6433 * PML is emulated for an L1 VMM and should never be enabled in 6434 * vmcs02, always "handle" PML_FULL by exiting to userspace. 6435 */ 6436 return true; 6437 case EXIT_REASON_VMFUNC: 6438 /* VM functions are emulated through L2->L0 vmexits. */ 6439 return true; 6440 case EXIT_REASON_BUS_LOCK: 6441 /* 6442 * At present, bus lock VM exit is never exposed to L1. 6443 * Handle L2's bus locks in L0 directly. 6444 */ 6445 return true; 6446 #ifdef CONFIG_KVM_HYPERV 6447 case EXIT_REASON_VMCALL: 6448 /* Hyper-V L2 TLB flush hypercall is handled by L0 */ 6449 return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && 6450 nested_evmcs_l2_tlb_flush_enabled(vcpu) && 6451 kvm_hv_is_tlb_flush_hcall(vcpu); 6452 #endif 6453 default: 6454 break; 6455 } 6456 return false; 6457 } 6458 6459 /* 6460 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 6461 * is_guest_mode (L2). 
6462 */ 6463 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 6464 union vmx_exit_reason exit_reason) 6465 { 6466 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6467 u32 intr_info; 6468 6469 switch ((u16)exit_reason.basic) { 6470 case EXIT_REASON_EXCEPTION_NMI: 6471 intr_info = vmx_get_intr_info(vcpu); 6472 if (is_nmi(intr_info)) 6473 return true; 6474 else if (is_page_fault(intr_info)) 6475 return true; 6476 return vmcs12->exception_bitmap & 6477 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 6478 case EXIT_REASON_EXTERNAL_INTERRUPT: 6479 return nested_exit_on_intr(vcpu); 6480 case EXIT_REASON_TRIPLE_FAULT: 6481 return true; 6482 case EXIT_REASON_INTERRUPT_WINDOW: 6483 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 6484 case EXIT_REASON_NMI_WINDOW: 6485 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 6486 case EXIT_REASON_TASK_SWITCH: 6487 return true; 6488 case EXIT_REASON_CPUID: 6489 return true; 6490 case EXIT_REASON_HLT: 6491 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 6492 case EXIT_REASON_INVD: 6493 return true; 6494 case EXIT_REASON_INVLPG: 6495 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6496 case EXIT_REASON_RDPMC: 6497 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 6498 case EXIT_REASON_RDRAND: 6499 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 6500 case EXIT_REASON_RDSEED: 6501 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 6502 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 6503 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 6504 case EXIT_REASON_VMREAD: 6505 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6506 vmcs12->vmread_bitmap); 6507 case EXIT_REASON_VMWRITE: 6508 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6509 vmcs12->vmwrite_bitmap); 6510 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 6511 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 6512 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 6513 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6514 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 6515 /* 6516 * VMX instructions trap unconditionally. This allows L1 to 6517 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
6518 */ 6519 return true; 6520 case EXIT_REASON_CR_ACCESS: 6521 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 6522 case EXIT_REASON_DR_ACCESS: 6523 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 6524 case EXIT_REASON_IO_INSTRUCTION: 6525 return nested_vmx_exit_handled_io(vcpu, vmcs12); 6526 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 6527 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 6528 case EXIT_REASON_MSR_READ: 6529 case EXIT_REASON_MSR_WRITE: 6530 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 6531 case EXIT_REASON_INVALID_STATE: 6532 return true; 6533 case EXIT_REASON_MWAIT_INSTRUCTION: 6534 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 6535 case EXIT_REASON_MONITOR_TRAP_FLAG: 6536 return nested_vmx_exit_handled_mtf(vmcs12); 6537 case EXIT_REASON_MONITOR_INSTRUCTION: 6538 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 6539 case EXIT_REASON_PAUSE_INSTRUCTION: 6540 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 6541 nested_cpu_has2(vmcs12, 6542 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 6543 case EXIT_REASON_MCE_DURING_VMENTRY: 6544 return true; 6545 case EXIT_REASON_TPR_BELOW_THRESHOLD: 6546 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 6547 case EXIT_REASON_APIC_ACCESS: 6548 case EXIT_REASON_APIC_WRITE: 6549 case EXIT_REASON_EOI_INDUCED: 6550 /* 6551 * The controls for "virtualize APIC accesses," "APIC- 6552 * register virtualization," and "virtual-interrupt 6553 * delivery" only come from vmcs12. 6554 */ 6555 return true; 6556 case EXIT_REASON_INVPCID: 6557 return 6558 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 6559 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6560 case EXIT_REASON_WBINVD: 6561 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6562 case EXIT_REASON_XSETBV: 6563 return true; 6564 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 6565 /* 6566 * This should never happen, since it is not possible to 6567 * set XSS to a non-zero value---neither in L1 nor in L2. 6568 * If if it were, XSS would have to be checked against 6569 * the XSS exit bitmap in vmcs12. 6570 */ 6571 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); 6572 case EXIT_REASON_UMWAIT: 6573 case EXIT_REASON_TPAUSE: 6574 return nested_cpu_has2(vmcs12, 6575 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 6576 case EXIT_REASON_ENCLS: 6577 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 6578 case EXIT_REASON_NOTIFY: 6579 /* Notify VM exit is not exposed to L1 */ 6580 return false; 6581 default: 6582 return true; 6583 } 6584 } 6585 6586 /* 6587 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 6588 * reflected into L1. 6589 */ 6590 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 6591 { 6592 struct vcpu_vmx *vmx = to_vmx(vcpu); 6593 union vmx_exit_reason exit_reason = vmx->vt.exit_reason; 6594 unsigned long exit_qual; 6595 u32 exit_intr_info; 6596 6597 WARN_ON_ONCE(vmx->nested.nested_run_pending); 6598 6599 /* 6600 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 6601 * has already loaded L2's state. 6602 */ 6603 if (unlikely(vmx->fail)) { 6604 trace_kvm_nested_vmenter_failed( 6605 "hardware VM-instruction error: ", 6606 vmcs_read32(VM_INSTRUCTION_ERROR)); 6607 exit_intr_info = 0; 6608 exit_qual = 0; 6609 goto reflect_vmexit; 6610 } 6611 6612 trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); 6613 6614 /* If L0 (KVM) wants the exit, it trumps L1's desires. 
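 * Exits that L0 must always handle itself (e.g. EPT violations, the preemption timer, machine checks during VM-entry) are therefore never reflected, regardless of what vmcs12 requests.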
*/ 6615 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6616 return false; 6617 6618 /* If L1 doesn't want the exit, handle it in L0. */ 6619 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6620 return false; 6621 6622 /* 6623 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6624 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6625 * need to be synthesized by querying the in-kernel LAPIC, but external 6626 * interrupts are never reflected to L1 so it's a non-issue. 6627 */ 6628 exit_intr_info = vmx_get_intr_info(vcpu); 6629 if (is_exception_with_error_code(exit_intr_info)) { 6630 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6631 6632 vmcs12->vm_exit_intr_error_code = 6633 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6634 } 6635 exit_qual = vmx_get_exit_qual(vcpu); 6636 6637 reflect_vmexit: 6638 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6639 return true; 6640 } 6641 6642 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6643 struct kvm_nested_state __user *user_kvm_nested_state, 6644 u32 user_data_size) 6645 { 6646 struct vcpu_vmx *vmx; 6647 struct vmcs12 *vmcs12; 6648 struct kvm_nested_state kvm_state = { 6649 .flags = 0, 6650 .format = KVM_STATE_NESTED_FORMAT_VMX, 6651 .size = sizeof(kvm_state), 6652 .hdr.vmx.flags = 0, 6653 .hdr.vmx.vmxon_pa = INVALID_GPA, 6654 .hdr.vmx.vmcs12_pa = INVALID_GPA, 6655 .hdr.vmx.preemption_timer_deadline = 0, 6656 }; 6657 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6658 &user_kvm_nested_state->data.vmx[0]; 6659 6660 if (!vcpu) 6661 return kvm_state.size + sizeof(*user_vmx_nested_state); 6662 6663 vmx = to_vmx(vcpu); 6664 vmcs12 = get_vmcs12(vcpu); 6665 6666 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) && 6667 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6668 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6669 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6670 6671 if (vmx_has_valid_vmcs12(vcpu)) { 6672 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6673 6674 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ 6675 if (nested_vmx_is_evmptr12_set(vmx)) 6676 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6677 6678 if (is_guest_mode(vcpu) && 6679 nested_cpu_has_shadow_vmcs(vmcs12) && 6680 vmcs12->vmcs_link_pointer != INVALID_GPA) 6681 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6682 } 6683 6684 if (vmx->nested.smm.vmxon) 6685 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6686 6687 if (vmx->nested.smm.guest_mode) 6688 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6689 6690 if (is_guest_mode(vcpu)) { 6691 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6692 6693 if (vmx->nested.nested_run_pending) 6694 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6695 6696 if (vmx->nested.mtf_pending) 6697 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6698 6699 if (nested_cpu_has_preemption_timer(vmcs12) && 6700 vmx->nested.has_preemption_timer_deadline) { 6701 kvm_state.hdr.vmx.flags |= 6702 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6703 kvm_state.hdr.vmx.preemption_timer_deadline = 6704 vmx->nested.preemption_timer_deadline; 6705 } 6706 } 6707 } 6708 6709 if (user_data_size < kvm_state.size) 6710 goto out; 6711 6712 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6713 return -EFAULT; 6714 6715 if (!vmx_has_valid_vmcs12(vcpu)) 6716 goto out; 6717 6718 /* 6719 * When running L2, the authoritative vmcs12 state is in the 6720 * vmcs02. 

void vmx_leave_nested(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu)) {
		to_vmx(vcpu)->nested.nested_run_pending = 0;
		nested_vmx_vmexit(vcpu, -1, 0, 0);
	}
	free_nested(vcpu);
}

static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	enum vm_entry_failure_code ignored;
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];
	int ret;

	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
		return -EINVAL;

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
		if (kvm_state->hdr.vmx.smm.flags)
			return -EINVAL;

		if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
			return -EINVAL;

		/*
		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
		 * enable the eVMCS capability on the vCPU.  However, the code
		 * was since changed such that the flag signals that vmcs12
		 * should be copied into the eVMCS in guest memory.
		 *
		 * To preserve backwards compatibility, allow userspace
		 * to set this flag even when there is no VMXON region.
		 */
		if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
			return -EINVAL;
	} else {
		if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
			return -EINVAL;

		if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
			return -EINVAL;
	}

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	if (kvm_state->hdr.vmx.smm.flags &
	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
		return -EINVAL;

	/*
	 * SMM temporarily disables VMX, so we cannot be in guest mode,
	 * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
	 * must be zero.
	 */
	if (is_smm(vcpu) ?
		(kvm_state->flags &
		 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
		: kvm_state->hdr.vmx.smm.flags)
		return -EINVAL;

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
	    (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) ||
	     !vmx->nested.enlightened_vmcs_enabled))
		return -EINVAL;

	vmx_leave_nested(vcpu);

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
		return 0;

	vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
	ret = enter_vmx_operation(vcpu);
	if (ret)
		return ret;

	/* Empty 'VMXON' state is permitted if no VMCS loaded */
	if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
		/* See vmx_has_valid_vmcs12. */
		if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
		    (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
		    (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
			return -EINVAL;
		else
			return 0;
	}

	if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
		if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
		    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
			return -EINVAL;

		set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
#ifdef CONFIG_KVM_HYPERV
	} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
		/*
		 * nested_vmx_handle_enlightened_vmptrld() cannot be called
		 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
		 * restored yet.  EVMCS will be mapped from
		 * nested_get_vmcs12_pages().
		 */
		vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
#endif
	} else {
		return -EINVAL;
	}

	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
		vmx->nested.smm.vmxon = true;
		vmx->nested.vmxon = false;

		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
			vmx->nested.smm.guest_mode = true;
	}

	vmcs12 = get_vmcs12(vcpu);
	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
		return -EFAULT;

	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return 0;

	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	vmx->nested.mtf_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);

	ret = -EINVAL;
	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);

		if (kvm_state->size <
		    sizeof(*kvm_state) +
		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
			goto error_guest_mode;

		if (copy_from_user(shadow_vmcs12,
				   user_vmx_nested_state->shadow_vmcs12,
				   sizeof(*shadow_vmcs12))) {
			ret = -EFAULT;
			goto error_guest_mode;
		}

		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    !shadow_vmcs12->hdr.shadow_vmcs)
			goto error_guest_mode;
	}

	vmx->nested.has_preemption_timer_deadline = false;
	if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
		vmx->nested.has_preemption_timer_deadline = true;
		vmx->nested.preemption_timer_deadline =
			kvm_state->hdr.vmx.preemption_timer_deadline;
	}

	if (nested_vmx_check_controls(vcpu, vmcs12) ||
	    nested_vmx_check_host_state(vcpu, vmcs12) ||
	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
		goto error_guest_mode;

	vmx->nested.dirty_vmcs12 = true;
	vmx->nested.force_msr_bitmap_recalc = true;
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
	if (ret)
		goto error_guest_mode;

	if (vmx->nested.mtf_pending)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	return 0;

error_guest_mode:
	vmx->nested.nested_run_pending = 0;
	return ret;
}

void nested_vmx_set_vmcs_shadowing_bitmap(void)
{
	if (enable_shadow_vmcs) {
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
	}
}

/*
 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6.  Undo
 * that madness to get the encoding for comparison.
 */
#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))
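
/*
 * Worked example of the rotation (illustrative values only): the 16-bit field
 * encoding 0x2000 (IO_BITMAP_A) rotated left by 6 yields the vmcs12 index
 * 0x0008, and VMCS12_IDX_TO_ENC(0x0008) == ((0x0008 >> 6) | (0x0008 << 10))
 * == 0x2000, i.e. the macro recovers the original encoding.
 */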

static u64 nested_vmx_calc_vmcs_enum_msr(void)
{
	/*
	 * Note these are the so-called "index" of the VMCS field encoding, not
	 * the index into vmcs12.
	 */
	unsigned int max_idx, idx;
	int i;

	/*
	 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
	 * vmcs12, regardless of whether or not the associated feature is
	 * exposed to L1.  Simply find the field with the highest index.
	 */
	max_idx = 0;
	for (i = 0; i < nr_vmcs12_fields; i++) {
		/* The vmcs12 table is very, very sparsely populated. */
		if (!vmcs12_field_offsets[i])
			continue;

		idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
		if (idx > max_idx)
			max_idx = idx;
	}

	return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
}

static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->pinbased_ctls_low =
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(enable_apicv ? PIN_BASED_POSTED_INTR : 0);
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;
}

static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
		VM_EXIT_CLEAR_BNDCFGS;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
		VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
}

static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf,
					struct nested_vmx_msrs *msrs)
{
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
		 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);

	/* We support free control of debug control loading. */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
}

static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
	msrs->procbased_ctls_high &=
		CPU_BASED_INTR_WINDOW_EXITING |
		CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware.  For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
}

static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
					    struct vmcs_config *vmcs_conf,
					    struct nested_vmx_msrs *msrs)
{
	msrs->secondary_ctls_low = 0;

	msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_ENABLE_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_ENABLE_VMFUNC |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_ENABLE_XSAVES |
		SECONDARY_EXEC_TSC_SCALING |
		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps =
			VMX_EPT_PAGE_WALK_4_BIT |
			VMX_EPT_PAGE_WALK_5_BIT |
			VMX_EPTP_WB_BIT |
			VMX_EPT_INVEPT_BIT |
			VMX_EPT_EXECUTE_ONLY_BIT;

		msrs->ept_caps &= ept_caps;
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}

		/*
		 * Advertise EPTP switching irrespective of hardware support;
		 * KVM emulates it in software so long as VMFUNC is supported.
		 */
		if (cpu_has_vmx_vmfunc())
			msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version without
	 * checking for support, so declare that it is supported even
	 * though it is treated as global context.  The alternative is
	 * not failing the single-context invvpid, and it is worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	if (enable_sgx)
		msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
}

static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT |
		VMX_MISC_ACTIVITY_WAIT_SIPI;
	msrs->misc_high = 0;
}

static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
{
	/*
	 * This MSR reports some information about VMX support.  We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE,
						 X86_MEMTYPE_WB);

	msrs->basic |= VMX_BASIC_TRUE_CTLS;
	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;
}

static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
{
	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/* These MSRs specify bits which the guest must keep fixed off. */
	rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);

	if (vmx_umip_emulated())
		msrs->cr4_fixed1 |= X86_CR4_UMIP;
}
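
/*
 * A minimal sketch of how the fixed-bit values built above are meant to be
 * consumed (illustrative only; the in-tree checks live in other helpers and
 * may differ in form): a CR0/CR4 value is acceptable while in VMX operation
 * iff every bit that must be 1 (set in fixed0) is set and no bit that must
 * be 0 (clear in fixed1) is set, e.g.
 *
 *	bool ok = (cr0 & msrs->cr0_fixed0) == msrs->cr0_fixed0 &&
 *		  (cr0 & ~msrs->cr0_fixed1) == 0;
 *
 * The allowed-0/allowed-1 (low/high) halves of the control MSRs set up by
 * nested_vmx_setup_ctls_msrs() below follow the same pattern; see
 * vmx_control_verify().
 */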

/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields
 * are valid during nested entry from L1 to L2.
 * Each of these control MSRs has a low and high 32-bit half: a low bit is on
 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 * bit in the high half is on if the corresponding bit in the control field
 * may be on.  See also vmx_control_verify().
 */
void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
{
	struct nested_vmx_msrs *msrs = &vmcs_conf->nested;

	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits.  The
	 * reason is that if one of these bits is necessary, it will appear
	 * in vmcs01, and prepare_vmcs02, which bitwise-or's the control
	 * fields of vmcs01 and vmcs12, will keep the bit set in vmcs02 even
	 * if L1 cleared it - and nested_vmx_l1_wants_exit() will not pass
	 * the related exits to L1.
	 * These rules have exceptions below.
	 */
	nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_exit_ctls(vmcs_conf, msrs);

	nested_vmx_setup_entry_ctls(vmcs_conf, msrs);

	nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs);

	nested_vmx_setup_misc_data(vmcs_conf, msrs);

	nested_vmx_setup_basic(msrs);

	nested_vmx_setup_cr_fixed(msrs);

	msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
}

void nested_vmx_hardware_unsetup(void)
{
	int i;

	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++)
			free_page((unsigned long)vmx_bitmap[i]);
	}
}

__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
	int i;

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++) {
			/*
			 * The vmx_bitmap is not tied to a VM and so should
			 * not be charged to a memcg.
			 */
			vmx_bitmap[i] = (unsigned long *)
				__get_free_page(GFP_KERNEL);
			if (!vmx_bitmap[i]) {
				nested_vmx_hardware_unsetup();
				return -ENOMEM;
			}
		}

		init_vmcs_shadow_fields();
	}

	exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
	exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
	exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
	exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
	exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
	exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
	exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
	exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff;
	exit_handlers[EXIT_REASON_VMON] = handle_vmxon;
	exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
	exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
	exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;

	return 0;
}

struct kvm_x86_nested_ops vmx_nested_ops = {
	.leave_nested = vmx_leave_nested,
	.is_exception_vmexit = nested_vmx_is_exception_vmexit,
	.check_events = vmx_check_nested_events,
	.has_events = vmx_has_nested_events,
	.triple_fault = nested_vmx_triple_fault,
	.get_state = vmx_get_nested_state,
	.set_state = vmx_set_nested_state,
	.get_nested_state_pages = vmx_get_nested_state_pages,
	.write_log_dirty = nested_vmx_write_pml_buffer,
#ifdef CONFIG_KVM_HYPERV
	.enable_evmcs = nested_enable_evmcs,
	.get_evmcs_version = nested_get_evmcs_version,
	.hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
#endif
};