x86.c - OpenGrok cross reference for /linux/arch/x86/kvm/x86.c

Lines Matching +full:supervisor +full:- +full:mode +full:- +full:visible
1 // SPDX-License-Identifier: GPL-2.0-only
3  * Kernel-based Virtual Machine driver for Linux
16  *   Ben-Ami Yassour <benami@il.ibm.com>
48 #include <linux/user-return-notifier.h>
107 	((struct kvm_vcpu *)(ctxt)->vcpu)
110  * - enable syscall per default because its emulated by KVM
111  * - enable LME and LMA per default on 64 bit KVM
145 				*(((struct kvm_x86_ops *)0)->func));
148 #include <asm/kvm-x86-ops.h>
165 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
174  * Flags to manipulate forced emulation behavior (any non-zero value will
181 int __read_mostly pi_inject_timer = -1;
461  * List of MSRs that control the existence of MSR-based features, i.e. MSRs
474 			      (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
566 					  size - useroffset, NULL);
575 		vcpu->arch.apf.gfns[i] = ~0;
591 	if (msrs->registered) {
592 		msrs->registered = false;
597 		values = &msrs->values[slot];
598 		if (values->host != values->curr) {
599 			wrmsrq(kvm_uret_msrs_list[slot], values->host);
600 			values->curr = values->host;
625 		return -1;
640 	return -1;
652 		msrs->values[i].host = value;
653 		msrs->values[i].curr = value;
659 	if (!msrs->registered) {
660 		msrs->urn.on_user_return = kvm_on_user_return;
661 		user_return_notifier_register(&msrs->urn);
662 		msrs->registered = true;
671 	value = (value & mask) | (msrs->values[slot].host & ~mask);
672 	if (value == msrs->values[slot].curr)
678 	msrs->values[slot].curr = value;
688 	msrs->values[slot].curr = value;
695 	return this_cpu_ptr(user_return_msrs)->values[slot].curr;
703 	if (msrs->registered)
704 		kvm_on_user_return(&msrs->urn);
758 	 * #DBs can be trap-like or fault-like, the caller must check other CPU
777 	if (!ex->has_payload)
780 	switch (ex->vector) {
783 		 * "Certain debug exceptions may clear bit 0-3.  The
787 		vcpu->arch.dr6 &= ~DR_TRAP_BITS;
796 		 * Active low bits should be cleared if 1-setting in payload.
797 		 * Active high bits should be set if 1-setting in payload.
804 		vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
805 		vcpu->arch.dr6 |= ex->payload;
806 		vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW;
814 		vcpu->arch.dr6 &= ~BIT(12);
817 		vcpu->arch.cr2 = ex->payload;
821 	ex->has_payload = false;
822 	ex->payload = 0;
830 	struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
832 	ex->vector = vector;
833 	ex->injected = false;
834 	ex->pending = true;
835 	ex->has_error_code = has_error_code;
836 	ex->error_code = error_code;
837 	ex->has_payload = has_payload;
838 	ex->payload = payload;
851 	 * If the exception is destined for L2, morph it to a VM-Exit if L1
855 	    kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
861 	if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
863 		vcpu->arch.exception.pending = true;
864 		vcpu->arch.exception.injected = false;
866 		vcpu->arch.exception.has_error_code = has_error;
867 		vcpu->arch.exception.vector = nr;
868 		vcpu->arch.exception.error_code = error_code;
869 		vcpu->arch.exception.has_payload = has_payload;
870 		vcpu->arch.exception.payload = payload;
873 						      &vcpu->arch.exception);
878 	prev_nr = vcpu->arch.exception.vector;
880 		/* triple fault -> shutdown */
892 		vcpu->arch.exception.injected = false;
893 		vcpu->arch.exception.pending = false;
898 		   that instruction re-execution will regenerate lost
929 	 * On VM-Entry, an exception can be pending if and only if event
939 	 * re-checking is incorrect if _L1_ injected the exception, in which
944 	vcpu->arch.exception.injected = true;
945 	vcpu->arch.exception.has_error_code = has_error_code;
946 	vcpu->arch.exception.vector = nr;
947 	vcpu->arch.exception.error_code = error_code;
948 	vcpu->arch.exception.has_payload = false;
949 	vcpu->arch.exception.payload = 0;
977 	++vcpu->stat.pf_guest;
980 	 * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of
983 	if (is_guest_mode(vcpu) && fault->async_page_fault)
985 					   true, fault->error_code,
986 					   true, fault->address);
988 		kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
989 					fault->address);
996 	WARN_ON_ONCE(fault->vector != PF_VECTOR);
998 	fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
999 					       vcpu->arch.walk_mmu;
1005 	if ((fault->error_code & PFERR_PRESENT_MASK) &&
1006 	    !(fault->error_code & PFERR_RSVD_MASK))
1007 		kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address,
1010 	fault_mmu->inject_page_fault(vcpu, fault);
1016 	atomic_inc(&vcpu->arch.nmi_queued);
1050 	return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
1058 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
1063 	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
1091 	if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
1092 		kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);
1094 	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
1097 	vcpu->arch.pdptrs_from_userspace = false;
1122 	 * CR0.WP is incorporated into the MMU role, but only for non-nested,
1169 	if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
1180 	if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
1208 	if (vcpu->arch.guest_state_protected)
1213 		if (vcpu->arch.xcr0 != kvm_host.xcr0)
1214 			xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
1217 		    vcpu->arch.ia32_xss != kvm_host.xss)
1218 			wrmsrq(MSR_IA32_XSS, vcpu->arch.ia32_xss);
1222 	    vcpu->arch.pkru != vcpu->arch.host_pkru &&
1223 	    ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
1225 		wrpkru(vcpu->arch.pkru);
1231 	if (vcpu->arch.guest_state_protected)
1235 	    ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
1237 		vcpu->arch.pkru = rdpkru();
1238 		if (vcpu->arch.pkru != vcpu->arch.host_pkru)
1239 			wrpkru(vcpu->arch.host_pkru);
1244 		if (vcpu->arch.xcr0 != kvm_host.xcr0)
1248 		    vcpu->arch.ia32_xss != kvm_host.xss)
1258 	return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC;
1265 	u64 old_xcr0 = vcpu->arch.xcr0;
1281 	valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
1300 	vcpu->arch.xcr0 = xcr0;
1303 		vcpu->arch.cpuid_dynamic_bits_dirty = true;
1333 	 * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
1347 	 * - CR4.PCIDE is changed from 1 to 0
1348 	 * - CR4.PGE is toggled
1359 	 * - CR4.SMEP is changed from 0 to 1
1360 	 * - CR4.PAE is toggled
1405 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1440 		if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
1443 	kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
1463 	 * Do not condition the GPA check on long mode, this helper is used to
1465 	 * the current vCPU mode is accurate.
1476 	vcpu->arch.cr3 = cr3;
1485 	 * and it's impossible to use a non-zero PCID when PCID is disabled,
1502 		vcpu->arch.cr8 = cr8;
1512 		return vcpu->arch.cr8;
1520 	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1522 			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
1530 	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1531 		dr7 = vcpu->arch.guest_debug_dr7;
1533 		dr7 = vcpu->arch.dr7;
1535 	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
1537 		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
1555 	size_t size = ARRAY_SIZE(vcpu->arch.db);
1559 		vcpu->arch.db[array_index_nospec(dr, size)] = val;
1560 		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1561 			vcpu->arch.eff_db[dr] = val;
1567 		vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
1573 		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
1584 	size_t size = ARRAY_SIZE(vcpu->arch.db);
1588 		return vcpu->arch.db[array_index_nospec(dr, size)];
1591 		return vcpu->arch.dr6;
1594 		return vcpu->arch.dr7;
1618  *   10 - MISC_PACKAGE_CTRLS
1619  *   11 - ENERGY_FILTERING_CTL
1620  *   12 - DOITM
1621  *   18 - FB_CLEAR_CTRL
1622  *   21 - XAPIC_DISABLE_STATUS
1623  *   23 - OVERCLOCKING_STATUS
1756 	u64 old_efer = vcpu->arch.efer;
1757 	u64 efer = msr_info->data;
1763 	if (!msr_info->host_initiated) {
1768 		    (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
1773 	efer |= vcpu->arch.efer & EFER_LMA;
1801 	struct kvm *kvm = vcpu->kvm;
1810 	idx = srcu_read_lock(&kvm->srcu);
1812 	msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
1818 	allowed = msr_filter->default_allow;
1819 	ranges = msr_filter->ranges;
1821 	for (i = 0; i < msr_filter->count; i++) {
1828 			allowed = test_bit(index - start, bitmap);
1834 	srcu_read_unlock(&kvm->srcu, idx);
1843  * Returns 0 on success, non-0 otherwise.
1864 		 * non-canonical address is written on Intel but not on
1865 		 * AMD (which ignores the top 32-bits, because it does
1866 		 * not implement 64-bit SYSENTER).
1868 		 * 64-bit code should hence be able to write a non-canonical
1870 		 * vmentry does not fail on Intel after writing a non-canonical
1872 		 * invokes 64-bit SYSENTER.
1891 		 * clear the bits.  This ensures cross-vendor migration will
1933 		/* All SSP MSRs except MSR_IA32_INT_SSP_TAB must be 4-byte aligned */
1962  * Returns 0 on success, non-0 otherwise.
2056 	if (!vcpu->run->msr.error) {
2057 		kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
2058 		kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
2064 	return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error);
2075 	return kvm_x86_call(complete_emulated_msr)(vcpu, vcpu->run->msr.error);
2086 	if (!vcpu->run->msr.error)
2087 		kvm_register_write(vcpu, vcpu->arch.cui_rdmsr_imm_reg,
2088 				   vcpu->run->msr.data);
2113 	if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
2116 	vcpu->run->exit_reason = exit_reason;
2117 	vcpu->run->msr.error = 0;
2118 	memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
2119 	vcpu->run->msr.reason = msr_reason;
2120 	vcpu->run->msr.index = index;
2121 	vcpu->run->msr.data = data;
2122 	vcpu->arch.complete_userspace_io = completion;
2139 			kvm_rax_write(vcpu, data & -1u);
2140 			kvm_rdx_write(vcpu, (data >> 32) & -1u);
2157 	return __kvm_emulate_rdmsr(vcpu, kvm_rcx_read(vcpu), -1,
2164 	vcpu->arch.cui_rdmsr_imm_reg = reg;
2237 	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS))
2240 	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT))
2243 		enabled = vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT;
2268 	return READ_ONCE(vcpu->mode) == EXITING_GUEST_MODE ||
2276 		if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) ||
2277 		    kvm_x2apic_icr_write_fast(vcpu->arch.apic, data))
2329 		return -EINVAL;
2361 	write_seqcount_begin(&vdata->seq);
2364 	vdata->clock.vclock_mode	= tk->tkr_mono.clock->vdso_clock_mode;
2365 	vdata->clock.cycle_last		= tk->tkr_mono.cycle_last;
2366 	vdata->clock.mask		= tk->tkr_mono.mask;
2367 	vdata->clock.mult		= tk->tkr_mono.mult;
2368 	vdata->clock.shift		= tk->tkr_mono.shift;
2369 	vdata->clock.base_cycles	= tk->tkr_mono.xtime_nsec;
2370 	vdata->clock.offset		= tk->tkr_mono.base;
2372 	vdata->raw_clock.vclock_mode	= tk->tkr_raw.clock->vdso_clock_mode;
2373 	vdata->raw_clock.cycle_last	= tk->tkr_raw.cycle_last;
2374 	vdata->raw_clock.mask		= tk->tkr_raw.mask;
2375 	vdata->raw_clock.mult		= tk->tkr_raw.mult;
2376 	vdata->raw_clock.shift		= tk->tkr_raw.shift;
2377 	vdata->raw_clock.base_cycles	= tk->tkr_raw.xtime_nsec;
2378 	vdata->raw_clock.offset		= tk->tkr_raw.base;
2380 	vdata->wall_time_sec            = tk->xtime_sec;
2382 	vdata->offs_boot		= tk->offs_boot;
2384 	write_seqcount_end(&vdata->seq);
2444 	struct kvm_arch *ka = &vcpu->kvm->arch;
2446 	if (vcpu->vcpu_id == 0 && !host_initiated) {
2447 		if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
2450 		ka->boot_vcpu_runs_old_kvmclock = old_msr;
2453 	vcpu->arch.time = system_time;
2458 		kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
2461 		kvm_gpc_deactivate(&vcpu->arch.pv_time);
2484 		shift--;
2529 			vcpu->arch.tsc_catchup = 1;
2530 			vcpu->arch.tsc_always_catchup = 1;
2534 			return -1;
2538 	/* TSC scaling required  - calculate ratio */
2543 		pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
2545 		return -1;
2561 		return -1;
2566 			   &vcpu->arch.virtual_tsc_shift,
2567 			   &vcpu->arch.virtual_tsc_mult);
2568 	vcpu->arch.virtual_tsc_khz = user_tsc_khz;
2576 	thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
2588 	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
2589 				      vcpu->arch.virtual_tsc_mult,
2590 				      vcpu->arch.virtual_tsc_shift);
2591 	tsc += vcpu->arch.this_tsc_write;
2596 static inline bool gtod_is_based_on_tsc(int mode)
2598 	return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
2605 	struct kvm_arch *ka = &vcpu->kvm->arch;
2613 	bool use_master_clock = (ka->nr_vcpus_matched_tsc + 1 ==
2614 				 atomic_read(&vcpu->kvm->online_vcpus)) &&
2615 				gtod_is_based_on_tsc(gtod->clock.vclock_mode);
2623 	if ((ka->use_master_clock && new_generation) ||
2624 	    (ka->use_master_clock != use_master_clock))
2627 	trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
2628 			    atomic_read(&vcpu->kvm->online_vcpus),
2629 		            ka->use_master_clock, gtod->clock.vclock_mode);
2636  * The most significant 64-N bits (mult) of ratio represent the
2639  * point number (mult + frac * 2^(-N)).
2662 	tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
2664 	return target_tsc - tsc;
2669 	return vcpu->arch.l1_tsc_offset +
2670 		kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
2701 	if (vcpu->arch.guest_tsc_protected)
2704 	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2705 				   vcpu->arch.l1_tsc_offset,
2708 	vcpu->arch.l1_tsc_offset = l1_offset;
2716 		vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
2721 		vcpu->arch.tsc_offset = l1_offset;
2728 	vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
2732 		vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
2736 		vcpu->arch.tsc_scaling_ratio = l1_multiplier;
2746 	 * TSC is marked unstable when we're running on Hyper-V,
2763 	struct kvm *kvm = vcpu->kvm;
2765 	lockdep_assert_held(&kvm->arch.tsc_write_lock);
2767 	if (vcpu->arch.guest_tsc_protected)
2771 		vcpu->kvm->arch.user_set_tsc = true;
2777 	kvm->arch.last_tsc_nsec = ns;
2778 	kvm->arch.last_tsc_write = tsc;
2779 	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
2780 	kvm->arch.last_tsc_offset = offset;
2782 	vcpu->arch.last_guest_tsc = tsc;
2794 		 * These values are tracked in kvm->arch.cur_xxx variables.
2796 		kvm->arch.cur_tsc_generation++;
2797 		kvm->arch.cur_tsc_nsec = ns;
2798 		kvm->arch.cur_tsc_write = tsc;
2799 		kvm->arch.cur_tsc_offset = offset;
2800 		kvm->arch.nr_vcpus_matched_tsc = 0;
2801 	} else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
2802 		kvm->arch.nr_vcpus_matched_tsc++;
2806 	vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
2807 	vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
2808 	vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
2816 	struct kvm *kvm = vcpu->kvm;
2822 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
2825 	elapsed = ns - kvm->arch.last_tsc_nsec;
2827 	if (vcpu->arch.virtual_tsc_khz) {
2834 		} else if (kvm->arch.user_set_tsc) {
2835 			u64 tsc_exp = kvm->arch.last_tsc_write +
2837 			u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
2839 			 * Here lies UAPI baggage: when a user-initiated TSC write has
2850 			 * come from the kernel's default vCPU creation. Make the 1-second
2866 	    vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
2868 			offset = kvm->arch.cur_tsc_offset;
2878 	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
2884 	u64 tsc_offset = vcpu->arch.l1_tsc_offset;
2890 	if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio)
2893 				   vcpu->arch.l1_tsc_scaling_ratio);
2920 			  int *mode)
2925 	switch (clock->vclock_mode) {
2930 			*mode = VDSO_CLOCKMODE_HVCLOCK;
2931 			v = (tsc_pg_val - clock->cycle_last) &
2932 				clock->mask;
2935 			*mode = VDSO_CLOCKMODE_NONE;
2939 		*mode = VDSO_CLOCKMODE_TSC;
2941 		v = (*tsc_timestamp - clock->cycle_last) &
2942 			clock->mask;
2945 		*mode = VDSO_CLOCKMODE_NONE;
2948 	if (*mode == VDSO_CLOCKMODE_NONE)
2951 	return v * clock->mult;
2956  * frequency of CLOCK_MONOTONIC_RAW (hence adding gtos->offs_boot).
2962 	int mode;
2966 		seq = read_seqcount_begin(&gtod->seq);
2967 		ns = gtod->raw_clock.base_cycles;
2968 		ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
2969 		ns >>= gtod->raw_clock.shift;
2970 		ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
2971 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2974 	return mode;
2985 	int mode;
2989 		seq = read_seqcount_begin(&gtod->seq);
2990 		ns = gtod->clock.base_cycles;
2991 		ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
2992 		ns >>= gtod->clock.shift;
2993 		ns += ktime_to_ns(gtod->clock.offset);
2994 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2997 	return mode;
3004 	int mode;
3008 		seq = read_seqcount_begin(&gtod->seq);
3009 		ts->tv_sec = gtod->wall_time_sec;
3010 		ns = gtod->clock.base_cycles;
3011 		ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
3012 		ns >>= gtod->clock.shift;
3013 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
3015 	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
3016 	ts->tv_nsec = ns;
3018 	return mode;
3072  * Each numbered line represents an event visible to both
3084  * 4. ret0 = timespec0 + (rdtsc - tsc0) |
3085  * 5.				        | ret1 = timespec1 + (rdtsc - tsc1)
3086  * 				        | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
3088  * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
3090  * 	- ret0 < ret1
3091  *	- timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
3093  *	- 0 < N - M => M < N
3112 	struct kvm_arch *ka = &kvm->arch;
3116 	lockdep_assert_held(&kvm->arch.tsc_write_lock);
3117 	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
3118 			atomic_read(&kvm->online_vcpus));
3125 					&ka->master_kernel_ns,
3126 					&ka->master_cycle_now);
3128 	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
3129 				&& !ka->backwards_tsc_observed
3130 				&& !ka->boot_vcpu_runs_old_kvmclock;
3132 	if (ka->use_master_clock)
3136 	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
3148 	raw_spin_lock_irq(&kvm->arch.tsc_write_lock);
3149 	write_seqcount_begin(&kvm->arch.pvclock_sc);
3162 	struct kvm_arch *ka = &kvm->arch;
3166 	write_seqcount_end(&ka->pvclock_sc);
3167 	raw_spin_unlock_irq(&ka->tsc_write_lock);
3186  * per-CPU value (which may be zero if a CPU is going offline).  Note, tsc_khz
3200 /* Called within read_seqcount_begin/retry for kvm->pvclock_sc.  */
3203 	struct kvm_arch *ka = &kvm->arch;
3209 	data->flags = 0;
3210 	if (ka->use_master_clock &&
3215 		if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
3216 			data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
3217 			data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
3220 		data->host_tsc = rdtsc();
3222 		data->flags |= KVM_CLOCK_TSC_STABLE;
3223 		hv_clock.tsc_timestamp = ka->master_cycle_now;
3224 		hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
3228 		data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
3230 		data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
3238 	struct kvm_arch *ka = &kvm->arch;
3242 		seq = read_seqcount_begin(&ka->pvclock_sc);
3244 	} while (read_seqcount_retry(&ka->pvclock_sc, seq));
3266 	read_lock_irqsave(&gpc->lock, flags);
3268 		read_unlock_irqrestore(&gpc->lock, flags);
3273 		read_lock_irqsave(&gpc->lock, flags);
3276 	guest_hv_clock = (void *)(gpc->khva + offset);
3285 	guest_hv_clock->version = hv_clock.version = (guest_hv_clock->version + 1) | 1;
3289 	hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
3295 	guest_hv_clock->version = ++hv_clock.version;
3298 	read_unlock_irqrestore(&gpc->lock, flags);
3300 	trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock);
3308 	struct kvm_vcpu_arch *vcpu = &v->arch;
3309 	struct kvm_arch *ka = &v->kvm->arch;
3322 		seq = read_seqcount_begin(&ka->pvclock_sc);
3323 		use_master_clock = ka->use_master_clock;
3325 			host_tsc = ka->master_cycle_now;
3326 			kernel_ns = ka->master_kernel_ns;
3328 	} while (read_seqcount_retry(&ka->pvclock_sc, seq));
3355 	if (vcpu->tsc_catchup) {
3358 			adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
3369 					    v->arch.l1_tsc_scaling_ratio);
3373 	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
3375 				   &vcpu->pvclock_tsc_shift,
3376 				   &vcpu->pvclock_tsc_mul);
3377 		vcpu->hw_tsc_khz = tgt_tsc_khz;
3380 	hv_clock.tsc_shift = vcpu->pvclock_tsc_shift;
3381 	hv_clock.tsc_to_system_mul = vcpu->pvclock_tsc_mul;
3383 	hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
3384 	vcpu->last_guest_tsc = tsc_timestamp;
3391 	if (vcpu->pv_time.active) {
3397 		if (vcpu->pvclock_set_guest_stopped_request) {
3399 			vcpu->pvclock_set_guest_stopped_request = false;
3401 		kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->pv_time, 0);
3406 	kvm_hv_setup_tsc_page(v->kvm, &hv_clock);
3417 	if (ka->xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
3420 	if (vcpu->xen.vcpu_info_cache.active)
3421 		kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_info_cache,
3423 	if (vcpu->xen.vcpu_time_info_cache.active)
3424 		kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0);
3452 	struct kvm_arch *ka = &kvm->arch;
3458 		seq = read_seqcount_begin(&ka->pvclock_sc);
3461 		if (!ka->use_master_clock)
3483 		hv_clock.tsc_timestamp = ka->master_cycle_now;
3484 		hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
3486 	} while (read_seqcount_retry(&ka->pvclock_sc, seq));
3492 	 * since 1970-01-01.
3498 		return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec -
3502 	return ktime_get_real_ns() - get_kvmclock_ns(kvm);
3507  * vcpu->cpu migration, should not allow system_timestamp from
3513  * We need to rate-limit these requests though, as they can
3516  * by the delay we use to rate-limit the updates.
3538 	struct kvm *kvm = v->kvm;
3541 	schedule_delayed_work(&kvm->arch.kvmclock_update_work,
3554 	schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
3555 	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
3576 		return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
3583 	u64 mcg_cap = vcpu->arch.mcg_cap;
3585 	u32 msr = msr_info->index;
3586 	u64 data = msr_info->data;
3591 		vcpu->arch.mcg_status = data;
3595 		    (data || !msr_info->host_initiated))
3599 		vcpu->arch.mcg_ctl = data;
3601 	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
3602 		last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
3606 		if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated))
3611 		offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
3612 					    last_msr + 1 - MSR_IA32_MC0_CTL2);
3613 		vcpu->arch.mci_ctl2_banks[offset] = data;
3615 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3616 		last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
3628 		 * single-bit ECC data errors.
3636 		 * AMD-based CPUs allow non-zero values, but if and only if
3639 		if (!msr_info->host_initiated && is_mci_status_msr(msr) &&
3643 		offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
3644 					    last_msr + 1 - MSR_IA32_MC0_CTL);
3645 		vcpu->arch.mce_banks[offset] = data;
3657 	return (vcpu->arch.apf.msr_en_val & mask) == mask;
3679 	vcpu->arch.apf.msr_en_val = data;
3687 	if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
3691 	vcpu->arch.apf.send_always = (data & KVM_ASYNC_PF_SEND_ALWAYS);
3692 	vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
3701 	/* Bits 8-63 are reserved */
3708 	vcpu->arch.apf.msr_int_val = data;
3710 	vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
3717 	kvm_gpc_deactivate(&vcpu->arch.pv_time);
3718 	vcpu->arch.time = 0;
3723 	++vcpu->stat.tlb_flush;
3732 	++vcpu->stat.tlb_flush;
3748 	 * Flushing all "guest" TLB is always a superset of Hyper-V's fine
3757 	++vcpu->stat.tlb_flush;
3765  * prior before nested VM-Enter/VM-Exit.
3779 	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
3782 	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
3786 	if (kvm_xen_msr_enabled(vcpu->kvm)) {
3791 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
3794 	if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
3797 	slots = kvm_memslots(vcpu->kvm);
3799 	if (unlikely(slots->generation != ghc->generation ||
3800 		     gpa != ghc->gpa ||
3801 		     kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
3803 		BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
3805 		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
3806 		    kvm_is_error_hva(ghc->hva) || !ghc->memslot)
3810 	st = (struct kvm_steal_time __user *)ghc->hva;
3817 		int err = -EFAULT;
3828 			       "+m" (st->preempted));
3834 		vcpu->arch.st.preempted = 0;
3836 		trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
3847 		unsafe_put_user(0, &st->preempted, out);
3848 		vcpu->arch.st.preempted = 0;
3851 	unsafe_get_user(version, &st->version, out);
3856 	unsafe_put_user(version, &st->version, out);
3860 	unsafe_get_user(steal, &st->steal, out);
3861 	steal += current->sched_info.run_delay -
3862 		vcpu->arch.st.last_steal;
3863 	vcpu->arch.st.last_steal = current->sched_info.run_delay;
3864 	unsafe_put_user(steal, &st->steal, out);
3867 	unsafe_put_user(version, &st->version, out);
3872 	mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
3879  * Because S_CET is loaded on VM-Enter and VM-Exit via dedicated VMCS fields,
3883  * PL[0-3]_SSP while executing in the kernel is safe, as U_CET is specific to
3884  * userspace, and PL[0-3]_SSP are only consumed when transitioning to lower
3904  * Lock (and if necessary, re-load) the guest FPU, i.e. XSTATE, and access an
3915 	KVM_BUG_ON(!is_xstate_managed_msr(vcpu, msr_info->index), vcpu->kvm);
3916 	KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm);
3920 		rdmsrq(msr_info->index, msr_info->data);
3922 		wrmsrq(msr_info->index, msr_info->data);
3938 	u32 msr = msr_info->index;
3939 	u64 data = msr_info->data;
3942 	 * Do not allow host-initiated writes to trigger the Xen hypercall
3946 	if (kvm_xen_is_hypercall_page_msr(vcpu->kvm, msr) &&
3947 	    !msr_info->host_initiated)
3962 		if (msr_info->host_initiated)
3963 			vcpu->arch.microcode_version = data;
3966 		if (!msr_info->host_initiated ||
3969 		vcpu->arch.arch_capabilities = data;
3972 		if (!msr_info->host_initiated ||
3984 		if (vcpu->arch.perf_capabilities == data)
3987 		vcpu->arch.perf_capabilities = data;
3993 		if (!msr_info->host_initiated) {
4021 		if (!msr_info->host_initiated &&
4048 		vcpu->arch.msr_hwcr = data;
4060 		vcpu->arch.pat = data;
4066 		return kvm_apic_set_base(vcpu, data, msr_info->host_initiated);
4074 			if (!msr_info->host_initiated) {
4075 				s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
4082 			vcpu->arch.ia32_tsc_adjust_msr = data;
4086 		u64 old_val = vcpu->arch.ia32_misc_enable_msr;
4088 		if (!msr_info->host_initiated) {
4098 		if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
4102 			vcpu->arch.ia32_misc_enable_msr = data;
4103 			vcpu->arch.cpuid_dynamic_bits_dirty = true;
4105 			vcpu->arch.ia32_misc_enable_msr = data;
4110 		if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
4112 		vcpu->arch.smbase = data;
4115 		vcpu->arch.msr_ia32_power_ctl = data;
4118 		if (msr_info->host_initiated) {
4120 		} else if (!vcpu->arch.guest_tsc_protected) {
4121 			u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
4123 			vcpu->arch.ia32_tsc_adjust_msr += adj;
4130 		if (data & ~vcpu->arch.guest_supported_xss)
4132 		if (vcpu->arch.ia32_xss == data)
4134 		vcpu->arch.ia32_xss = data;
4135 		vcpu->arch.cpuid_dynamic_bits_dirty = true;
4138 		if (!msr_info->host_initiated)
4140 		vcpu->arch.smi_count = data;
4146 		vcpu->kvm->arch.wall_clock = data;
4147 		kvm_write_wall_clock(vcpu->kvm, data, 0);
4153 		vcpu->kvm->arch.wall_clock = data;
4154 		kvm_write_wall_clock(vcpu->kvm, data, 0);
4160 		kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
4166 		kvm_write_system_time(vcpu, data, true,  msr_info->host_initiated);
4186 			vcpu->arch.apf.pageready_pending = false;
4200 		vcpu->arch.st.msr_val = data;
4221 		if (data & (-1ULL << 1))
4224 		vcpu->arch.msr_kvm_poll_control = data;
4229 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
4230 	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
4247 		 * all pre-dating SVM, but a recommended workaround from
4265 					     msr_info->host_initiated);
4268 		/* Drop writes to this legacy MSR -- see rdmsr
4276 		vcpu->arch.osvw.length = data;
4281 		vcpu->arch.osvw.status = data;
4284 		if (!msr_info->host_initiated)
4286 		vcpu->arch.msr_platform_info = data;
4293 		vcpu->arch.msr_misc_features_enables = data;
4297 		if (!msr_info->host_initiated &&
4304 		fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data);
4307 		if (!msr_info->host_initiated &&
4314 		vcpu->arch.guest_fpu.xfd_err = data;
4334 	u64 mcg_cap = vcpu->arch.mcg_cap;
4344 		data = vcpu->arch.mcg_cap;
4349 		data = vcpu->arch.mcg_ctl;
4352 		data = vcpu->arch.mcg_status;
4354 	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
4355 		last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
4361 		offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
4362 					    last_msr + 1 - MSR_IA32_MC0_CTL2);
4363 		data = vcpu->arch.mci_ctl2_banks[offset];
4365 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
4366 		last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
4370 		offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
4371 					    last_msr + 1 - MSR_IA32_MC0_CTL);
4372 		data = vcpu->arch.mce_banks[offset];
4383 	switch (msr_info->index) {
4406 	 * so for existing CPU-specific MSRs.
4413 		msr_info->data = 0;
4419 		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
4421 		msr_info->data = 0;
4424 		msr_info->data = vcpu->arch.microcode_version;
4429 		msr_info->data = vcpu->arch.arch_capabilities;
4434 		msr_info->data = vcpu->arch.perf_capabilities;
4437 		msr_info->data = vcpu->arch.msr_ia32_power_ctl;
4446 		 * return L1's TSC value to ensure backwards-compatible
4451 		if (msr_info->host_initiated) {
4452 			offset = vcpu->arch.l1_tsc_offset;
4453 			ratio = vcpu->arch.l1_tsc_scaling_ratio;
4455 			offset = vcpu->arch.tsc_offset;
4456 			ratio = vcpu->arch.tsc_scaling_ratio;
4459 		msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
4463 		msr_info->data = vcpu->arch.pat;
4468 		return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
4470 		msr_info->data = 3;
4484 		msr_info->data = 1 << 24;
4487 		msr_info->data = vcpu->arch.apic_base;
4490 		return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
4492 		msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
4495 		msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
4498 		msr_info->data = vcpu->arch.ia32_misc_enable_msr;
4501 		if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
4503 		msr_info->data = vcpu->arch.smbase;
4506 		msr_info->data = vcpu->arch.smi_count;
4510 		msr_info->data = 1000ULL;
4512 		msr_info->data |= (((uint64_t)4ULL) << 40);
4515 		msr_info->data = vcpu->arch.efer;
4521 		msr_info->data = vcpu->kvm->arch.wall_clock;
4527 		msr_info->data = vcpu->kvm->arch.wall_clock;
4533 		msr_info->data = vcpu->arch.time;
4539 		msr_info->data = vcpu->arch.time;
4545 		msr_info->data = vcpu->arch.apf.msr_en_val;
4551 		msr_info->data = vcpu->arch.apf.msr_int_val;
4557 		msr_info->data = 0;
4563 		msr_info->data = vcpu->arch.st.msr_val;
4569 		msr_info->data = vcpu->arch.pv_eoi.msr_val;
4575 		msr_info->data = vcpu->arch.msr_kvm_poll_control;
4582 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
4583 	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
4584 		return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
4585 				   msr_info->host_initiated);
4587 		if (!msr_info->host_initiated &&
4590 		msr_info->data = vcpu->arch.ia32_xss;
4594 		 * Provide expected ramp-up count for K7. All other
4602 		msr_info->data = 0x20000000;
4616 					     msr_info->index, &msr_info->data,
4617 					     msr_info->host_initiated);
4630 		msr_info->data = 0xbe702111;
4635 		msr_info->data = vcpu->arch.osvw.length;
4640 		msr_info->data = vcpu->arch.osvw.status;
4643 		if (!msr_info->host_initiated &&
4644 		    !vcpu->kvm->arch.guest_can_read_msr_platform_info)
4646 		msr_info->data = vcpu->arch.msr_platform_info;
4649 		msr_info->data = vcpu->arch.msr_misc_features_enables;
4652 		msr_info->data = vcpu->arch.msr_hwcr;
4656 		if (!msr_info->host_initiated &&
4660 		msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd;
4663 		if (!msr_info->host_initiated &&
4667 		msr_info->data = vcpu->arch.guest_fpu.xfd_err;
4675 		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
4697 	for (i = 0; i < msrs->nmsrs; ++i) {
4699 		 * If userspace is accessing one or more XSTATE-managed MSRs,
4732 	r = -EFAULT;
4736 	r = -E2BIG;
4741 	entries = memdup_user(user_msrs->entries, size);
4749 	if (writeback && copy_to_user(user_msrs->entries, entries, size))
4750 		r = -EFAULT;
4788 	r = -EFAULT;
4792 	r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries);
4796 	r = -EFAULT;
4811 	return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS;
4940 		 * so do not report SMM to be available if real mode is
4941 		 * emulated via vm86 mode.  Still, do not go to great lengths
4954 			r = kvm->max_vcpus;
4976 		r = kvm_x86_ops.nested_ops->get_state ?
4977 			kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
4984 		r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
5029 	if (attr->group) {
5031 			return kvm_x86_call(dev_get_attr)(attr->group, attr->attr, val);
5032 		return -ENXIO;
5035 	switch (attr->attr) {
5040 		return -ENXIO;
5046 	u64 __user *uaddr = u64_to_user_ptr(attr->addr);
5055 		return -EFAULT;
5079 		r = -EFAULT;
5086 		r = -E2BIG;
5089 		r = -EFAULT;
5090 		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
5093 		if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
5105 		r = -EFAULT;
5109 		r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
5114 		r = -EFAULT;
5121 		r = -EFAULT;
5132 		r = -EFAULT;
5139 		r = -E2BIG;
5142 		r = -EFAULT;
5143 		if (copy_to_user(user_msr_list->indices, &msr_based_features,
5159 		r = -EFAULT;
5167 		r = -EFAULT;
5174 		r = -EINVAL;
5183 	return kvm_arch_has_noncoherent_dma(vcpu->kvm);
5192 	vcpu->arch.l1tf_flush_l1d = true;
5194 	if (vcpu->scheduled_out && pmu->version && pmu->event_count) {
5195 		pmu->need_cleanup = true;
5202 			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
5203 		else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
5204 			wbinvd_on_cpu(vcpu->cpu);
5215 		 * is handled on the nested VM-Exit path.
5223 	vcpu->arch.host_pkru = read_pkru();
5226 	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
5227 		adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
5228 		vcpu->arch.tsc_offset_adjustment = 0;
5232 	if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
5233 		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
5234 				rdtsc() - vcpu->arch.last_host_tsc;
5240 						vcpu->arch.last_guest_tsc);
5242 			if (!vcpu->arch.guest_tsc_protected)
5243 				vcpu->arch.tsc_catchup = 1;
5251 		 * kvmclock on vcpu->cpu migration
5253 		if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
5255 		if (vcpu->cpu != cpu)
5257 		vcpu->cpu = cpu;
5265 	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
5269 	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
5272 	 * The vCPU can be marked preempted if and only if the VM-Exit was on
5276 	 * preempted if and only if the VM-Exit was due to a host interrupt.
5278 	if (!vcpu->arch.at_instruction_boundary) {
5279 		vcpu->stat.preemption_other++;
5283 	vcpu->stat.preemption_reported++;
5284 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
5287 	if (vcpu->arch.st.preempted)
5291 	if (unlikely(current->mm != vcpu->kvm->mm))
5294 	slots = kvm_memslots(vcpu->kvm);
5296 	if (unlikely(slots->generation != ghc->generation ||
5297 		     gpa != ghc->gpa ||
5298 		     kvm_is_error_hva(ghc->hva) || !ghc->memslot))
5301 	st = (struct kvm_steal_time __user *)ghc->hva;
5302 	BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
5304 	if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
5305 		vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
5307 	mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
5314 	if (vcpu->preempted) {
5316 		 * Assume protected guests are in-kernel.  Inefficient yielding
5320 		vcpu->arch.preempted_in_kernel = vcpu->arch.guest_state_protected ||
5327 		idx = srcu_read_lock(&vcpu->kvm->srcu);
5328 		if (kvm_xen_msr_enabled(vcpu->kvm))
5332 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
5336 	vcpu->arch.last_host_tsc = rdtsc();
5342 	if (vcpu->arch.apic->guest_apic_protected)
5343 		return -EINVAL;
5355 	if (vcpu->arch.apic->guest_apic_protected)
5356 		return -EINVAL;
5389 	 * instruction boundary and with no events half-injected.
5400 	if (irq->irq >= KVM_NR_INTERRUPTS)
5401 		return -EINVAL;
5403 	if (!irqchip_in_kernel(vcpu->kvm)) {
5404 		kvm_queue_interrupt(vcpu, irq->irq, false);
5410 	 * With in-kernel LAPIC, we only use this to inject EXTINT, so
5411 	 * fail for in-kernel 8259.
5413 	if (pic_in_kernel(vcpu->kvm))
5414 		return -ENXIO;
5416 	if (vcpu->arch.pending_external_vector != -1)
5417 		return -EEXIST;
5419 	vcpu->arch.pending_external_vector = irq->irq;
5434 	if (tac->flags)
5435 		return -EINVAL;
5436 	vcpu->arch.tpr_access_reporting = !!tac->enabled;
5446 	r = -EINVAL;
5452 	vcpu->arch.mcg_cap = mcg_cap;
5455 		vcpu->arch.mcg_ctl = ~(u64)0;
5458 		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
5460 			vcpu->arch.mci_ctl2_banks[bank] = 0;
5473  * - none of the bits for Machine Check Exceptions are set
5474  * - both the VAL (valid) and UC (uncorrectable) bits are set
5475  * MCI_STATUS_PCC - Processor Context Corrupted
5476  * MCI_STATUS_S - Signaled as a Machine Check Exception
5477  * MCI_STATUS_AR - Software recoverable Action Required
5481 	return	!mce->mcg_status &&
5482 		!(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) &&
5483 		(mce->status & MCI_STATUS_VAL) &&
5484 		(mce->status & MCI_STATUS_UC);
5489 	u64 mcg_cap = vcpu->arch.mcg_cap;
5491 	banks[1] = mce->status;
5492 	banks[2] = mce->addr;
5493 	banks[3] = mce->misc;
5494 	vcpu->arch.mcg_status = mce->mcg_status;
5497 	    !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN))
5501 		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
5509 	u64 mcg_cap = vcpu->arch.mcg_cap;
5511 	u64 *banks = vcpu->arch.mce_banks;
5513 	if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
5514 		return -EINVAL;
5516 	banks += array_index_nospec(4 * mce->bank, 4 * bank_num);
5525 	if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
5526 	    vcpu->arch.mcg_ctl != ~(u64)0)
5532 	if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
5534 	if (mce->status & MCI_STATUS_UC) {
5535 		if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
5541 			mce->status |= MCI_STATUS_OVER;
5542 		banks[2] = mce->addr;
5543 		banks[3] = mce->misc;
5544 		vcpu->arch.mcg_status = mce->mcg_status;
5545 		banks[1] = mce->status;
5550 			mce->status |= MCI_STATUS_OVER;
5551 		banks[2] = mce->addr;
5552 		banks[3] = mce->misc;
5553 		banks[1] = mce->status;
5574 	 * non-exiting _injected_ exception, and a pending exiting exception.
5575 	 * In that case, ignore the VM-Exiting exception as it's an extension
5578 	if (vcpu->arch.exception_vmexit.pending &&
5579 	    !vcpu->arch.exception.pending &&
5580 	    !vcpu->arch.exception.injected)
5581 		ex = &vcpu->arch.exception_vmexit;
5583 		ex = &vcpu->arch.exception;
5586 	 * In guest mode, payload delivery should be deferred if the exception
5588 	 * intercepts #PF, ditto for DR6 and #DBs.  If the per-VM capability,
5593 	if (!vcpu->kvm->arch.exception_payload_enabled &&
5594 	    ex->pending && ex->has_payload)
5605 	if (!kvm_exception_is_soft(ex->vector)) {
5606 		events->exception.injected = ex->injected;
5607 		events->exception.pending = ex->pending;
5613 		if (!vcpu->kvm->arch.exception_payload_enabled)
5614 			events->exception.injected |= ex->pending;
5616 	events->exception.nr = ex->vector;
5617 	events->exception.has_error_code = ex->has_error_code;
5618 	events->exception.error_code = ex->error_code;
5619 	events->exception_has_payload = ex->has_payload;
5620 	events->exception_payload = ex->payload;
5622 	events->interrupt.injected =
5623 		vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
5624 	events->interrupt.nr = vcpu->arch.interrupt.nr;
5625 	events->interrupt.shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
5627 	events->nmi.injected = vcpu->arch.nmi_injected;
5628 	events->nmi.pending = kvm_get_nr_pending_nmis(vcpu);
5629 	events->nmi.masked = kvm_x86_call(get_nmi_mask)(vcpu);
5631 	/* events->sipi_vector is never valid when reporting to user space */
5634 	events->smi.smm = is_smm(vcpu);
5635 	events->smi.pending = vcpu->arch.smi_pending;
5636 	events->smi.smm_inside_nmi =
5637 		!!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
5639 	events->smi.latched_init = kvm_lapic_latched_init(vcpu);
5641 	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
5644 	if (vcpu->kvm->arch.exception_payload_enabled)
5645 		events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
5646 	if (vcpu->kvm->arch.triple_fault_event) {
5647 		events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5648 		events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
5655 	if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
5661 		return -EINVAL;
5663 	if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
5664 		if (!vcpu->kvm->arch.exception_payload_enabled)
5665 			return -EINVAL;
5666 		if (events->exception.pending)
5667 			events->exception.injected = 0;
5669 			events->exception_has_payload = 0;
5671 		events->exception.pending = 0;
5672 		events->exception_has_payload = 0;
5675 	if ((events->exception.injected || events->exception.pending) &&
5676 	    (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
5677 		return -EINVAL;
5683 	 * morph the exception to a VM-Exit if appropriate.  Do this only for
5684 	 * pending exceptions, already-injected exceptions are not subject to
5687 	 * pending exception, which in turn may cause a spurious VM-Exit.
5689 	vcpu->arch.exception_from_userspace = events->exception.pending;
5691 	vcpu->arch.exception_vmexit.pending = false;
5693 	vcpu->arch.exception.injected = events->exception.injected;
5694 	vcpu->arch.exception.pending = events->exception.pending;
5695 	vcpu->arch.exception.vector = events->exception.nr;
5696 	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
5697 	vcpu->arch.exception.error_code = events->exception.error_code;
5698 	vcpu->arch.exception.has_payload = events->exception_has_payload;
5699 	vcpu->arch.exception.payload = events->exception_payload;
5701 	vcpu->arch.interrupt.injected = events->interrupt.injected;
5702 	vcpu->arch.interrupt.nr = events->interrupt.nr;
5703 	vcpu->arch.interrupt.soft = events->interrupt.soft;
5704 	if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
5706 						   events->interrupt.shadow);
5708 	vcpu->arch.nmi_injected = events->nmi.injected;
5709 	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
5710 		vcpu->arch.nmi_pending = 0;
5711 		atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
5712 		if (events->nmi.pending)
5715 	kvm_x86_call(set_nmi_mask)(vcpu, events->nmi.masked);
5717 	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
5719 		vcpu->arch.apic->sipi_vector = events->sipi_vector;
5721 	if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
5723 		if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
5725 			kvm_smm_changed(vcpu, events->smi.smm);
5728 		vcpu->arch.smi_pending = events->smi.pending;
5730 		if (events->smi.smm) {
5731 			if (events->smi.smm_inside_nmi)
5732 				vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
5734 				vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
5738 		if (events->smi.smm || events->smi.pending ||
5739 		    events->smi.smm_inside_nmi)
5740 			return -EINVAL;
5744 			if (events->smi.latched_init)
5745 				set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
5747 				clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
5751 	if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
5752 		if (!vcpu->kvm->arch.triple_fault_event)
5753 			return -EINVAL;
5754 		if (events->triple_fault.pending)
5770 	if (vcpu->kvm->arch.has_protected_state &&
5771 	    vcpu->arch.guest_state_protected)
5772 		return -EINVAL;
5776 	BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
5777 	for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
5778 		dbgregs->db[i] = vcpu->arch.db[i];
5780 	dbgregs->dr6 = vcpu->arch.dr6;
5781 	dbgregs->dr7 = vcpu->arch.dr7;
5790 	if (vcpu->kvm->arch.has_protected_state &&
5791 	    vcpu->arch.guest_state_protected)
5792 		return -EINVAL;
5794 	if (dbgregs->flags)
5795 		return -EINVAL;
5797 	if (!kvm_dr6_valid(dbgregs->dr6))
5798 		return -EINVAL;
5799 	if (!kvm_dr7_valid(dbgregs->dr7))
5800 		return -EINVAL;
5802 	for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
5803 		vcpu->arch.db[i] = dbgregs->db[i];
5806 	vcpu->arch.dr6 = dbgregs->dr6;
5807 	vcpu->arch.dr7 = dbgregs->dr7;
5829 	u64 supported_xcr0 = vcpu->arch.guest_supported_xcr0 |
5832 	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
5833 		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
5835 	fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size,
5836 				       supported_xcr0, vcpu->arch.pkru);
5843 	return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
5844 					     sizeof(guest_xsave->region));
5850 	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
5851 		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
5853 	return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
5854 					      guest_xsave->region,
5856 					      &vcpu->arch.pkru);
5862 	if (vcpu->kvm->arch.has_protected_state &&
5863 	    vcpu->arch.guest_state_protected)
5864 		return -EINVAL;
5867 		guest_xcrs->nr_xcrs = 0;
5871 	guest_xcrs->nr_xcrs = 1;
5872 	guest_xcrs->flags = 0;
5873 	guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
5874 	guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
5883 	if (vcpu->kvm->arch.has_protected_state &&
5884 	    vcpu->arch.guest_state_protected)
5885 		return -EINVAL;
5888 		return -EINVAL;
5890 	if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
5891 		return -EINVAL;
5893 	for (i = 0; i < guest_xcrs->nr_xcrs; i++)
5895 		if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
5897 				guest_xcrs->xcrs[i].value);
5901 		r = -EINVAL;
5913 	if (!vcpu->arch.pv_time.active)
5914 		return -EINVAL;
5915 	vcpu->arch.pvclock_set_guest_stopped_request = true;
5925 	switch (attr->attr) {
5930 		r = -ENXIO;
5939 	u64 __user *uaddr = u64_to_user_ptr(attr->addr);
5942 	switch (attr->attr) {
5944 		r = -EFAULT;
5945 		if (put_user(vcpu->arch.l1_tsc_offset, uaddr))
5950 		r = -ENXIO;
5959 	u64 __user *uaddr = u64_to_user_ptr(attr->addr);
5960 	struct kvm *kvm = vcpu->kvm;
5963 	switch (attr->attr) {
5969 		r = -EFAULT;
5973 		raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
5975 		matched = (vcpu->arch.virtual_tsc_khz &&
5976 			   kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
5977 			   kvm->arch.last_tsc_offset == offset);
5979 		tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
5983 		raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
5989 		r = -ENXIO;
6003 		return -EFAULT;
6006 		return -ENXIO;
6026 	if (cap->flags)
6027 		return -EINVAL;
6029 	switch (cap->cap) {
6032 		if (cap->args[0])
6033 			return -EINVAL;
6037 		if (!irqchip_in_kernel(vcpu->kvm))
6038 			return -EINVAL;
6039 		return kvm_hv_activate_synic(vcpu, cap->cap ==
6047 			if (!kvm_x86_ops.nested_ops->enable_evmcs)
6048 				return -ENOTTY;
6049 			r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
6051 				user_ptr = (void __user *)(uintptr_t)cap->args[0];
6054 					r = -EFAULT;
6060 			return -ENOTTY;
6065 		return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
6069 		vcpu->arch.pv_cpuid.enforce = cap->args[0];
6072 		return -EINVAL;
6088 	switch (reg->index) {
6091 		 * FIXME: If host-initiated accesses are ever exempted from
6097 			return -EINVAL;
6099 		reg->type = KVM_X86_REG_TYPE_MSR;
6100 		reg->index = MSR_KVM_INTERNAL_GUEST_SSP;
6103 		return -EINVAL;
6113 		return -EINVAL;
6116 		return -EFAULT;
6126 		return -EFAULT;
6129 		return -EINVAL;
6144 		return -EFAULT;
6147 		return -EINVAL;
6150 	if (reg->rsvd1 || reg->rsvd2)
6151 		return -EINVAL;
6153 	if (reg->type == KVM_X86_REG_TYPE_KVM) {
6159 	if (reg->type != KVM_X86_REG_TYPE_MSR)
6160 		return -EINVAL;
6163 		return -EINVAL;
6165 	guard(srcu)(&vcpu->kvm->srcu);
6167 	load_fpu = is_xstate_managed_msr(vcpu, reg->index);
6173 		r = kvm_get_one_msr(vcpu, reg->index, user_val);
6175 		r = kvm_set_one_msr(vcpu, reg->index, user_val);
6188 	if (get_user(user_nr_regs, &user_list->n))
6189 		return -EFAULT;
6191 	if (put_user(nr_regs, &user_list->n))
6192 		return -EFAULT;
6195 		return -E2BIG;
6198 	    put_user(KVM_X86_REG_KVM(KVM_REG_GUEST_SSP), &user_list->reg[0]))
6199 		return -EFAULT;
6207 	struct kvm_vcpu *vcpu = filp->private_data;
6223 		r = -EINVAL;
6228 		r = -ENOMEM;
6234 		r = -EFAULT;
6241 		r = -EINVAL;
6256 		r = -EFAULT;
6274 		r = -EFAULT;
6277 		r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
6284 		r = -EFAULT;
6288 					      cpuid_arg->entries);
6295 		r = -EFAULT;
6299 					      cpuid_arg->entries);
6302 		r = -EFAULT;
6309 		int idx = srcu_read_lock(&vcpu->kvm->srcu);
6311 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
6315 		int idx = srcu_read_lock(&vcpu->kvm->srcu);
6317 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
6330 		r = -EFAULT;
6336 		r = -EFAULT;
6346 		r = -EINVAL;
6349 		r = -EFAULT;
6352 		idx = srcu_read_lock(&vcpu->kvm->srcu);
6354 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
6360 		r = -EFAULT;
6369 		r = -EFAULT;
6380 		r = -EFAULT;
6389 		r = -EFAULT;
6405 		r = -EFAULT;
6415 		r = -EFAULT;
6424 		r = -EINVAL;
6425 		if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
6429 		r = -ENOMEM;
6437 		r = -EFAULT;
6444 		int size = vcpu->arch.guest_fpu.uabi_size;
6457 		int size = vcpu->arch.guest_fpu.uabi_size;
6460 		r = -ENOMEM;
6468 		r = -EFAULT;
6478 		r = -ENOMEM;
6486 		r = -EFAULT;
6506 		r = -EINVAL;
6508 		if (vcpu->arch.guest_tsc_protected)
6526 		r = vcpu->arch.virtual_tsc_khz;
6536 		r = -EFAULT;
6546 		r = -EINVAL;
6547 		if (!kvm_x86_ops.nested_ops->get_state)
6550 		BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
6551 		r = -EFAULT;
6552 		if (get_user(user_data_size, &user_kvm_nested_state->size))
6555 		r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
6561 			if (put_user(r, &user_kvm_nested_state->size))
6562 				r = -EFAULT;
6564 				r = -E2BIG;
6576 		r = -EINVAL;
6577 		if (!kvm_x86_ops.nested_ops->set_state)
6580 		r = -EFAULT;
6584 		r = -EINVAL;
6599 		idx = srcu_read_lock(&vcpu->kvm->srcu);
6600 		r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
6601 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
6613 		r = -EFAULT;
6618 			r = -EFAULT;
6624 		r = -EFAULT;
6632 		r = -EINVAL;
6633 		if (vcpu->kvm->arch.has_protected_state &&
6634 		    vcpu->arch.guest_state_protected)
6638 		r = -ENOMEM;
6642 		r = -EFAULT;
6649 		r = -EINVAL;
6650 		if (vcpu->kvm->arch.has_protected_state &&
6651 		    vcpu->arch.guest_state_protected)
6669 		r = -ENOTTY;
6675 		r = -EINVAL;
6693 	if (addr > (unsigned int)(-3 * PAGE_SIZE))
6694 		return -EINVAL;
6709 		return -EINVAL;
6711 	mutex_lock(&kvm->slots_lock);
6714 	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
6716 	mutex_unlock(&kvm->slots_lock);
6726 	 * on all VM-Exits, thus we only need to kick running vCPUs to force a
6727 	 * VM-Exit.
6732 	if (!kvm->arch.cpu_dirty_log_size)
6744 	if (cap->flags)
6745 		return -EINVAL;
6747 	switch (cap->cap) {
6749 		r = -EINVAL;
6750 		if (cap->args[0] & ~kvm_caps.supported_quirks)
6754 		kvm->arch.disabled_quirks |= cap->args[0] & kvm_caps.supported_quirks;
6758 		mutex_lock(&kvm->lock);
6759 		r = -EINVAL;
6760 		if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
6762 		r = -EEXIST;
6765 		if (kvm->created_vcpus)
6769 		kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
6770 		kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
6774 		mutex_unlock(&kvm->lock);
6778 		r = -EINVAL;
6779 		if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
6782 		if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
6783 			kvm->arch.x2apic_format = true;
6784 		if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
6785 			kvm->arch.x2apic_broadcast_quirk_disabled = true;
6790 		r = -EINVAL;
6791 		if (cap->args[0] & ~kvm_get_allowed_disable_exits())
6794 		mutex_lock(&kvm->lock);
6795 		if (kvm->created_vcpus)
6798 #define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \
6803 		    (cap->args[0] & ~(KVM_X86_DISABLE_EXITS_PAUSE |
6807 		kvm_disable_exits(kvm, cap->args[0]);
6810 		mutex_unlock(&kvm->lock);
6813 		kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
6817 		kvm->arch.exception_payload_enabled = cap->args[0];
6821 		kvm->arch.triple_fault_event = cap->args[0];
6825 		r = -EINVAL;
6826 		if (cap->args[0] & ~KVM_MSR_EXIT_REASON_VALID_MASK)
6828 		kvm->arch.user_space_msr_mask = cap->args[0];
6832 		r = -EINVAL;
6833 		if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE)
6836 		if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) &&
6837 		    (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT))
6841 		    cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)
6842 			kvm->arch.bus_lock_detection_enabled = true;
6849 		r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
6856 			kvm->arch.sgx_provisioning_allowed = true;
6858 			r = -EINVAL;
6863 		r = -EINVAL;
6867 		r = kvm_x86_call(vm_copy_enc_context_from)(kvm, cap->args[0]);
6870 		r = -EINVAL;
6874 		r = kvm_x86_call(vm_move_enc_context_from)(kvm, cap->args[0]);
6877 		if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
6878 			r = -EINVAL;
6881 		kvm->arch.hypercall_exit_enabled = cap->args[0];
6885 		r = -EINVAL;
6886 		if (cap->args[0] & ~1)
6888 		kvm->arch.exit_on_emulation_error = cap->args[0];
6892 		r = -EINVAL;
6893 		if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK))
6896 		mutex_lock(&kvm->lock);
6897 		if (!kvm->created_vcpus) {
6898 			kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE);
6901 		mutex_unlock(&kvm->lock);
6904 		r = -EINVAL;
6905 		if (cap->args[0] > KVM_MAX_VCPU_IDS)
6908 		mutex_lock(&kvm->lock);
6909 		if (kvm->arch.bsp_vcpu_id > cap->args[0]) {
6911 		} else if (kvm->arch.max_vcpu_ids == cap->args[0]) {
6913 		} else if (!kvm->arch.max_vcpu_ids) {
6914 			kvm->arch.max_vcpu_ids = cap->args[0];
6917 		mutex_unlock(&kvm->lock);
6920 		r = -EINVAL;
6921 		if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS)
6925 		if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED))
6927 		mutex_lock(&kvm->lock);
6928 		if (!kvm->created_vcpus) {
6929 			kvm->arch.notify_window = cap->args[0] >> 32;
6930 			kvm->arch.notify_vmexit_flags = (u32)cap->args[0];
6933 		mutex_unlock(&kvm->lock);
6936 		r = -EINVAL;
6950 			r = -EPERM;
6954 		if (cap->args[0])
6957 		mutex_lock(&kvm->lock);
6958 		if (!kvm->created_vcpus) {
6959 			kvm->arch.disable_nx_huge_pages = true;
6962 		mutex_unlock(&kvm->lock);
6965 		u64 bus_cycle_ns = cap->args[0];
6972 		r = -EINVAL;
6978 		mutex_lock(&kvm->lock);
6980 			r = -ENXIO;
6981 		else if (kvm->created_vcpus)
6982 			r = -EINVAL;
6984 			kvm->arch.apic_bus_cycle_ns = bus_cycle_ns;
6985 		mutex_unlock(&kvm->lock);
6989 		r = -EINVAL;
7003 	msr_filter->default_allow = default_allow;
7014 	for (i = 0; i < msr_filter->count; i++)
7015 		kfree(msr_filter->ranges[i].bitmap);
7026 	if (!user_range->nmsrs)
7029 	if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK)
7030 		return -EINVAL;
7032 	if (!user_range->flags)
7033 		return -EINVAL;
7035 	bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
7037 		return -EINVAL;
7039 	bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
7043 	msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
7044 		.flags = user_range->flags,
7045 		.base = user_range->base,
7046 		.nmsrs = user_range->nmsrs,
7050 	msr_filter->count++;
7063 	if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK)
7064 		return -EINVAL;
7066 	for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
7067 		empty &= !filter->ranges[i].nmsrs;
7069 	default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY);
7071 		return -EINVAL;
7075 		return -ENOMEM;
7077 	for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) {
7078 		r = kvm_add_msr_filter(new_filter, &filter->ranges[i]);
7085 	mutex_lock(&kvm->lock);
7086 	old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter,
7087 					 mutex_is_locked(&kvm->lock));
7088 	mutex_unlock(&kvm->lock);
7089 	synchronize_srcu(&kvm->srcu);
7122 	struct kvm *kvm = filp->private_data;
7123 	long r = -ENOTTY;
7134 			return -EFAULT;
7142 				.flags = cr->flags,
7143 				.nmsrs = cr->nmsrs,
7144 				.base = cr->base,
7145 				.bitmap = (__u8 *)(ulong)cr->bitmap,
7192 		return -EFAULT;
7199 	struct kvm_arch *ka = &kvm->arch;
7204 		return -EFAULT;
7211 		return -EINVAL;
7231 			data.clock += now_real_ns - data.realtime;
7234 	if (ka->use_master_clock)
7235 		now_raw_ns = ka->master_kernel_ns;
7238 	ka->kvmclock_offset = data.clock - now_raw_ns;
7245 	struct kvm *kvm = filp->private_data;
7247 	int r = -ENOTTY;
7251 	 * This union makes it completely explicit to gcc-3.x
7269 		mutex_lock(&kvm->lock);
7270 		r = -EINVAL;
7271 		if (kvm->created_vcpus)
7273 		r = -EFAULT;
7278 		mutex_unlock(&kvm->lock);
7286 		mutex_lock(&kvm->lock);
7288 		r = -EEXIST;
7293 		 * Disallow an in-kernel I/O APIC if the VM has protected EOIs,
7295 		 * emulate level-triggered interrupts.
7297 		r = -ENOTTY;
7298 		if (kvm->arch.has_protected_eoi)
7301 		r = -EINVAL;
7302 		if (kvm->created_vcpus)
7321 		/* Write kvm->irq_routing before enabling irqchip_in_kernel. */
7323 		kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
7326 		mutex_unlock(&kvm->lock);
7333 		r = -EFAULT;
7338 		mutex_lock(&kvm->lock);
7339 		r = -EEXIST;
7340 		if (kvm->arch.vpit)
7342 		r = -ENOENT;
7345 		r = -ENOMEM;
7346 		kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
7347 		if (kvm->arch.vpit)
7350 		mutex_unlock(&kvm->lock);
7362 		r = -ENXIO;
7368 		r = -EFAULT;
7386 		r = -ENXIO;
7395 		r = -EFAULT;
7398 		r = -ENXIO;
7399 		if (!kvm->arch.vpit)
7404 		r = -EFAULT;
7411 		r = -EFAULT;
7414 		mutex_lock(&kvm->lock);
7415 		r = -ENXIO;
7416 		if (!kvm->arch.vpit)
7420 		mutex_unlock(&kvm->lock);
7424 		r = -ENXIO;
7425 		if (!kvm->arch.vpit)
7430 		r = -EFAULT;
7437 		r = -EFAULT;
7440 		mutex_lock(&kvm->lock);
7441 		r = -ENXIO;
7442 		if (!kvm->arch.vpit)
7446 		mutex_unlock(&kvm->lock);
7451 		r =  -EFAULT;
7454 		r = -ENXIO;
7455 		if (!kvm->arch.vpit)
7463 		mutex_lock(&kvm->lock);
7464 		if (kvm->created_vcpus)
7465 			r = -EBUSY;
7467 			 (kvm->arch.max_vcpu_ids && arg > kvm->arch.max_vcpu_ids))
7468 			r = -EINVAL;
7470 			kvm->arch.bsp_vcpu_id = arg;
7471 		mutex_unlock(&kvm->lock);
7476 		r = -EFAULT;
7485 		r = -EFAULT;
7490 			r = -EFAULT;
7496 		r = -EFAULT;
7505 		r = -EFAULT;
7521 		r = -EINVAL;
7531 		mutex_lock(&kvm->lock);
7532 		if (!kvm->created_vcpus) {
7533 			WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
7536 		mutex_unlock(&kvm->lock);
7540 		r = READ_ONCE(kvm->arch.default_tsc_khz);
7544 		r = -ENOTTY;
7553 		r = -EFAULT;
7557 		r = -ENOTTY;
7567 		r = -EFAULT;
7571 		r = -ENOTTY;
7582 		r = -EFAULT;
7597 			return -EFAULT;
7603 		r = -ENOTTY;
7663 		    (msr_index - MSR_IA32_RTIT_ADDR0_A >=
7668 	     MSR_ARCH_PERFMON_PERFCTR0 + KVM_MAX_NR_GP_COUNTERS - 1:
7669 		if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >=
7674 	     MSR_ARCH_PERFMON_EVENTSEL0 + KVM_MAX_NR_GP_COUNTERS - 1:
7675 		if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >=
7680 	     MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_MAX_NR_FIXED_COUNTERS - 1:
7681 		if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >=
7769 		      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
7774 		len -= n;
7789 		      !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
7796 		len -= n;
7818 	struct kvm_mmu *mmu = vcpu->arch.mmu;
7823 	/* NPT walks are always user-walks */
7825 	t_gpa  = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
7833 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7836 	return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
7843 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7847 	return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
7855 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7857 	return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception);
7864 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7869 		gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
7870 		unsigned offset = addr & (PAGE_SIZE-1);
7871 		unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
7883 		bytes -= toread;
7897 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7903 	gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK,
7908 	offset = addr & (PAGE_SIZE-1);
7910 		bytes = (unsigned)PAGE_SIZE - offset;
7956 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7961 		gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
7962 		unsigned offset = addr & (PAGE_SIZE-1);
7963 		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
7974 		bytes -= towrite;
8002 	vcpu->arch.l1tf_flush_l1d = true;
8062 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
8072 	    !permission_fault(vcpu, vcpu->arch.walk_mmu,
8073 			      vcpu->arch.mmio_access, 0, access))) {
8074 		*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
8075 					(gva & (PAGE_SIZE - 1));
8080 	*gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
8083 		return -1;
8114 	if (vcpu->mmio_read_completed) {
8116 			       vcpu->mmio_fragments[0].gpa, val);
8117 		vcpu->mmio_read_completed = 0;
8152 	struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
8154 	memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
8180 	bool write = ops->write;
8182 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8191 	if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
8192 	    (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
8193 		gpa = ctxt->gpa_val;
8201 	if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
8207 	handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
8212 	bytes -= handled;
8215 	WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
8216 	frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
8217 	frag->gpa = gpa;
8218 	frag->data = val;
8219 	frag->len = bytes;
8233 	if (ops->read_write_prepare &&
8234 		  ops->read_write_prepare(vcpu, val, bytes))
8237 	vcpu->mmio_nr_fragments = 0;
8240 	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
8243 		now = -addr & ~PAGE_MASK;
8250 		if (ctxt->mode != X86EMUL_MODE_PROT64)
8253 		bytes -= now;
8261 	if (!vcpu->mmio_nr_fragments)
8264 	gpa = vcpu->mmio_fragments[0].gpa;
8266 	vcpu->mmio_needed = 1;
8267 	vcpu->mmio_cur_fragment = 0;
8269 	vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
8270 	vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
8271 	vcpu->run->exit_reason = KVM_EXIT_MMIO;
8272 	vcpu->run->mmio.phys_addr = gpa;
8274 	return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
8314 	if (bytes > 8 || (bytes & (bytes - 1)))
8328 		page_line_mask = ~(cache_line_size() - 1);
8332 	if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
8390 	WARN_ON_ONCE(vcpu->arch.pio.count);
8406 				memset(data, 0, size * (count - i));
8415 	vcpu->arch.pio.port = port;
8416 	vcpu->arch.pio.in = in;
8417 	vcpu->arch.pio.count = count;
8418 	vcpu->arch.pio.size = size;
8421 		memset(vcpu->arch.pio_data, 0, size * count);
8423 		memcpy(vcpu->arch.pio_data, data, size * count);
8425 	vcpu->run->exit_reason = KVM_EXIT_IO;
8426 	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
8427 	vcpu->run->io.size = size;
8428 	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
8429 	vcpu->run->io.count = count;
8430 	vcpu->run->io.port = port;
8446 	int size = vcpu->arch.pio.size;
8447 	unsigned int count = vcpu->arch.pio.count;
8448 	memcpy(val, vcpu->arch.pio_data, size * count);
8449 	trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data);
8450 	vcpu->arch.pio.count = 0;
8458 	if (vcpu->arch.pio.count) {
8506 		cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
8507 		wbinvd_on_cpus_mask(vcpu->arch.wbinvd_dirty_mask);
8509 		cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
8543 	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
8556 		value = vcpu->arch.cr2;
8585 		vcpu->arch.cr2 = val;
8598 		res = -1;
8659 	desc->type = var.type;
8660 	desc->s = var.s;
8661 	desc->dpl = var.dpl;
8662 	desc->p = var.present;
8663 	desc->avl = var.avl;
8664 	desc->l = var.l;
8665 	desc->d = var.db;
8666 	desc->g = var.g;
8684 	if (desc->g)
8686 	var.type = desc->type;
8687 	var.dpl = desc->dpl;
8688 	var.db = desc->d;
8689 	var.s = desc->s;
8690 	var.l = desc->l;
8691 	var.g = desc->g;
8692 	var.avl = desc->avl;
8693 	var.present = desc->p;
8751 	 * Treat emulator accesses to the current shadow stack pointer as host-
8754 	 * so the index is fully KVM-controlled.
8775 	emul_to_vcpu(ctxt)->arch.halt_request = 1;
8783 					     &ctxt->exception);
8853 	struct kvm *kvm = emul_to_vcpu(ctxt)->kvm;
8855 	if (!kvm->vm_bugged)
8944 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8946 	if (ctxt->exception.vector == PF_VECTOR)
8947 		kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
8948 	else if (ctxt->exception.error_code_valid)
8949 		kvm_queue_exception_e(vcpu, ctxt->exception.vector,
8950 				      ctxt->exception.error_code);
8952 		kvm_queue_exception(vcpu, ctxt->exception.vector);
8965 	ctxt->vcpu = vcpu;
8966 	ctxt->ops = &emulate_ops;
8967 	vcpu->arch.emulate_ctxt = ctxt;
8974 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8979 	ctxt->gpa_available = false;
8980 	ctxt->eflags = kvm_get_rflags(vcpu);
8981 	ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
8983 	ctxt->eip = kvm_rip_read(vcpu);
8984 	ctxt->mode = (!is_protmode(vcpu))		? X86EMUL_MODE_REAL :
8985 		     (ctxt->eflags & X86_EFLAGS_VM)	? X86EMUL_MODE_VM86 :
8989 	ctxt->interruptibility = 0;
8990 	ctxt->have_exception = false;
8991 	ctxt->exception.vector = -1;
8992 	ctxt->perm_ok = false;
8995 	vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
9000 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
9005 	ctxt->op_bytes = 2;
9006 	ctxt->ad_bytes = 2;
9007 	ctxt->_eip = ctxt->eip + inc_eip;
9013 		ctxt->eip = ctxt->_eip;
9014 		kvm_rip_write(vcpu, ctxt->eip);
9015 		kvm_set_rflags(vcpu, ctxt->eflags);
9023 	struct kvm_run *run = vcpu->run;
9036 	run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
9037 	run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
9049 	run->emulation_failure.flags = 0;
9052 		BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) +
9053 			      sizeof(run->emulation_failure.insn_bytes) != 16));
9055 		run->emulation_failure.flags |=
9057 		run->emulation_failure.insn_size = insn_size;
9058 		memset(run->emulation_failure.insn_bytes, 0x90,
9059 		       sizeof(run->emulation_failure.insn_bytes));
9060 		memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
9063 	memcpy(&run->internal.data[info_start], info, sizeof(info));
9064 	memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data,
9067 	run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata;
9072 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
9074 	prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data,
9075 				       ctxt->fetch.end - ctxt->fetch.data);
9094 	struct kvm_run *run = vcpu->run;
9101 	run->internal.data[ndata++] = info2;
9102 	run->internal.data[ndata++] = reason;
9103 	run->internal.data[ndata++] = info1;
9104 	run->internal.data[ndata++] = gpa;
9105 	run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
9107 	run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
9108 	run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
9109 	run->internal.ndata = ndata;
9115 	struct kvm *kvm = vcpu->kvm;
9117 	++vcpu->stat.insn_emulation_fail;
9125 	if (kvm->arch.exit_on_emulation_error ||
9154 	 * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and
9163 	 * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
9164 	 * guest to let the CPU re-execute the instruction in the hope that the
9199 	struct kvm_run *kvm_run = vcpu->run;
9201 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
9202 		kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW;
9203 		kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
9204 		kvm_run->debug.arch.exception = DB_VECTOR;
9205 		kvm_run->exit_reason = KVM_EXIT_DEBUG;
9275 	if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
9276 	    (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
9277 		struct kvm_run *kvm_run = vcpu->run;
9280 					   vcpu->arch.guest_debug_dr7,
9281 					   vcpu->arch.eff_db);
9284 			kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
9285 			kvm_run->debug.arch.pc = eip;
9286 			kvm_run->debug.arch.exception = DB_VECTOR;
9287 			kvm_run->exit_reason = KVM_EXIT_DEBUG;
9293 	if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
9297 					   vcpu->arch.dr7,
9298 					   vcpu->arch.db);
9312 	switch (ctxt->opcode_len) {
9314 		switch (ctxt->b) {
9331 		switch (ctxt->b) {
9344  * (and wrong) when emulating on an intercepted fault-like exception[*], as
9354 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
9362 	++vcpu->stat.insn_emulation;
9372 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
9398 	vcpu->arch.l1tf_flush_l1d = true;
9405 		 * are fault-like and are higher priority than any faults on
9423 			if (ctxt->have_exception &&
9426 				 * #UD should result in just EMULATION_FAILED, and trap-like
9429 				WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
9430 					     exception_type(ctxt->exception.vector) == EXCPT_TRAP);
9448 	 * injecting single-step #DBs.
9451 		if (ctxt->mode != X86EMUL_MODE_PROT64)
9452 			ctxt->eip = (u32)ctxt->_eip;
9454 			ctxt->eip = ctxt->_eip;
9461 		kvm_rip_write(vcpu, ctxt->eip);
9462 		if (ctxt->eflags & X86_EFLAGS_RF)
9463 			kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
9468 	 * If emulation was caused by a write-protection #PF on a non-page_table
9480 	if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
9481 		vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
9488 		ctxt->exception.address = cr2_or_gpa;
9491 		if (vcpu->arch.mmu->root_role.direct) {
9492 			ctxt->gpa_available = true;
9493 			ctxt->gpa_val = cr2_or_gpa;
9497 		ctxt->exception.address = 0;
9502 	 * L2, unless KVM is re-emulating a previously decoded instruction,
9520 	if (ctxt->have_exception) {
9521 		WARN_ON_ONCE(vcpu->mmio_needed && !vcpu->mmio_is_write);
9522 		vcpu->mmio_needed = false;
9525 	} else if (vcpu->arch.pio.count) {
9526 		if (!vcpu->arch.pio.in) {
9527 			/* FIXME: return into emulator if single-stepping.  */
9528 			vcpu->arch.pio.count = 0;
9531 			vcpu->arch.complete_userspace_io = complete_emulated_pio;
9534 	} else if (vcpu->mmio_needed) {
9535 		++vcpu->stat.mmio_exits;
9537 		if (!vcpu->mmio_is_write)
9540 		vcpu->arch.complete_userspace_io = complete_emulated_mmio;
9541 	} else if (vcpu->arch.complete_userspace_io) {
9552 		toggle_interruptibility(vcpu, ctxt->interruptibility);
9553 		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
9556 		 * Note, EXCPT_DB is assumed to be fault-like as the emulator
9558 		 * of which are fault-like.
9560 		if (!ctxt->have_exception ||
9561 		    exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
9563 			if (ctxt->is_branch)
9565 			kvm_rip_write(vcpu, ctxt->eip);
9566 			if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
9569 			__kvm_set_rflags(vcpu, ctxt->eflags);
9578 		if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
9581 		vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
9601 	vcpu->arch.pio.count = 0;
9607 	vcpu->arch.pio.count = 0;
9609 	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)))
9629 	    kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
9630 		vcpu->arch.complete_userspace_io =
9634 		vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
9635 		vcpu->arch.complete_userspace_io = complete_fast_pio_out;
9645 	BUG_ON(vcpu->arch.pio.count != 1);
9647 	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) {
9648 		vcpu->arch.pio.count = 0;
9653 	val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
9676 	vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
9677 	vcpu->arch.complete_userspace_io = complete_fast_pio_in;
9708 		khz = freq->new;
9729 	/* TSC frequency always matches when on Hyper-V */
9797 			if (vcpu->cpu != cpu)
9800 			if (vcpu->cpu != raw_smp_processor_id())
9806 	if (freq->old < freq->new && send_ipi) {
9829 	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
9831 	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
9834 	for_each_cpu(cpu, freq->policy->cpus)
9862 				if (policy->cpuinfo.max_freq)
9863 					max_tsc_khz = policy->cpuinfo.max_freq;
9921 	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
9934 	memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
9944 #include <asm/kvm-x86-ops.h>
9947 	kvm_pmu_ops_update(ops->pmu_ops);
9965 		return -EIO;
9984 		return -EEXIST;
9994 		return -EOPNOTSUPP;
9999 		return -EOPNOTSUPP;
10012 		return -EIO;
10018 		 * Linux doesn't yet support supervisor shadow stacks (SSS), so
10024 			return -EIO;
10032 		return -ENOMEM;
10038 		r = -ENOMEM;
10065 	kvm_init_pmu_capability(ops->pmu_ops);
10070 	r = ops->hardware_setup();
10092 	if (pi_inject_timer == -1)
10101 	kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
10195 		return -KVM_EOPNOTSUPP;
10198 	 * When tsc is in permanent catchup mode guests won't be able to use
10201 	if (vcpu->arch.tsc_always_catchup)
10202 		return -KVM_EOPNOTSUPP;
10205 		return -KVM_EOPNOTSUPP;
10214 	if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
10216 		ret = -KVM_EFAULT;
10225  * @apicid - apicid of vcpu to be kicked.
10245 	return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
10251 	ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
10279 	set_or_clear_apicv_inhibit(&kvm->arch.apicv_inhibit_reasons, reason, true);
10281 	init_rwsem(&kvm->arch.apicv_update_lock);
10289 	vcpu->stat.directed_yield_attempted++;
10295 	map = rcu_dereference(vcpu->kvm->arch.apic_map);
10297 	if (likely(map) && dest_id <= map->max_apic_id) {
10298 		dest_id = array_index_nospec(dest_id, map->max_apic_id + 1);
10299 		if (map->phys_map[dest_id])
10300 			target = map->phys_map[dest_id]->vcpu;
10305 	if (!target || !READ_ONCE(target->ready))
10315 	vcpu->stat.directed_yield_successful++;
10323 	u64 ret = vcpu->run->hypercall.ret;
10342 	++vcpu->stat.hypercalls;
10355 		ret = -KVM_EPERM;
10359 	ret = -KVM_ENOSYS;
10369 		kvm_pv_kick_cpu_op(vcpu->kvm, a1);
10382 		ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
10394 		ret = -KVM_ENOSYS;
10395 		if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE))
10400 			ret = -KVM_EINVAL;
10404 		vcpu->run->exit_reason        = KVM_EXIT_HYPERCALL;
10405 		vcpu->run->hypercall.nr       = KVM_HC_MAP_GPA_RANGE;
10407 		 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
10408 		 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
10410 		 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
10412 		vcpu->run->hypercall.ret = 0;
10413 		vcpu->run->hypercall.args[0]  = gpa;
10414 		vcpu->run->hypercall.args[1]  = npages;
10415 		vcpu->run->hypercall.args[2]  = attrs;
10416 		vcpu->run->hypercall.flags    = 0;
10418 			vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE;
10420 		WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
10421 		vcpu->arch.complete_userspace_io = complete_hypercall;
10425 		ret = -KVM_ENOSYS;
10430 	vcpu->run->hypercall.ret = ret;
10437 	if (kvm_xen_hypercall_enabled(vcpu->kvm))
10458 	if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
10459 		ctxt->exception.error_code_valid = false;
10460 		ctxt->exception.vector = UD_VECTOR;
10461 		ctxt->have_exception = true;
10468 		&ctxt->exception);
10473 	return vcpu->run->request_interrupt_window &&
10474 		likely(!pic_in_kernel(vcpu->kvm));
10477 /* Called within kvm->srcu read side.  */
10480 	struct kvm_run *kvm_run = vcpu->run;
10482 	kvm_run->if_flag = kvm_x86_call(get_if_flag)(vcpu);
10483 	kvm_run->cr8 = kvm_get_cr8(vcpu);
10484 	kvm_run->apic_base = vcpu->arch.apic_base;
10486 	kvm_run->ready_for_interrupt_injection =
10487 		pic_in_kernel(vcpu->kvm) ||
10491 		kvm_run->flags |= KVM_RUN_X86_SMM;
10493 		kvm_run->flags |= KVM_RUN_X86_GUEST_MODE;
10506 	if (vcpu->arch.apic->apicv_active)
10509 	if (!vcpu->arch.apic->vapic_addr)
10512 		max_irr = -1;
10514 	if (max_irr != -1)
10526 		kvm_x86_ops.nested_ops->triple_fault(vcpu);
10530 	return kvm_x86_ops.nested_ops->check_events(vcpu);
10536 	 * Suppress the error code if the vCPU is in Real Mode, as Real Mode
10539 	 * is injected as intercepted #PF VM-Exits for AMD's Paged Real Mode do
10540 	 * report an error code despite the CPU being in Real Mode.
10542 	vcpu->arch.exception.has_error_code &= is_protmode(vcpu);
10544 	trace_kvm_inj_exception(vcpu->arch.exception.vector,
10545 				vcpu->arch.exception.has_error_code,
10546 				vcpu->arch.exception.error_code,
10547 				vcpu->arch.exception.injected);
10557  * injected as part of a previous VM-Enter, but weren't successfully delivered
10558  * and need to be re-injected.
10563  * also be able to re-inject NMIs and IRQs in the middle of an instruction.
10564  * I.e. for exceptions and re-injected events, NOT invoking this on instruction
10569  * instruction boundaries for asynchronous events.  However, because VM-Exits
10575  * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip
10598 	 * Process nested events first, as nested VM-Exit supersedes event
10599 	 * re-injection.  If there's an event queued for re-injection, it will
10600 	 * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit.
10608 	 * Re-inject exceptions and events *especially* if immediate entry+exit
10612 	 * Don't re-inject an NMI or interrupt if there is a pending exception.
10621 	 * as the exception "occurred" before the exit to userspace.  Trap-like
10623 	 * And while fault-like exceptions, e.g. #GP and #PF, are the lowest
10626 	 * Thus a pending fault-like exception means the fault occurred on the
10630 	if (vcpu->arch.exception.injected)
10634 	else if (vcpu->arch.nmi_injected)
10636 	else if (vcpu->arch.interrupt.injected)
10640 	 * Exceptions that morph to VM-Exits are handled above, and pending
10641 	 * exceptions on top of injected exceptions that do not VM-Exit should
10644 	WARN_ON_ONCE(vcpu->arch.exception.injected &&
10645 		     vcpu->arch.exception.pending);
10649 	 * nested VM-Enter or event re-injection so that a different pending
10652 	 * Otherwise, continue processing events even if VM-Exit occurred.  The
10653 	 * VM-Exit will have cleared exceptions that were meant for L2, but
10660 	 * A pending exception VM-Exit should either result in nested VM-Exit
10661 	 * or force an immediate re-entry and exit to/from L2, and exception
10662 	 * VM-Exits cannot be injected (flag should _never_ be set).
10664 	WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected ||
10665 		     vcpu->arch.exception_vmexit.pending);
10669 	 * to re-inject a previous event.  See above comments on re-injecting
10674 	if (vcpu->arch.exception.pending) {
10676 		 * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
10677 		 * value pushed on the stack.  Trap-like exception and all #DBs
10678 		 * leave RF as-is (KVM follows Intel's behavior in this regard;
10683 		 * fault-like.  They do _not_ set RF, a la code breakpoints.
10685 		if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT)
10689 		if (vcpu->arch.exception.vector == DB_VECTOR) {
10690 			kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception);
10691 			if (vcpu->arch.dr7 & DR7_GD) {
10692 				vcpu->arch.dr7 &= ~DR7_GD;
10699 		vcpu->arch.exception.pending = false;
10700 		vcpu->arch.exception.injected = true;
10706 	if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ)
10711 	 * due to architectural conditions (e.g. IF=0) a window-open exit
10712 	 * will re-request KVM_REQ_EVENT.  Sometimes however an event is pending
10718 	 * The kvm_x86_ops hooks communicate this by returning -EBUSY.
10721 	if (vcpu->arch.smi_pending) {
10723 				 -EBUSY;
10727 			vcpu->arch.smi_pending = false;
10728 			++vcpu->arch.smi_count;
10736 	if (vcpu->arch.nmi_pending) {
10738 				 -EBUSY;
10742 			--vcpu->arch.nmi_pending;
10743 			vcpu->arch.nmi_injected = true;
10748 		if (vcpu->arch.nmi_pending)
10754 				 -EBUSY;
10760 			if (!WARN_ON_ONCE(irq == -1)) {
10771 	    kvm_x86_ops.nested_ops->has_events &&
10772 	    kvm_x86_ops.nested_ops->has_events(vcpu, true))
10777 	 * is done emulating and should only propagate the to-be-injected event
10779 	 * infinite loop as KVM will bail from VM-Enter to inject the pending
10785 	 * Mode events (see kvm_inject_realmode_interrupt()).
10787 	WARN_ON_ONCE(vcpu->arch.exception.pending ||
10788 		     vcpu->arch.exception_vmexit.pending);
10792 	if (r == -EBUSY) {
10813 	if (kvm_x86_call(get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
10820 	 * tracked in vcpu->arch.nmi_pending.
10823 		limit--;
10825 	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
10826 	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
10828 	if (vcpu->arch.nmi_pending &&
10830 		vcpu->arch.nmi_pending--;
10832 	if (vcpu->arch.nmi_pending)
10839 	return vcpu->arch.nmi_pending +
10856 	struct kvm_lapic *apic = vcpu->arch.apic;
10862 	down_read(&vcpu->kvm->arch.apicv_update_lock);
10869 	if (apic->apicv_active == activate)
10872 	apic->apicv_active = activate;
10882 	if (!apic->apicv_active)
10887 	up_read(&vcpu->kvm->arch.apicv_update_lock);
10904 	 * despite being in x2APIC mode.  For simplicity, inhibiting the APIC
10907 	if (apic_x2apic_mode(vcpu->arch.apic) &&
10919 	lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
10924 	old = new = kvm->arch.apicv_inhibit_reasons;
10942 		kvm->arch.apicv_inhibit_reasons = new;
10945 			int idx = srcu_read_lock(&kvm->srcu);
10948 			srcu_read_unlock(&kvm->srcu, idx);
10951 		kvm->arch.apicv_inhibit_reasons = new;
10961 	down_write(&kvm->arch.apicv_update_lock);
10963 	up_write(&kvm->arch.apicv_update_lock);
10972 	bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
10973 	vcpu->arch.highest_stale_pending_ioapic_eoi = -1;
10977 	if (irqchip_split(vcpu->kvm))
10978 		kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
10980 	else if (ioapic_in_kernel(vcpu->kvm))
10981 		kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
10985 		vcpu->arch.load_eoi_exitmap_pending = true;
10992 	if (!kvm_apic_hw_enabled(vcpu->arch.apic))
11000 			  vcpu->arch.ioapic_handled_vectors,
11001 			  to_hv_synic(vcpu)->vec_bitmap, 256);
11007 		vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
11024  * Called within kvm->srcu read side.
11042 			r = -EIO;
11052 			if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
11062 			kvm_update_masterclock(vcpu->kvm);
11086 		 * Fall back to a "full" guest flush if Hyper-V's precise
11087 		 * flushing fails.  Note, Hyper-V's flushing is per-vCPU, but
11098 			vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
11104 				kvm_x86_ops.nested_ops->triple_fault(vcpu);
11107 				vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
11108 				vcpu->mmio_needed = 0;
11115 			vcpu->arch.apf.halted = true;
11132 			BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
11133 			if (test_bit(vcpu->arch.pending_ioapic_eoi,
11134 				     vcpu->arch.ioapic_handled_vectors)) {
11135 				vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
11136 				vcpu->run->eoi.vector =
11137 						vcpu->arch.pending_ioapic_eoi;
11150 			vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
11151 			vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
11152 			vcpu->run->system_event.ndata = 0;
11157 			vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
11158 			vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
11159 			vcpu->run->system_event.ndata = 0;
11166 			vcpu->run->exit_reason = KVM_EXIT_HYPERV;
11167 			vcpu->run->hyperv = hv_vcpu->exit;
11174 		 * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
11175 		 * depend on the guest clock being up-to-date
11193 			if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) {
11202 		++vcpu->stat.req_event;
11208 		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
11243 	/* Store vcpu->apicv_active before vcpu->mode.  */
11244 	smp_store_release(&vcpu->mode, IN_GUEST_MODE);
11249 	 * 1) We should set ->mode before checking ->requests.  Please see
11252 	 * 2) For APICv, we should set ->mode before checking PID.ON. This
11256 	 * 3) This also orders the write to mode from any reads to the page
11273 		vcpu->mode = OUTSIDE_GUEST_MODE;
11292 	if (vcpu->arch.guest_fpu.xfd_err)
11293 		wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
11295 	if (unlikely(vcpu->arch.switch_db_regs &&
11296 		     !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) {
11298 		set_debugreg(vcpu->arch.eff_db[0], 0);
11299 		set_debugreg(vcpu->arch.eff_db[1], 1);
11300 		set_debugreg(vcpu->arch.eff_db[2], 2);
11301 		set_debugreg(vcpu->arch.eff_db[3], 3);
11303 		if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
11312 	 * vendor code if any host-owned bits were changed, e.g. so that the
11316 	if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL &&
11317 	    !vcpu->arch.guest_state_protected)
11319 	vcpu->arch.host_debugctl = debug_ctl;
11327 		 * per-VM state, and responding vCPUs must wait for the update
11347 		/* Note, VM-Exits that go down the "slow" path are accounted below. */
11348 		++vcpu->stat.exits;
11357 	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
11358 		WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
11359 		WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH);
11375 	vcpu->arch.last_vmentry_cpu = vcpu->cpu;
11376 	vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
11378 	vcpu->mode = OUTSIDE_GUEST_MODE;
11383 	 * rely on the fact that guest_fpu::xfd is up-to-date (e.g.
11386 	if (vcpu->arch.xfd_no_write_intercept)
11391 	if (vcpu->arch.guest_fpu.xfd_err)
11405 	 * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
11412 	++vcpu->stat.exits;
11440 		     !vcpu->arch.guest_state_protected)) {
11445 	if (unlikely(vcpu->arch.tsc_always_catchup))
11448 	if (vcpu->arch.apic_attention)
11461 	if (unlikely(vcpu->arch.apic_attention))
11469 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
11470 		!vcpu->arch.apf.halted);
11475 	if (!list_empty_careful(&vcpu->async_pf.done))
11486 	    (vcpu->arch.nmi_pending &&
11492 	    (vcpu->arch.smi_pending &&
11510 	    kvm_x86_ops.nested_ops->has_events &&
11511 	    kvm_x86_ops.nested_ops->has_events(vcpu, false))
11523 	return kvm_vcpu_running(vcpu) || vcpu->arch.pv.pv_unhalted ||
11527 /* Called within kvm->srcu read side.  */
11534 		 * Switch to the software timer before halt-polling/blocking as
11536 		 * hypervisor timer runs only when the CPU is in guest mode.
11537 		 * Switch before halt-polling so that KVM recognizes an expired
11545 		if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
11566 	 * state field (AMD does not have a similar field and a VM-Exit always
11572 		WARN_ON_ONCE(r == -EBUSY);
11579 	switch(vcpu->arch.mp_state) {
11585 		vcpu->arch.apf.halted = false;
11596 /* Called within kvm->srcu read side.  */
11601 	vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
11610 		vcpu->arch.at_instruction_boundary = false;
11630 			vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
11631 			++vcpu->stat.request_irq_exits;
11651 	 * local APIC is in-kernel, the run loop will detect the non-runnable
11656 	++vcpu->stat.halt_exits;
11658 		if (kvm_vcpu_has_events(vcpu) || vcpu->arch.pv.pv_unhalted)
11663 		vcpu->run->exit_reason = reason;
11678 	 * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
11714 	return vcpu->arch.preempted_in_kernel;
11719 	if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
11739 	BUG_ON(!vcpu->arch.pio.count);
11764 	struct kvm_run *run = vcpu->run;
11768 	BUG_ON(!vcpu->mmio_needed);
11771 	frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
11772 	len = min(8u, frag->len);
11773 	if (!vcpu->mmio_is_write)
11774 		memcpy(frag->data, run->mmio.data, len);
11776 	if (frag->len <= 8) {
11779 		vcpu->mmio_cur_fragment++;
11782 		frag->data += len;
11783 		frag->gpa += len;
11784 		frag->len -= len;
11787 	if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
11788 		vcpu->mmio_needed = 0;
11790 		/* FIXME: return into emulator if single-stepping.  */
11791 		if (vcpu->mmio_is_write)
11793 		vcpu->mmio_read_completed = 1;
11797 	run->exit_reason = KVM_EXIT_MMIO;
11798 	run->mmio.phys_addr = frag->gpa;
11799 	if (vcpu->mmio_is_write)
11800 		memcpy(run->mmio.data, frag->data, min(8u, frag->len));
11801 	run->mmio.len = min(8u, frag->len);
11802 	run->mmio.is_write = vcpu->mmio_is_write;
11803 	vcpu->arch.complete_userspace_io = complete_emulated_mmio;
11810 	/* Exclude PKRU, it's restored separately immediately after VM-Exit. */
11811 	fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
11818 	fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
11819 	++vcpu->stat.fpu_reload;
11826 	 * SIPI_RECEIVED is obsolete; KVM leaves the vCPU in Wait-For-SIPI and
11831 	if (WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED))
11832 		return -EINVAL;
11838 	if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED &&
11840 		return -EINVAL;
11847 	struct kvm_queued_exception *ex = &vcpu->arch.exception;
11848 	struct kvm_run *kvm_run = vcpu->run;
11852 	r = kvm_mmu_post_init_vm(vcpu->kvm);
11858 	kvm_run->flags = 0;
11862 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
11863 		if (!vcpu->wants_to_run) {
11864 			r = -EINTR;
11886 		r = -EAGAIN;
11888 			r = -EINTR;
11889 			kvm_run->exit_reason = KVM_EXIT_INTR;
11890 			++vcpu->stat.signal_exits;
11895 	sync_valid_fields = kvm_sync_valid_fields(vcpu->kvm);
11896 	if ((kvm_run->kvm_valid_regs & ~sync_valid_fields) ||
11897 	    (kvm_run->kvm_dirty_regs & ~sync_valid_fields)) {
11898 		r = -EINVAL;
11902 	if (kvm_run->kvm_dirty_regs) {
11908 	/* re-sync apic's tpr */
11910 		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
11911 			r = -EINVAL;
11918 	 * a pending VM-Exit if L1 wants to intercept the exception.
11920 	if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) &&
11921 	    kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector,
11922 							ex->error_code)) {
11923 		kvm_queue_exception_vmexit(vcpu, ex->vector,
11924 					   ex->has_error_code, ex->error_code,
11925 					   ex->has_payload, ex->payload);
11926 		ex->injected = false;
11927 		ex->pending = false;
11929 	vcpu->arch.exception_from_userspace = false;
11931 	if (unlikely(vcpu->arch.complete_userspace_io)) {
11932 		int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
11933 		vcpu->arch.complete_userspace_io = NULL;
11938 		WARN_ON_ONCE(vcpu->arch.pio.count);
11939 		WARN_ON_ONCE(vcpu->mmio_needed);
11942 	if (!vcpu->wants_to_run) {
11943 		r = -EINTR;
11955 	if (kvm_run->kvm_valid_regs && likely(!vcpu->arch.guest_state_protected))
11967 	if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
11975 		emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
11976 		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
11978 	regs->rax = kvm_rax_read(vcpu);
11979 	regs->rbx = kvm_rbx_read(vcpu);
11980 	regs->rcx = kvm_rcx_read(vcpu);
11981 	regs->rdx = kvm_rdx_read(vcpu);
11982 	regs->rsi = kvm_rsi_read(vcpu);
11983 	regs->rdi = kvm_rdi_read(vcpu);
11984 	regs->rsp = kvm_rsp_read(vcpu);
11985 	regs->rbp = kvm_rbp_read(vcpu);
11987 	regs->r8 = kvm_r8_read(vcpu);
11988 	regs->r9 = kvm_r9_read(vcpu);
11989 	regs->r10 = kvm_r10_read(vcpu);
11990 	regs->r11 = kvm_r11_read(vcpu);
11991 	regs->r12 = kvm_r12_read(vcpu);
11992 	regs->r13 = kvm_r13_read(vcpu);
11993 	regs->r14 = kvm_r14_read(vcpu);
11994 	regs->r15 = kvm_r15_read(vcpu);
11997 	regs->rip = kvm_rip_read(vcpu);
11998 	regs->rflags = kvm_get_rflags(vcpu);
12003 	if (vcpu->kvm->arch.has_protected_state &&
12004 	    vcpu->arch.guest_state_protected)
12005 		return -EINVAL;
12015 	vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
12016 	vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
12018 	kvm_rax_write(vcpu, regs->rax);
12019 	kvm_rbx_write(vcpu, regs->rbx);
12020 	kvm_rcx_write(vcpu, regs->rcx);
12021 	kvm_rdx_write(vcpu, regs->rdx);
12022 	kvm_rsi_write(vcpu, regs->rsi);
12023 	kvm_rdi_write(vcpu, regs->rdi);
12024 	kvm_rsp_write(vcpu, regs->rsp);
12025 	kvm_rbp_write(vcpu, regs->rbp);
12027 	kvm_r8_write(vcpu, regs->r8);
12028 	kvm_r9_write(vcpu, regs->r9);
12029 	kvm_r10_write(vcpu, regs->r10);
12030 	kvm_r11_write(vcpu, regs->r11);
12031 	kvm_r12_write(vcpu, regs->r12);
12032 	kvm_r13_write(vcpu, regs->r13);
12033 	kvm_r14_write(vcpu, regs->r14);
12034 	kvm_r15_write(vcpu, regs->r15);
12037 	kvm_rip_write(vcpu, regs->rip);
12038 	kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
12040 	vcpu->arch.exception.pending = false;
12041 	vcpu->arch.exception_vmexit.pending = false;
12048 	if (vcpu->kvm->arch.has_protected_state &&
12049 	    vcpu->arch.guest_state_protected)
12050 		return -EINVAL;
12062 	if (vcpu->arch.guest_state_protected)
12065 	kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
12066 	kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
12067 	kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
12068 	kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
12069 	kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
12070 	kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
12072 	kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
12073 	kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
12076 	sregs->idt.limit = dt.size;
12077 	sregs->idt.base = dt.address;
12079 	sregs->gdt.limit = dt.size;
12080 	sregs->gdt.base = dt.address;
12082 	sregs->cr2 = vcpu->arch.cr2;
12083 	sregs->cr3 = kvm_read_cr3(vcpu);
12086 	sregs->cr0 = kvm_read_cr0(vcpu);
12087 	sregs->cr4 = kvm_read_cr4(vcpu);
12088 	sregs->cr8 = kvm_get_cr8(vcpu);
12089 	sregs->efer = vcpu->arch.efer;
12090 	sregs->apic_base = vcpu->arch.apic_base;
12097 	if (vcpu->arch.guest_state_protected)
12100 	if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
12101 		set_bit(vcpu->arch.interrupt.nr,
12102 			(unsigned long *)sregs->interrupt_bitmap);
12111 	if (vcpu->arch.guest_state_protected)
12116 			sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
12117 		sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
12124 	if (vcpu->kvm->arch.has_protected_state &&
12125 	    vcpu->arch.guest_state_protected)
12126 		return -EINVAL;
12150 	if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED ||
12151 	     vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) &&
12152 	    vcpu->arch.pv.pv_unhalted)
12153 		mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
12155 		mp_state->mp_state = vcpu->arch.mp_state;
12169 	int ret = -EINVAL;
12173 	switch (mp_state->mp_state) {
12192 	 * leaves the vCPU in INIT_RECEIVED (Wait-For-SIPI) and pends the SIPI.
12195 	if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
12196 		mp_state->mp_state = KVM_MP_STATE_INIT_RECEIVED;
12197 		set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
12200 	kvm_set_mp_state(vcpu, mp_state->mp_state);
12212 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
12219 		 * Check both User and Supervisor on task switches as inter-
12243 	if (ret || vcpu->mmio_needed)
12246 	kvm_rip_write(vcpu, ctxt->eip);
12247 	kvm_set_rflags(vcpu, ctxt->eflags);
12251 	vcpu->mmio_needed = false;
12252 	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
12253 	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
12254 	vcpu->run->internal.ndata = 0;
12261 	if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
12264 		 * 64-bit mode (though maybe in a 32-bit code segment).
12267 		if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
12269 		if (!kvm_vcpu_is_legal_cr3(vcpu, sregs->cr3))
12273 		 * Not in 64-bit mode: EFER.LMA is clear and the code
12274 		 * segment cannot be 64-bit.
12276 		if (sregs->efer & EFER_LMA || sregs->cs.l)
12280 	return kvm_is_valid_cr4(vcpu, sregs->cr4) &&
12281 	       kvm_is_valid_cr0(vcpu, sregs->cr0);
12291 		return -EINVAL;
12293 	if (kvm_apic_set_base(vcpu, sregs->apic_base, true))
12294 		return -EINVAL;
12296 	if (vcpu->arch.guest_state_protected)
12299 	dt.size = sregs->idt.limit;
12300 	dt.address = sregs->idt.base;
12302 	dt.size = sregs->gdt.limit;
12303 	dt.address = sregs->gdt.base;
12306 	vcpu->arch.cr2 = sregs->cr2;
12307 	*mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
12308 	vcpu->arch.cr3 = sregs->cr3;
12310 	kvm_x86_call(post_set_cr3)(vcpu, sregs->cr3);
12312 	kvm_set_cr8(vcpu, sregs->cr8);
12314 	*mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
12315 	kvm_x86_call(set_efer)(vcpu, sregs->efer);
12317 	*mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
12318 	kvm_x86_call(set_cr0)(vcpu, sregs->cr0);
12320 	*mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
12321 	kvm_x86_call(set_cr4)(vcpu, sregs->cr4);
12324 		idx = srcu_read_lock(&vcpu->kvm->srcu);
12329 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
12332 	kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
12333 	kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
12334 	kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
12335 	kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
12336 	kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
12337 	kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
12339 	kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
12340 	kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
12346 	    sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
12369 		(const unsigned long *)sregs->interrupt_bitmap, max_bits);
12382 	bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
12383 	bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) &&
12384 		!(sregs2->efer & EFER_LMA);
12387 	if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
12388 		return -EINVAL;
12390 	if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected))
12391 		return -EINVAL;
12400 			kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
12404 		vcpu->arch.pdptrs_from_userspace = true;
12418 	if (vcpu->kvm->arch.has_protected_state &&
12419 	    vcpu->arch.guest_state_protected)
12420 		return -EINVAL;
12437 	down_write(&kvm->arch.apicv_update_lock);
12440 		if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) {
12446 	up_write(&kvm->arch.apicv_update_lock);
12455 	if (vcpu->arch.guest_state_protected)
12456 		return -EINVAL;
12460 	if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
12461 		r = -EBUSY;
12464 		if (dbg->control & KVM_GUESTDBG_INJECT_DB)
12476 	vcpu->guest_debug = dbg->control;
12477 	if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
12478 		vcpu->guest_debug = 0;
12480 	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
12482 			vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
12483 		vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
12486 			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
12490 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
12491 		vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu);
12501 	kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm);
12516 	unsigned long vaddr = tr->linear_address;
12522 	idx = srcu_read_lock(&vcpu->kvm->srcu);
12524 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
12525 	tr->physical_address = gpa;
12526 	tr->valid = gpa != INVALID_GPA;
12527 	tr->writeable = 1;
12528 	tr->usermode = 0;
12538 	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
12539 		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
12543 	fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
12544 	memcpy(fpu->fpr, fxsave->st_space, 128);
12545 	fpu->fcw = fxsave->cwd;
12546 	fpu->fsw = fxsave->swd;
12547 	fpu->ftwx = fxsave->twd;
12548 	fpu->last_opcode = fxsave->fop;
12549 	fpu->last_ip = fxsave->rip;
12550 	fpu->last_dp = fxsave->rdp;
12551 	memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
12561 	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
12562 		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
12566 	fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
12568 	memcpy(fxsave->st_space, fpu->fpr, 128);
12569 	fxsave->cwd = fpu->fcw;
12570 	fxsave->swd = fpu->fsw;
12571 	fxsave->twd = fpu->ftwx;
12572 	fxsave->fop = fpu->last_opcode;
12573 	fxsave->rip = fpu->last_ip;
12574 	fxsave->rdp = fpu->last_dp;
12575 	memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
12585 	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
12586 		__get_regs(vcpu, &vcpu->run->s.regs.regs);
12588 	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
12589 		__get_sregs(vcpu, &vcpu->run->s.regs.sregs);
12591 	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
12593 				vcpu, &vcpu->run->s.regs.events);
12598 	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
12599 		__set_regs(vcpu, &vcpu->run->s.regs.regs);
12600 		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
12603 	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
12604 		struct kvm_sregs sregs = vcpu->run->s.regs.sregs;
12607 			return -EINVAL;
12609 		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
12612 	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
12613 		struct kvm_vcpu_events events = vcpu->run->s.regs.events;
12616 			return -EINVAL;
12618 		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
12626 	if (kvm_check_tsc_unstable() && kvm->created_vcpus)
12630 	if (!kvm->arch.max_vcpu_ids)
12631 		kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS;
12633 	if (id >= kvm->arch.max_vcpu_ids)
12634 		return -EINVAL;
12644 	vcpu->arch.last_vmentry_cpu = -1;
12645 	vcpu->arch.regs_avail = ~0;
12646 	vcpu->arch.regs_dirty = ~0;
12648 	kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
12650 	if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
12663 	r = -ENOMEM;
12668 	vcpu->arch.pio_data = page_address(page);
12670 	vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
12672 	vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
12674 	if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks)
12676 	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
12678 	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
12685 	if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
12692 	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) {
12693 		vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
12694 		vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
12695 		vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap;
12699 	vcpu->arch.pending_external_vector = -1;
12700 	vcpu->arch.preempted_in_kernel = false;
12703 	vcpu->arch.hv_root_tdp = INVALID_PAGE;
12713 	kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
12720 	fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
12722 	kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
12724 	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
12726 	kfree(vcpu->arch.mce_banks);
12727 	kfree(vcpu->arch.mci_ctl2_banks);
12728 	free_page((unsigned long)vcpu->arch.pio_data);
12738 	struct kvm *kvm = vcpu->kvm;
12740 	if (mutex_lock_killable(&vcpu->mutex))
12747 	vcpu->arch.msr_kvm_poll_control = 1;
12749 	mutex_unlock(&vcpu->mutex);
12751 	if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
12752 		schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
12770 	kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
12771 	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
12772 	fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
12777 	kfree(vcpu->arch.mce_banks);
12778 	kfree(vcpu->arch.mci_ctl2_banks);
12780 	idx = srcu_read_lock(&vcpu->kvm->srcu);
12782 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
12783 	free_page((unsigned long)vcpu->arch.pio_data);
12784 	kvfree(vcpu->arch.cpuid_entries);
12789 	struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
12830 	 * Several of the "set" flows, e.g. ->set_cr0(), read other registers
12840 	 * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's
12851 	vcpu->arch.hflags = 0;
12853 	vcpu->arch.smi_pending = 0;
12854 	vcpu->arch.smi_count = 0;
12855 	atomic_set(&vcpu->arch.nmi_queued, 0);
12856 	vcpu->arch.nmi_pending = 0;
12857 	vcpu->arch.nmi_injected = false;
12861 	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
12863 	vcpu->arch.dr6 = DR6_ACTIVE_LOW;
12864 	vcpu->arch.dr7 = DR7_FIXED_1;
12867 	vcpu->arch.cr2 = 0;
12870 	vcpu->arch.apf.msr_en_val = 0;
12871 	vcpu->arch.apf.msr_int_val = 0;
12872 	vcpu->arch.st.msr_val = 0;
12878 	vcpu->arch.apf.halted = false;
12883 		vcpu->arch.smbase = 0x30000;
12885 		vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
12887 		vcpu->arch.msr_misc_features_enables = 0;
12888 		vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
12896 	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
12907 	kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
12914 	vcpu->arch.cr3 = 0;
12936 	 * which PCIDs have to be flushed.  However, CR0.WP and the paging-related
13006 			if (!stable && vcpu->cpu == smp_processor_id())
13008 			if (stable && vcpu->arch.last_host_tsc > local_tsc) {
13010 				if (vcpu->arch.last_host_tsc > max_tsc)
13011 					max_tsc = vcpu->arch.last_host_tsc;
13041 	 * N.B. - this code below runs only on platforms with reliable TSC,
13051 	 * catchup mode.  This will catch up all VCPUs to real time, but cannot
13055 		u64 delta_cyc = max_tsc - local_tsc;
13057 			kvm->arch.backwards_tsc_observed = true;
13059 				vcpu->arch.tsc_offset_adjustment += delta_cyc;
13060 				vcpu->arch.last_host_tsc = local_tsc;
13070 			kvm->arch.last_tsc_nsec = 0;
13071 			kvm->arch.last_tsc_write = 0;
13086 	return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
13092 	return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
13098 	kfree(kvm->arch.hv_pa_pg);
13110 		return -EINVAL;
13112 	kvm->arch.vm_type = type;
13113 	kvm->arch.has_private_mem =
13116 	kvm->arch.pre_fault_allowed =
13118 	kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks & kvm_caps.supported_quirks;
13132 	atomic_set(&kvm->arch.noncoherent_dma_count, 0);
13134 	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
13135 	mutex_init(&kvm->arch.apic_map_lock);
13136 	seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
13137 	kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
13139 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
13141 	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
13143 	kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
13144 	kvm->arch.apic_bus_cycle_ns = APIC_BUS_CYCLE_NS_DEFAULT;
13145 	kvm->arch.guest_can_read_msr_platform_info = true;
13146 	kvm->arch.enable_pmu = enable_pmu;
13149 	spin_lock_init(&kvm->arch.hv_root_tdp_lock);
13150 	kvm->arch.hv_root_tdp = INVALID_PAGE;
13153 	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
13154 	INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
13167 	once_init(&kvm->arch.nx_once);
13191  *   -errno:        on error
13196  * GPA->HVA translation will not change.  However, the HVA is a user
13208 	lockdep_assert_held(&kvm->slots_lock);
13211 		return ERR_PTR_USR(-EINVAL);
13215 		if (slot && slot->npages)
13216 			return ERR_PTR_USR(-EEXIST);
13227 		if (!slot || !slot->npages)
13230 		old_npages = slot->npages;
13231 		hva = slot->userspace_addr;
13259 	 * is unsafe, i.e. will lead to use-after-free.  The PIT also needs to
13262 	cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
13263 	cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
13275 	if (current->mm == kvm->mm) {
13281 		mutex_lock(&kvm->slots_lock);
13287 		mutex_unlock(&kvm->slots_lock);
13290 	kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
13295 	kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
13296 	kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
13309 		vfree(slot->arch.rmap[i]);
13310 		slot->arch.rmap[i] = NULL;
13321 		vfree(slot->arch.lpage_info[i - 1]);
13322 		slot->arch.lpage_info[i - 1] = NULL;
13330 	const int sz = sizeof(*slot->arch.rmap[0]);
13337 		if (slot->arch.rmap[i])
13340 		slot->arch.rmap[i] = __vcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
13341 		if (!slot->arch.rmap[i]) {
13343 			return -ENOMEM;
13353 	unsigned long npages = slot->npages;
13361 	memset(&slot->arch, 0, sizeof(slot->arch));
13381 		slot->arch.lpage_info[i - 1] = linfo;
13383 		if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
13385 		if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
13386 			linfo[lpages - 1].disallow_lpage = 1;
13387 		ugfn = slot->userspace_addr >> PAGE_SHIFT;
13392 		if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
13413 		vfree(slot->arch.lpage_info[i - 1]);
13414 		slot->arch.lpage_info[i - 1] = NULL;
13416 	return -ENOMEM;
13425 	 * memslots->generation has been incremented.
13430 	/* Force re-initialization of steal_time cache */
13445 		return -EINVAL;
13448 		if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
13449 			return -EINVAL;
13451 		if (kvm_is_gfn_alias(kvm, new->base_gfn + new->npages - 1))
13452 			return -EINVAL;
13458 		memcpy(&new->arch, &old->arch, sizeof(old->arch));
13460 		return -EIO;
13470 	if (!kvm->arch.cpu_dirty_log_size)
13473 	nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging);
13483 	u32 old_flags = old ? old->flags : 0;
13484 	u32 new_flags = new ? new->flags : 0;
13504 	 * CREATE:      No shadow pages exist, thus nothing to write-protect
13513 	 * READONLY and non-flags changes were filtered out above, and the only
13533 		 * Initially-all-set does not require write protecting any page,
13542 		if (kvm->arch.cpu_dirty_log_size) {
13558 		 * write-protected before returning to userspace, i.e. before
13565 		 * Specifically, KVM also write-protects guest page tables to
13574 		 * To handle these scenarios, KVM uses a separate software-only
13575 		 * bit (MMU-writable) to track if a SPTE is !writable due to
13576 		 * a guest page table being write-protected (KVM clears the
13577 		 * MMU-writable flag when write-protecting for shadow paging).
13579 		 * The use of MMU-writable is also the primary motivation for
13582 		 * !MMU-writable SPTE, KVM must flush if it encounters any
13583 		 * MMU-writable SPTE regardless of whether the actual hardware
13586 		 * write access" helpers to ignore MMU-writable entirely.
13589 		 * access-tracked SPTEs is particularly relevant).
13603 	if (!kvm->arch.n_requested_mmu_pages &&
13607 		nr_mmu_pages = kvm->nr_memslot_pages / KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO;
13623 	if (vcpu->arch.guest_state_protected)
13633 	if (vcpu->arch.guest_state_protected)
13652 	if (vcpu->arch.guest_state_protected)
13673 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
13681 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
13682 	    kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
13703 	return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
13710 	while (vcpu->arch.apf.gfns[key] != ~0)
13713 	vcpu->arch.apf.gfns[key] = gfn;
13722 		     (vcpu->arch.apf.gfns[key] != gfn &&
13723 		      vcpu->arch.apf.gfns[key] != ~0); i++)
13731 	return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
13740 	if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
13744 		vcpu->arch.apf.gfns[i] = ~0;
13747 			if (vcpu->arch.apf.gfns[j] == ~0)
13749 			k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
13756 		vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
13765 	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
13773 	return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
13782 	if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
13795 	if (!vcpu->arch.apf.send_always &&
13796 	    (vcpu->arch.guest_state_protected || !kvm_x86_call(get_cpl)(vcpu)))
13804 		return vcpu->arch.apf.delivery_as_pf_vmexit;
13808 		 * The real mode IDT in particular is unlikely to have a #PF
13822 	if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
13837 	trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
13838 	kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
13846 		fault.address = work->arch.token;
13869 		.vector = vcpu->arch.apf.vec
13872 	if (work->wakeup_all)
13873 		work->arch.token = ~0; /* broadcast wakeup */
13875 		kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
13876 	trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
13878 	if ((work->wakeup_all || work->notpresent_injected) &&
13880 	    !apf_put_user_ready(vcpu, work->arch.token)) {
13881 		vcpu->arch.apf.pageready_pending = true;
13885 	vcpu->arch.apf.halted = false;
13892 	if (!vcpu->arch.apf.pageready_pending)
13907 	 * Non-coherent DMA assignment and de-assignment may affect whether or
13910 	 * (or last) non-coherent device is (un)registered to so that new SPTEs
13921 	if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1)
13927 	if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count))
13933 	return atomic_read(&kvm->arch.noncoherent_dma_count);
13939 	return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
13996 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
14002 	    mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) {
14004 		 * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
14015 	vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
14028 		if (KVM_BUG_ON(!e, vcpu->kvm))
14029 			return -EIO;
14039 	 * doesn't seem to be a real use-case behind such requests, just return
14095 		 * page tables, so a non-global flush just degenerates to a
14114 	struct kvm_run *run = vcpu->run;
14118 	BUG_ON(!vcpu->mmio_needed);
14121 	frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
14122 	len = min(8u, frag->len);
14123 	if (!vcpu->mmio_is_write)
14124 		memcpy(frag->data, run->mmio.data, len);
14126 	if (frag->len <= 8) {
14129 		vcpu->mmio_cur_fragment++;
14132 		frag->data += len;
14133 		frag->gpa += len;
14134 		frag->len -= len;
14137 	if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
14138 		vcpu->mmio_needed = 0;
14146 	run->mmio.phys_addr = frag->gpa;
14147 	run->mmio.len = min(8u, frag->len);
14148 	run->mmio.is_write = vcpu->mmio_is_write;
14149 	if (run->mmio.is_write)
14150 		memcpy(run->mmio.data, frag->data, min(8u, frag->len));
14151 	run->exit_reason = KVM_EXIT_MMIO;
14153 	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
14165 		return -EINVAL;
14171 	bytes -= handled;
14176 	frag = vcpu->mmio_fragments;
14177 	vcpu->mmio_nr_fragments = 1;
14178 	frag->len = bytes;
14179 	frag->gpa = gpa;
14180 	frag->data = data;
14182 	vcpu->mmio_needed = 1;
14183 	vcpu->mmio_cur_fragment = 0;
14185 	vcpu->run->mmio.phys_addr = gpa;
14186 	vcpu->run->mmio.len = min(8u, frag->len);
14187 	vcpu->run->mmio.is_write = 1;
14188 	memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
14189 	vcpu->run->exit_reason = KVM_EXIT_MMIO;
14191 	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
14204 		return -EINVAL;
14210 	bytes -= handled;
14215 	frag = vcpu->mmio_fragments;
14216 	vcpu->mmio_nr_fragments = 1;
14217 	frag->len = bytes;
14218 	frag->gpa = gpa;
14219 	frag->data = data;
14221 	vcpu->mmio_needed = 1;
14222 	vcpu->mmio_cur_fragment = 0;
14224 	vcpu->run->mmio.phys_addr = gpa;
14225 	vcpu->run->mmio.len = min(8u, frag->len);
14226 	vcpu->run->mmio.is_write = 0;
14227 	vcpu->run->exit_reason = KVM_EXIT_MMIO;
14229 	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
14237 	vcpu->arch.sev_pio_count -= count;
14238 	vcpu->arch.sev_pio_data += count * size;
14246 	int size = vcpu->arch.pio.size;
14247 	int port = vcpu->arch.pio.port;
14249 	vcpu->arch.pio.count = 0;
14250 	if (vcpu->arch.sev_pio_count)
14260 			min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
14261 		int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count);
14269 		if (!vcpu->arch.sev_pio_count)
14273 	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs;
14282 	unsigned count = vcpu->arch.pio.count;
14283 	int size = vcpu->arch.pio.size;
14284 	int port = vcpu->arch.pio.port;
14286 	complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
14288 	if (vcpu->arch.sev_pio_count)
14298 			min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
14299 		if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count))
14304 		if (!vcpu->arch.sev_pio_count)
14308 	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
14316 	vcpu->arch.sev_pio_data = data;
14317 	vcpu->arch.sev_pio_count = count;