Lines Matching +full:use +full:- +full:rtm
1 // SPDX-License-Identifier: GPL-2.0-only
3 * Kernel-based Virtual Machine driver for Linux
16 * Ben-Ami Yassour <benami@il.ibm.com>
48 #include <linux/user-return-notifier.h>
107 ((struct kvm_vcpu *)(ctxt)->vcpu)
110 * - enable syscall per default because it's emulated by KVM
111 * - enable LME and LMA per default on 64 bit KVM
145 *(((struct kvm_x86_ops *)0)->func));
148 #include <asm/kvm-x86-ops.h>
162 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
171 * Flags to manipulate forced emulation behavior (any non-zero value will
178 int __read_mostly pi_inject_timer = -1;
224 * IA32_XSS[bit 8] (guests can/must use RDMSR/WRMSR to save/restore PT MSRs).
458 * List of MSRs that control the existence of MSR-based features, i.e. MSRs
471 (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
563 size - useroffset, NULL);
572 vcpu->arch.apf.gfns[i] = ~0;
592 msrs->registered = false;
596 values = &msrs->values[slot];
597 if (values->host != values->curr) {
598 wrmsrq(kvm_uret_msrs_list[slot], values->host);
599 values->curr = values->host;
624 return -1;
639 return -1;
651 msrs->values[i].host = value;
652 msrs->values[i].curr = value;
658 if (!msrs->registered) {
659 msrs->urn.on_user_return = kvm_on_user_return;
660 user_return_notifier_register(&msrs->urn);
661 msrs->registered = true;
670 value = (value & mask) | (msrs->values[slot].host & ~mask);
671 if (value == msrs->values[slot].curr)
677 msrs->values[slot].curr = value;
685 return this_cpu_ptr(&user_return_msrs)->values[slot].curr;
693 if (msrs->registered)
694 kvm_on_user_return(&msrs->urn);
748 * #DBs can be trap-like or fault-like, the caller must check other CPU
767 if (!ex->has_payload)
770 switch (ex->vector) {
773 * "Certain debug exceptions may clear bit 0-3. The
777 vcpu->arch.dr6 &= ~DR_TRAP_BITS;
786 * Active low bits should be cleared if 1-setting in payload.
787 * Active high bits should be set if 1-setting in payload.
794 vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
795 vcpu->arch.dr6 |= ex->payload;
796 vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW;
804 vcpu->arch.dr6 &= ~BIT(12);
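The active-low merge at lines 794-796 above is easier to see in isolation. The following stand-alone sketch (not kernel code; a generic mask parameter stands in for DR6_ACTIVE_LOW) shows how a payload that encodes every asserted #DB condition as a 1 bit is folded into DR6.

/*
 * Stand-alone sketch of the merge above: @payload has a 1 for every
 * asserted condition; active-high DR6 bits are set by the OR, and
 * asserted active-low bits end up cleared by the final XOR.
 */
static unsigned long merge_db_payload(unsigned long dr6, unsigned long payload,
				      unsigned long active_low_mask)
{
	dr6 |= active_low_mask;            /* active-low bits default to 1   */
	dr6 |= payload;                    /* asserted active-high bits -> 1 */
	dr6 ^= payload & active_low_mask;  /* asserted active-low bits  -> 0 */
	return dr6;
}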
807 vcpu->arch.cr2 = ex->payload;
811 ex->has_payload = false;
812 ex->payload = 0;
820 struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
822 ex->vector = vector;
823 ex->injected = false;
824 ex->pending = true;
825 ex->has_error_code = has_error_code;
826 ex->error_code = error_code;
827 ex->has_payload = has_payload;
828 ex->payload = payload;
841 * If the exception is destined for L2, morph it to a VM-Exit if L1
845 kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
851 if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
853 vcpu->arch.exception.pending = true;
854 vcpu->arch.exception.injected = false;
856 vcpu->arch.exception.has_error_code = has_error;
857 vcpu->arch.exception.vector = nr;
858 vcpu->arch.exception.error_code = error_code;
859 vcpu->arch.exception.has_payload = has_payload;
860 vcpu->arch.exception.payload = payload;
863 &vcpu->arch.exception);
868 prev_nr = vcpu->arch.exception.vector;
870 /* triple fault -> shutdown */
882 vcpu->arch.exception.injected = false;
883 vcpu->arch.exception.pending = false;
888 that instruction re-execution will regenerate lost
919 * On VM-Entry, an exception can be pending if and only if event
929 * re-checking is incorrect if _L1_ injected the exception, in which
934 vcpu->arch.exception.injected = true;
935 vcpu->arch.exception.has_error_code = has_error_code;
936 vcpu->arch.exception.vector = nr;
937 vcpu->arch.exception.error_code = error_code;
938 vcpu->arch.exception.has_payload = false;
939 vcpu->arch.exception.payload = 0;
967 ++vcpu->stat.pf_guest;
970 * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of
973 if (is_guest_mode(vcpu) && fault->async_page_fault)
975 true, fault->error_code,
976 true, fault->address);
978 kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
979 fault->address);
986 WARN_ON_ONCE(fault->vector != PF_VECTOR);
988 fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
989 vcpu->arch.walk_mmu;
995 if ((fault->error_code & PFERR_PRESENT_MASK) &&
996 !(fault->error_code & PFERR_RSVD_MASK))
997 kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address,
1000 fault_mmu->inject_page_fault(vcpu, fault);
1006 atomic_inc(&vcpu->arch.nmi_queued);
1042 return (vcpu->arch.apf.msr_en_val & mask) == mask;
1047 return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
1055 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
1060 u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
1088 if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
1089 kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);
1091 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
1094 vcpu->arch.pdptrs_from_userspace = false;
1119 * CR0.WP is incorporated into the MMU role, but only for non-nested,
1171 if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
1182 if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
1210 if (vcpu->arch.guest_state_protected)
1216 if (vcpu->arch.xcr0 != kvm_host.xcr0)
1218 load_guest ? vcpu->arch.xcr0 : kvm_host.xcr0);
1221 vcpu->arch.ia32_xss != kvm_host.xss)
1222 wrmsrq(MSR_IA32_XSS, load_guest ? vcpu->arch.ia32_xss : kvm_host.xss);
1227 if (vcpu->arch.guest_state_protected)
1231 vcpu->arch.pkru != vcpu->arch.host_pkru &&
1232 ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
1234 wrpkru(vcpu->arch.pkru);
1239 if (vcpu->arch.guest_state_protected)
1243 ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
1245 vcpu->arch.pkru = rdpkru();
1246 if (vcpu->arch.pkru != vcpu->arch.host_pkru)
1247 wrpkru(vcpu->arch.host_pkru);
1254 return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC;
1261 u64 old_xcr0 = vcpu->arch.xcr0;
1277 valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
1296 vcpu->arch.xcr0 = xcr0;
1299 vcpu->arch.cpuid_dynamic_bits_dirty = true;
1329 * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
1343 * - CR4.PCIDE is changed from 1 to 0
1344 * - CR4.PGE is toggled
1355 * - CR4.SMEP is changed from 0 to 1
1356 * - CR4.PAE is toggled
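The two bullet pairs above (lines 1343-1344 and 1355-1356) come from the comment that separates CR4 changes requiring a flush of all PCIDs from those requiring a flush of only the current PCID. A condensed sketch of that decision follows; flush_all_pcids() and flush_current_pcid() are hypothetical stand-ins, not KVM's request-based flush machinery.

/*
 * Condensed sketch of the rules above (illustrative only; the two flush
 * helpers are hypothetical, not KVM functions).
 */
static void cr4_flush_sketch(unsigned long old_cr4, unsigned long new_cr4)
{
	/* Flush all PCIDs: CR4.PCIDE changed 1 -> 0, or CR4.PGE toggled. */
	if (((old_cr4 & X86_CR4_PCIDE) && !(new_cr4 & X86_CR4_PCIDE)) ||
	    ((old_cr4 ^ new_cr4) & X86_CR4_PGE))
		flush_all_pcids();		/* hypothetical helper */
	/* Flush the current PCID: CR4.SMEP changed 0 -> 1, or CR4.PAE toggled. */
	else if ((!(old_cr4 & X86_CR4_SMEP) && (new_cr4 & X86_CR4_SMEP)) ||
		 ((old_cr4 ^ new_cr4) & X86_CR4_PAE))
		flush_current_pcid();		/* hypothetical helper */
}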
1401 struct kvm_mmu *mmu = vcpu->arch.mmu;
1418 * If neither the current CR3 nor any of the prev_roots use the given
1436 if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
1439 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
1472 vcpu->arch.cr3 = cr3;
1481 * and it's impossible to use a non-zero PCID when PCID is disabled,
1498 vcpu->arch.cr8 = cr8;
1508 return vcpu->arch.cr8;
1516 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1518 vcpu->arch.eff_db[i] = vcpu->arch.db[i];
1526 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1527 dr7 = vcpu->arch.guest_debug_dr7;
1529 dr7 = vcpu->arch.dr7;
1531 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
1533 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
1551 size_t size = ARRAY_SIZE(vcpu->arch.db);
1555 vcpu->arch.db[array_index_nospec(dr, size)] = val;
1556 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1557 vcpu->arch.eff_db[dr] = val;
1563 vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
1569 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
1580 size_t size = ARRAY_SIZE(vcpu->arch.db);
1584 return vcpu->arch.db[array_index_nospec(dr, size)];
1587 return vcpu->arch.dr6;
1590 return vcpu->arch.dr7;
1614 * 10 - MISC_PACKAGE_CTRLS
1615 * 11 - ENERGY_FILTERING_CTL
1616 * 12 - DOITM
1617 * 18 - FB_CLEAR_CTRL
1618 * 21 - XAPIC_DISABLE_STATUS
1619 * 23 - OVERCLOCKING_STATUS
1667 * If RTM=0 because the kernel has disabled TSX, the host might
1668 * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0
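Since the comment is truncated here, a minimal sketch of the adjustment it describes follows. It assumes the X86_FEATURE_RTM and ARCH_CAP_TAA_NO definitions from the kernel headers and a local data variable holding the ARCH_CAPABILITIES value being assembled, and it omits the in-tree handling of TSX_CTRL.

	/*
	 * Minimal sketch (assumptions noted above): with RTM hidden, the
	 * guest already knows TAA cannot occur, so TAA_NO adds nothing.
	 */
	if (!boot_cpu_has(X86_FEATURE_RTM))
		data &= ~ARCH_CAP_TAA_NO;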
1752 u64 old_efer = vcpu->arch.efer;
1753 u64 efer = msr_info->data;
1759 if (!msr_info->host_initiated) {
1764 (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
1769 efer |= vcpu->arch.efer & EFER_LMA;
1797 struct kvm *kvm = vcpu->kvm;
1806 idx = srcu_read_lock(&kvm->srcu);
1808 msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
1814 allowed = msr_filter->default_allow;
1815 ranges = msr_filter->ranges;
1817 for (i = 0; i < msr_filter->count; i++) {
1824 allowed = test_bit(index - start, bitmap);
1830 srcu_read_unlock(&kvm->srcu, idx);
1839 * Returns 0 on success, non-0 otherwise.
1860 * non-canonical address is written on Intel but not on
1861 * AMD (which ignores the top 32-bits, because it does
1862 * not implement 64-bit SYSENTER).
1864 * 64-bit code should hence be able to write a non-canonical
1866 * vmentry does not fail on Intel after writing a non-canonical
1868 * invokes 64-bit SYSENTER.
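The resolution the comment describes is to store a canonical value regardless of vendor. A small illustrative helper follows; it mirrors the sign-extension performed by the kernel's __canonical_address(), with the virtual-address width passed in by the caller.

/*
 * Illustrative helper: sign-extend @addr from @vaddr_bits so the stored
 * SYSENTER_EIP/ESP value is always canonical.
 */
static u64 make_canonical(u64 addr, unsigned int vaddr_bits)
{
	return (u64)((s64)(addr << (64 - vaddr_bits)) >> (64 - vaddr_bits));
}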
1887 * clear the bits. This ensures cross-vendor migration will
1929 /* All SSP MSRs except MSR_IA32_INT_SSP_TAB must be 4-byte aligned */
1958 * Returns 0 on success, non-0 otherwise.
2052 if (!vcpu->run->msr.error) {
2053 kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
2054 kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
2060 return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error);
2071 return kvm_x86_call(complete_emulated_msr)(vcpu, vcpu->run->msr.error);
2082 if (!vcpu->run->msr.error)
2083 kvm_register_write(vcpu, vcpu->arch.cui_rdmsr_imm_reg,
2084 vcpu->run->msr.data);
2109 if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
2112 vcpu->run->exit_reason = exit_reason;
2113 vcpu->run->msr.error = 0;
2114 memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
2115 vcpu->run->msr.reason = msr_reason;
2116 vcpu->run->msr.index = index;
2117 vcpu->run->msr.data = data;
2118 vcpu->arch.complete_userspace_io = completion;
2135 kvm_rax_write(vcpu, data & -1u);
2136 kvm_rdx_write(vcpu, (data >> 32) & -1u);
2153 return __kvm_emulate_rdmsr(vcpu, kvm_rcx_read(vcpu), -1,
2160 vcpu->arch.cui_rdmsr_imm_reg = reg;
2233 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS))
2236 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT))
2239 enabled = vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT;
2264 return READ_ONCE(vcpu->mode) == EXITING_GUEST_MODE ||
2272 if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) ||
2273 kvm_x2apic_icr_write_fast(vcpu->arch.apic, data))
2325 return -EINVAL;
2357 write_seqcount_begin(&vdata->seq);
2360 vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode;
2361 vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
2362 vdata->clock.mask = tk->tkr_mono.mask;
2363 vdata->clock.mult = tk->tkr_mono.mult;
2364 vdata->clock.shift = tk->tkr_mono.shift;
2365 vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec;
2366 vdata->clock.offset = tk->tkr_mono.base;
2368 vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode;
2369 vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last;
2370 vdata->raw_clock.mask = tk->tkr_raw.mask;
2371 vdata->raw_clock.mult = tk->tkr_raw.mult;
2372 vdata->raw_clock.shift = tk->tkr_raw.shift;
2373 vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec;
2374 vdata->raw_clock.offset = tk->tkr_raw.base;
2376 vdata->wall_time_sec = tk->xtime_sec;
2378 vdata->offs_boot = tk->offs_boot;
2380 write_seqcount_end(&vdata->seq);
2391 /* Master clock not used, so we can just use CLOCK_BOOTTIME. */
2440 struct kvm_arch *ka = &vcpu->kvm->arch;
2442 if (vcpu->vcpu_id == 0 && !host_initiated) {
2443 if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
2446 ka->boot_vcpu_runs_old_kvmclock = old_msr;
2449 vcpu->arch.time = system_time;
2454 kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
2457 kvm_gpc_deactivate(&vcpu->arch.pv_time);
2480 shift--;
2525 vcpu->arch.tsc_catchup = 1;
2526 vcpu->arch.tsc_always_catchup = 1;
2530 return -1;
2534 /* TSC scaling required - calculate ratio */
2539 pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
2541 return -1;
2557 return -1;
2562 &vcpu->arch.virtual_tsc_shift,
2563 &vcpu->arch.virtual_tsc_mult);
2564 vcpu->arch.virtual_tsc_khz = user_tsc_khz;
2572 thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
2584 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
2585 vcpu->arch.virtual_tsc_mult,
2586 vcpu->arch.virtual_tsc_shift);
2587 tsc += vcpu->arch.this_tsc_write;
2601 struct kvm_arch *ka = &vcpu->kvm->arch;
2605 * To use the masterclock, the host clocksource must be based on TSC
2609 bool use_master_clock = (ka->nr_vcpus_matched_tsc + 1 ==
2610 atomic_read(&vcpu->kvm->online_vcpus)) &&
2611 gtod_is_based_on_tsc(gtod->clock.vclock_mode);
2619 if ((ka->use_master_clock && new_generation) ||
2620 (ka->use_master_clock != use_master_clock))
2623 trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
2624 atomic_read(&vcpu->kvm->online_vcpus),
2625 ka->use_master_clock, gtod->clock.vclock_mode);
2632 * The most significant 64-N bits (mult) of ratio represent the
2635 * point number (mult + frac * 2^(-N)).
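To make the fixed-point format concrete: with N fractional bits, a guest-to-host frequency ratio is stored as (guest_khz << N) / host_khz, and scaling a TSC value is a 64x64->128-bit multiply followed by a right shift of N. A worked sketch, assuming N = 48 and the mul_u64_u64_shr() helper from <linux/math64.h>:

/*
 * Worked example of the format above, assuming N = 48 fractional bits:
 * ratio = (guest_khz << 48) / host_khz.
 */
static u64 scale_tsc_sketch(u64 host_tsc, u64 ratio)
{
	return mul_u64_u64_shr(host_tsc, ratio, 48);
}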
2658 tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
2660 return target_tsc - tsc;
2665 return vcpu->arch.l1_tsc_offset +
2666 kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
2697 if (vcpu->arch.guest_tsc_protected)
2700 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2701 vcpu->arch.l1_tsc_offset,
2704 vcpu->arch.l1_tsc_offset = l1_offset;
2712 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
2717 vcpu->arch.tsc_offset = l1_offset;
2724 vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
2728 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
2732 vcpu->arch.tsc_scaling_ratio = l1_multiplier;
2742 * TSC is marked unstable when we're running on Hyper-V,
2759 struct kvm *kvm = vcpu->kvm;
2761 lockdep_assert_held(&kvm->arch.tsc_write_lock);
2763 if (vcpu->arch.guest_tsc_protected)
2767 vcpu->kvm->arch.user_set_tsc = true;
2773 kvm->arch.last_tsc_nsec = ns;
2774 kvm->arch.last_tsc_write = tsc;
2775 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
2776 kvm->arch.last_tsc_offset = offset;
2778 vcpu->arch.last_guest_tsc = tsc;
2790 * These values are tracked in kvm->arch.cur_xxx variables.
2792 kvm->arch.cur_tsc_generation++;
2793 kvm->arch.cur_tsc_nsec = ns;
2794 kvm->arch.cur_tsc_write = tsc;
2795 kvm->arch.cur_tsc_offset = offset;
2796 kvm->arch.nr_vcpus_matched_tsc = 0;
2797 } else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
2798 kvm->arch.nr_vcpus_matched_tsc++;
2802 vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
2803 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
2804 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
2812 struct kvm *kvm = vcpu->kvm;
2818 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
2821 elapsed = ns - kvm->arch.last_tsc_nsec;
2823 if (vcpu->arch.virtual_tsc_khz) {
2830 } else if (kvm->arch.user_set_tsc) {
2831 u64 tsc_exp = kvm->arch.last_tsc_write +
2833 u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
2835 * Here lies UAPI baggage: when a user-initiated TSC write has
2846 * come from the kernel's default vCPU creation. Make the 1-second
2862 vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
2864 offset = kvm->arch.cur_tsc_offset;
2874 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
2880 u64 tsc_offset = vcpu->arch.l1_tsc_offset;
2886 if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio)
2889 vcpu->arch.l1_tsc_scaling_ratio);
2921 switch (clock->vclock_mode) {
2927 v = (tsc_pg_val - clock->cycle_last) &
2928 clock->mask;
2937 v = (*tsc_timestamp - clock->cycle_last) &
2938 clock->mask;
2947 return v * clock->mult;
2952 * frequency of CLOCK_MONOTONIC_RAW (hence adding gtod->offs_boot).
2962 seq = read_seqcount_begin(&gtod->seq);
2963 ns = gtod->raw_clock.base_cycles;
2964 ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
2965 ns >>= gtod->raw_clock.shift;
2966 ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
2967 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2985 seq = read_seqcount_begin(&gtod->seq);
2986 ns = gtod->clock.base_cycles;
2987 ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
2988 ns >>= gtod->clock.shift;
2989 ns += ktime_to_ns(gtod->clock.offset);
2990 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
3004 seq = read_seqcount_begin(&gtod->seq);
3005 ts->tv_sec = gtod->wall_time_sec;
3006 ns = gtod->clock.base_cycles;
3007 ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
3008 ns >>= gtod->clock.shift;
3009 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
3011 ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
3012 ts->tv_nsec = ns;
3050 * DO NOT USE this for anything related to migration. You want CLOCK_TAI
3080 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
3081 * 5. | ret1 = timespec1 + (rdtsc - tsc1)
3082 * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
3086 * - ret0 < ret1
3087 * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
3089 * - 0 < N - M => M < N
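A toy restatement of the inequality above with concrete numbers (purely illustrative, not kernel code): CPU1 samples its (timespec, tsc) pair N ns and M ticks after CPU0 did, and a guest later combines either sample with the same rdtsc value.

/* Toy numbers: monotonicity across the two samples needs M < N. */
static int monotonic_example(void)
{
	u64 timespec0 = 1000, tsc0 = 5000;		/* CPU0 sample        */
	u64 N = 30, M = 20;				/* CPU1 sampled later  */
	u64 rdtsc_now = 6000;				/* later guest read    */
	u64 ret0 = timespec0 + (rdtsc_now - tsc0);		/* = 2000 */
	u64 ret1 = timespec0 + N + (rdtsc_now - (tsc0 + M));	/* = 2010 */

	return ret1 > ret0;	/* ret1 - ret0 == N - M, true iff M < N */
}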
3097 * system_timestamp/tsc_timestamp values simultaneously: use a master
3108 struct kvm_arch *ka = &kvm->arch;
3112 lockdep_assert_held(&kvm->arch.tsc_write_lock);
3113 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
3114 atomic_read(&kvm->online_vcpus));
3121 &ka->master_kernel_ns,
3122 &ka->master_cycle_now);
3124 ka->use_master_clock = host_tsc_clocksource && vcpus_matched
3125 && !ka->backwards_tsc_observed
3126 && !ka->boot_vcpu_runs_old_kvmclock;
3128 if (ka->use_master_clock)
3132 trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
3144 raw_spin_lock_irq(&kvm->arch.tsc_write_lock);
3145 write_seqcount_begin(&kvm->arch.pvclock_sc);
3158 struct kvm_arch *ka = &kvm->arch;
3162 write_seqcount_end(&ka->pvclock_sc);
3163 raw_spin_unlock_irq(&ka->tsc_write_lock);
3181 * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's
3182 * per-CPU value (which may be zero if a CPU is going offline). Note, tsc_khz
3196 /* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */
3199 struct kvm_arch *ka = &kvm->arch;
3205 data->flags = 0;
3206 if (ka->use_master_clock &&
3211 if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
3212 data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
3213 data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
3216 data->host_tsc = rdtsc();
3218 data->flags |= KVM_CLOCK_TSC_STABLE;
3219 hv_clock.tsc_timestamp = ka->master_cycle_now;
3220 hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
3224 data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
3226 data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
3234 struct kvm_arch *ka = &kvm->arch;
3238 seq = read_seqcount_begin(&ka->pvclock_sc);
3240 } while (read_seqcount_retry(&ka->pvclock_sc, seq));
3262 read_lock_irqsave(&gpc->lock, flags);
3264 read_unlock_irqrestore(&gpc->lock, flags);
3269 read_lock_irqsave(&gpc->lock, flags);
3272 guest_hv_clock = (void *)(gpc->khva + offset);
3281 guest_hv_clock->version = hv_clock.version = (guest_hv_clock->version + 1) | 1;
3285 hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
3291 guest_hv_clock->version = ++hv_clock.version;
3294 read_unlock_irqrestore(&gpc->lock, flags);
3296 trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock);
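For context, the odd/even version dance above is a seqcount-style protocol. A sketch of the matching consumer loop is below, using the field names of struct pvclock_vcpu_time_info; it is illustrative, not the in-tree pvclock reader helpers.

/*
 * Sketch of the consumer side: retry while an update is in flight (odd
 * version) or the version changed underneath the read.
 */
static u64 pvclock_read_tsc_timestamp(volatile struct pvclock_vcpu_time_info *hv)
{
	u32 version;
	u64 tsc_timestamp;

	do {
		version = hv->version;
		smp_rmb();
		tsc_timestamp = hv->tsc_timestamp;
		smp_rmb();
	} while ((version & 1) || version != hv->version);

	return tsc_timestamp;
}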
3304 struct kvm_vcpu_arch *vcpu = &v->arch;
3305 struct kvm_arch *ka = &v->kvm->arch;
3318 seq = read_seqcount_begin(&ka->pvclock_sc);
3319 use_master_clock = ka->use_master_clock;
3321 host_tsc = ka->master_cycle_now;
3322 kernel_ns = ka->master_kernel_ns;
3324 } while (read_seqcount_retry(&ka->pvclock_sc, seq));
3351 if (vcpu->tsc_catchup) {
3354 adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
3365 v->arch.l1_tsc_scaling_ratio);
3369 if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
3371 &vcpu->pvclock_tsc_shift,
3372 &vcpu->pvclock_tsc_mul);
3373 vcpu->hw_tsc_khz = tgt_tsc_khz;
3376 hv_clock.tsc_shift = vcpu->pvclock_tsc_shift;
3377 hv_clock.tsc_to_system_mul = vcpu->pvclock_tsc_mul;
3379 hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
3380 vcpu->last_guest_tsc = tsc_timestamp;
3387 if (vcpu->pv_time.active) {
3393 if (vcpu->pvclock_set_guest_stopped_request) {
3395 vcpu->pvclock_set_guest_stopped_request = false;
3397 kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->pv_time, 0);
3402 kvm_hv_setup_tsc_page(v->kvm, &hv_clock);
3407 * explicitly told to use TSC as its clocksource Xen will not set this bit.
3413 if (ka->xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
3416 if (vcpu->xen.vcpu_info_cache.active)
3417 kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_info_cache,
3419 if (vcpu->xen.vcpu_time_info_cache.active)
3420 kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0);
3448 struct kvm_arch *ka = &kvm->arch;
3454 seq = read_seqcount_begin(&ka->pvclock_sc);
3457 if (!ka->use_master_clock)
3479 hv_clock.tsc_timestamp = ka->master_cycle_now;
3480 hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
3482 } while (read_seqcount_retry(&ka->pvclock_sc, seq));
3488 * since 1970-01-01.
3494 return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec -
3498 return ktime_get_real_ns() - get_kvmclock_ns(kvm);
3503 * vcpu->cpu migration, should not allow system_timestamp from
3514 struct kvm *kvm = v->kvm;
3539 return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
3546 u64 mcg_cap = vcpu->arch.mcg_cap;
3548 u32 msr = msr_info->index;
3549 u64 data = msr_info->data;
3554 vcpu->arch.mcg_status = data;
3558 (data || !msr_info->host_initiated))
3562 vcpu->arch.mcg_ctl = data;
3564 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
3565 last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
3569 if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated))
3574 offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
3575 last_msr + 1 - MSR_IA32_MC0_CTL2);
3576 vcpu->arch.mci_ctl2_banks[offset] = data;
3578 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3579 last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
3591 * single-bit ECC data errors.
3599 * AMD-based CPUs allow non-zero values, but if and only if
3602 if (!msr_info->host_initiated && is_mci_status_msr(msr) &&
3606 offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
3607 last_msr + 1 - MSR_IA32_MC0_CTL);
3608 vcpu->arch.mce_banks[offset] = data;
3635 vcpu->arch.apf.msr_en_val = data;
3643 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
3647 vcpu->arch.apf.send_always = (data & KVM_ASYNC_PF_SEND_ALWAYS);
3648 vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
3657 /* Bits 8-63 are reserved */
3664 vcpu->arch.apf.msr_int_val = data;
3666 vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
3673 kvm_gpc_deactivate(&vcpu->arch.pv_time);
3674 vcpu->arch.time = 0;
3679 ++vcpu->stat.tlb_flush;
3688 ++vcpu->stat.tlb_flush;
3704 * Flushing all "guest" TLB is always a superset of Hyper-V's fine
3713 ++vcpu->stat.tlb_flush;
3721 * prior before nested VM-Enter/VM-Exit.
3735 struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
3738 gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
3742 if (kvm_xen_msr_enabled(vcpu->kvm)) {
3747 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
3750 if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
3753 slots = kvm_memslots(vcpu->kvm);
3755 if (unlikely(slots->generation != ghc->generation ||
3756 gpa != ghc->gpa ||
3757 kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
3759 BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
3761 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
3762 kvm_is_error_hva(ghc->hva) || !ghc->memslot)
3766 st = (struct kvm_steal_time __user *)ghc->hva;
3773 int err = -EFAULT;
3784 "+m" (st->preempted));
3790 vcpu->arch.st.preempted = 0;
3792 trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
3803 unsafe_put_user(0, &st->preempted, out);
3804 vcpu->arch.st.preempted = 0;
3807 unsafe_get_user(version, &st->version, out);
3812 unsafe_put_user(version, &st->version, out);
3816 unsafe_get_user(steal, &st->steal, out);
3817 steal += current->sched_info.run_delay -
3818 vcpu->arch.st.last_steal;
3819 vcpu->arch.st.last_steal = current->sched_info.run_delay;
3820 unsafe_put_user(steal, &st->steal, out);
3823 unsafe_put_user(version, &st->version, out);
3828 mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
3854 * Lock (and if necessary, re-load) the guest FPU, i.e. XSTATE, and access an
3859 * Note, loading guest values for U_CET and PL[0-3]_SSP while executing in the
3860 * kernel is safe, as U_CET is specific to userspace, and PL[0-3]_SSP are only
3870 KVM_BUG_ON(!is_xstate_managed_msr(vcpu, msr_info->index), vcpu->kvm);
3871 KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm);
3875 rdmsrq(msr_info->index, msr_info->data);
3877 wrmsrq(msr_info->index, msr_info->data);
3893 u32 msr = msr_info->index;
3894 u64 data = msr_info->data;
3897 * Do not allow host-initiated writes to trigger the Xen hypercall
3901 if (kvm_xen_is_hypercall_page_msr(vcpu->kvm, msr) &&
3902 !msr_info->host_initiated)
3917 if (msr_info->host_initiated)
3918 vcpu->arch.microcode_version = data;
3921 if (!msr_info->host_initiated ||
3924 vcpu->arch.arch_capabilities = data;
3927 if (!msr_info->host_initiated ||
3939 if (vcpu->arch.perf_capabilities == data)
3942 vcpu->arch.perf_capabilities = data;
3948 if (!msr_info->host_initiated) {
3976 if (!msr_info->host_initiated &&
4003 vcpu->arch.msr_hwcr = data;
4015 vcpu->arch.pat = data;
4021 return kvm_apic_set_base(vcpu, data, msr_info->host_initiated);
4029 if (!msr_info->host_initiated) {
4030 s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
4037 vcpu->arch.ia32_tsc_adjust_msr = data;
4041 u64 old_val = vcpu->arch.ia32_misc_enable_msr;
4043 if (!msr_info->host_initiated) {
4053 if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
4057 vcpu->arch.ia32_misc_enable_msr = data;
4058 vcpu->arch.cpuid_dynamic_bits_dirty = true;
4060 vcpu->arch.ia32_misc_enable_msr = data;
4065 if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
4067 vcpu->arch.smbase = data;
4070 vcpu->arch.msr_ia32_power_ctl = data;
4073 if (msr_info->host_initiated) {
4075 } else if (!vcpu->arch.guest_tsc_protected) {
4076 u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
4078 vcpu->arch.ia32_tsc_adjust_msr += adj;
4085 if (data & ~vcpu->arch.guest_supported_xss)
4087 if (vcpu->arch.ia32_xss == data)
4089 vcpu->arch.ia32_xss = data;
4090 vcpu->arch.cpuid_dynamic_bits_dirty = true;
4093 if (!msr_info->host_initiated)
4095 vcpu->arch.smi_count = data;
4101 vcpu->kvm->arch.wall_clock = data;
4102 kvm_write_wall_clock(vcpu->kvm, data, 0);
4108 vcpu->kvm->arch.wall_clock = data;
4109 kvm_write_wall_clock(vcpu->kvm, data, 0);
4115 kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
4121 kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
4145 smp_store_mb(vcpu->arch.apf.pageready_pending, false);
4160 vcpu->arch.st.msr_val = data;
4181 if (data & (-1ULL << 1))
4184 vcpu->arch.msr_kvm_poll_control = data;
4189 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
4190 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
4207 * all pre-dating SVM, but a recommended workaround from
4225 msr_info->host_initiated);
4228 /* Drop writes to this legacy MSR -- see rdmsr
4236 vcpu->arch.osvw.length = data;
4241 vcpu->arch.osvw.status = data;
4244 if (!msr_info->host_initiated)
4246 vcpu->arch.msr_platform_info = data;
4253 vcpu->arch.msr_misc_features_enables = data;
4257 if (!msr_info->host_initiated &&
4264 fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data);
4267 if (!msr_info->host_initiated &&
4274 vcpu->arch.guest_fpu.xfd_err = data;
4294 u64 mcg_cap = vcpu->arch.mcg_cap;
4304 data = vcpu->arch.mcg_cap;
4309 data = vcpu->arch.mcg_ctl;
4312 data = vcpu->arch.mcg_status;
4314 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
4315 last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
4321 offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
4322 last_msr + 1 - MSR_IA32_MC0_CTL2);
4323 data = vcpu->arch.mci_ctl2_banks[offset];
4325 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
4326 last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
4330 offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
4331 last_msr + 1 - MSR_IA32_MC0_CTL);
4332 data = vcpu->arch.mce_banks[offset];
4343 switch (msr_info->index) {
4366 * so for existing CPU-specific MSRs.
4373 msr_info->data = 0;
4379 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
4381 msr_info->data = 0;
4384 msr_info->data = vcpu->arch.microcode_version;
4389 msr_info->data = vcpu->arch.arch_capabilities;
4394 msr_info->data = vcpu->arch.perf_capabilities;
4397 msr_info->data = vcpu->arch.msr_ia32_power_ctl;
4406 * return L1's TSC value to ensure backwards-compatible
4411 if (msr_info->host_initiated) {
4412 offset = vcpu->arch.l1_tsc_offset;
4413 ratio = vcpu->arch.l1_tsc_scaling_ratio;
4415 offset = vcpu->arch.tsc_offset;
4416 ratio = vcpu->arch.tsc_scaling_ratio;
4419 msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
4423 msr_info->data = vcpu->arch.pat;
4428 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
4430 msr_info->data = 3;
4444 msr_info->data = 1 << 24;
4447 msr_info->data = vcpu->arch.apic_base;
4450 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
4452 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
4455 msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
4458 msr_info->data = vcpu->arch.ia32_misc_enable_msr;
4461 if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
4463 msr_info->data = vcpu->arch.smbase;
4466 msr_info->data = vcpu->arch.smi_count;
4470 msr_info->data = 1000ULL;
4472 msr_info->data |= (((uint64_t)4ULL) << 40);
4475 msr_info->data = vcpu->arch.efer;
4481 msr_info->data = vcpu->kvm->arch.wall_clock;
4487 msr_info->data = vcpu->kvm->arch.wall_clock;
4493 msr_info->data = vcpu->arch.time;
4499 msr_info->data = vcpu->arch.time;
4505 msr_info->data = vcpu->arch.apf.msr_en_val;
4511 msr_info->data = vcpu->arch.apf.msr_int_val;
4517 msr_info->data = 0;
4523 msr_info->data = vcpu->arch.st.msr_val;
4529 msr_info->data = vcpu->arch.pv_eoi.msr_val;
4535 msr_info->data = vcpu->arch.msr_kvm_poll_control;
4542 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
4543 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
4544 return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
4545 msr_info->host_initiated);
4547 if (!msr_info->host_initiated &&
4550 msr_info->data = vcpu->arch.ia32_xss;
4554 * Provide expected ramp-up count for K7. All other
4562 msr_info->data = 0x20000000;
4576 msr_info->index, &msr_info->data,
4577 msr_info->host_initiated);
4590 msr_info->data = 0xbe702111;
4595 msr_info->data = vcpu->arch.osvw.length;
4600 msr_info->data = vcpu->arch.osvw.status;
4603 if (!msr_info->host_initiated &&
4604 !vcpu->kvm->arch.guest_can_read_msr_platform_info)
4606 msr_info->data = vcpu->arch.msr_platform_info;
4609 msr_info->data = vcpu->arch.msr_misc_features_enables;
4612 msr_info->data = vcpu->arch.msr_hwcr;
4616 if (!msr_info->host_initiated &&
4620 msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd;
4623 if (!msr_info->host_initiated &&
4627 msr_info->data = vcpu->arch.guest_fpu.xfd_err;
4635 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
4657 for (i = 0; i < msrs->nmsrs; ++i) {
4659 * If userspace is accessing one or more XSTATE-managed MSRs,
4692 r = -EFAULT;
4696 r = -E2BIG;
4701 entries = memdup_user(user_msrs->entries, size);
4709 if (writeback && copy_to_user(user_msrs->entries, entries, size))
4710 r = -EFAULT;
4748 r = -EFAULT;
4752 r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries);
4756 r = -EFAULT;
4771 return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS;
4914 r = kvm->max_vcpus;
4936 r = kvm_x86_ops.nested_ops->get_state ?
4937 kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
4944 r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
4989 if (attr->group) {
4991 return kvm_x86_call(dev_get_attr)(attr->group, attr->attr, val);
4992 return -ENXIO;
4995 switch (attr->attr) {
5000 return -ENXIO;
5006 u64 __user *uaddr = u64_to_user_ptr(attr->addr);
5015 return -EFAULT;
5039 r = -EFAULT;
5046 r = -E2BIG;
5049 r = -EFAULT;
5050 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
5053 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
5065 r = -EFAULT;
5069 r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
5074 r = -EFAULT;
5081 r = -EFAULT;
5092 r = -EFAULT;
5099 r = -E2BIG;
5102 r = -EFAULT;
5103 if (copy_to_user(user_msr_list->indices, &msr_based_features,
5119 r = -EFAULT;
5127 r = -EFAULT;
5134 r = -EINVAL;
5143 return kvm_arch_has_noncoherent_dma(vcpu->kvm);
5154 if (vcpu->scheduled_out && pmu->version && pmu->event_count) {
5155 pmu->need_cleanup = true;
5162 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
5163 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
5164 wbinvd_on_cpu(vcpu->cpu);
5175 * is handled on the nested VM-Exit path.
5183 vcpu->arch.host_pkru = read_pkru();
5186 if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
5187 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
5188 vcpu->arch.tsc_offset_adjustment = 0;
5192 if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
5193 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
5194 rdtsc() - vcpu->arch.last_host_tsc;
5200 vcpu->arch.last_guest_tsc);
5202 if (!vcpu->arch.guest_tsc_protected)
5203 vcpu->arch.tsc_catchup = 1;
5211 * kvmclock on vcpu->cpu migration
5213 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
5215 if (vcpu->cpu != cpu)
5217 vcpu->cpu = cpu;
5225 struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
5229 gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
5232 * The vCPU can be marked preempted if and only if the VM-Exit was on
5236 * preempted if and only if the VM-Exit was due to a host interrupt.
5238 if (!vcpu->arch.at_instruction_boundary) {
5239 vcpu->stat.preemption_other++;
5243 vcpu->stat.preemption_reported++;
5244 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
5247 if (vcpu->arch.st.preempted)
5251 if (unlikely(current->mm != vcpu->kvm->mm))
5254 slots = kvm_memslots(vcpu->kvm);
5256 if (unlikely(slots->generation != ghc->generation ||
5257 gpa != ghc->gpa ||
5258 kvm_is_error_hva(ghc->hva) || !ghc->memslot))
5261 st = (struct kvm_steal_time __user *)ghc->hva;
5262 BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
5264 if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
5265 vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
5267 mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
5274 if (vcpu->preempted) {
5276 * Assume protected guests are in-kernel. Inefficient yielding
5280 vcpu->arch.preempted_in_kernel = vcpu->arch.guest_state_protected ||
5287 idx = srcu_read_lock(&vcpu->kvm->srcu);
5288 if (kvm_xen_msr_enabled(vcpu->kvm))
5292 srcu_read_unlock(&vcpu->kvm->srcu, idx);
5296 vcpu->arch.last_host_tsc = rdtsc();
5302 if (vcpu->arch.apic->guest_apic_protected)
5303 return -EINVAL;
5315 if (vcpu->arch.apic->guest_apic_protected)
5316 return -EINVAL;
5349 * instruction boundary and with no events half-injected.
5360 if (irq->irq >= KVM_NR_INTERRUPTS)
5361 return -EINVAL;
5363 if (!irqchip_in_kernel(vcpu->kvm)) {
5364 kvm_queue_interrupt(vcpu, irq->irq, false);
5370 * With in-kernel LAPIC, we only use this to inject EXTINT, so
5371 * fail for in-kernel 8259.
5373 if (pic_in_kernel(vcpu->kvm))
5374 return -ENXIO;
5376 if (vcpu->arch.pending_external_vector != -1)
5377 return -EEXIST;
5379 vcpu->arch.pending_external_vector = irq->irq;
5394 if (tac->flags)
5395 return -EINVAL;
5396 vcpu->arch.tpr_access_reporting = !!tac->enabled;
5406 r = -EINVAL;
5412 vcpu->arch.mcg_cap = mcg_cap;
5415 vcpu->arch.mcg_ctl = ~(u64)0;
5418 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
5420 vcpu->arch.mci_ctl2_banks[bank] = 0;
5433 * - none of the bits for Machine Check Exceptions are set
5434 * - both the VAL (valid) and UC (uncorrectable) bits are set
5435 * MCI_STATUS_PCC - Processor Context Corrupted
5436 * MCI_STATUS_S - Signaled as a Machine Check Exception
5437 * MCI_STATUS_AR - Software recoverable Action Required
5441 return !mce->mcg_status &&
5442 !(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) &&
5443 (mce->status & MCI_STATUS_VAL) &&
5444 (mce->status & MCI_STATUS_UC);
5449 u64 mcg_cap = vcpu->arch.mcg_cap;
5451 banks[1] = mce->status;
5452 banks[2] = mce->addr;
5453 banks[3] = mce->misc;
5454 vcpu->arch.mcg_status = mce->mcg_status;
5457 !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN))
5461 kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
5469 u64 mcg_cap = vcpu->arch.mcg_cap;
5471 u64 *banks = vcpu->arch.mce_banks;
5473 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
5474 return -EINVAL;
5476 banks += array_index_nospec(4 * mce->bank, 4 * bank_num);
5485 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
5486 vcpu->arch.mcg_ctl != ~(u64)0)
5492 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
5494 if (mce->status & MCI_STATUS_UC) {
5495 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
5501 mce->status |= MCI_STATUS_OVER;
5502 banks[2] = mce->addr;
5503 banks[3] = mce->misc;
5504 vcpu->arch.mcg_status = mce->mcg_status;
5505 banks[1] = mce->status;
5510 mce->status |= MCI_STATUS_OVER;
5511 banks[2] = mce->addr;
5512 banks[3] = mce->misc;
5513 banks[1] = mce->status;
5534 * non-exiting _injected_ exception, and a pending exiting exception.
5535 * In that case, ignore the VM-Exiting exception as it's an extension
5538 if (vcpu->arch.exception_vmexit.pending &&
5539 !vcpu->arch.exception.pending &&
5540 !vcpu->arch.exception.injected)
5541 ex = &vcpu->arch.exception_vmexit;
5543 ex = &vcpu->arch.exception;
5548 * intercepts #PF, ditto for DR6 and #DBs. If the per-VM capability,
5553 if (!vcpu->kvm->arch.exception_payload_enabled &&
5554 ex->pending && ex->has_payload)
5565 if (!kvm_exception_is_soft(ex->vector)) {
5566 events->exception.injected = ex->injected;
5567 events->exception.pending = ex->pending;
5573 if (!vcpu->kvm->arch.exception_payload_enabled)
5574 events->exception.injected |= ex->pending;
5576 events->exception.nr = ex->vector;
5577 events->exception.has_error_code = ex->has_error_code;
5578 events->exception.error_code = ex->error_code;
5579 events->exception_has_payload = ex->has_payload;
5580 events->exception_payload = ex->payload;
5582 events->interrupt.injected =
5583 vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
5584 events->interrupt.nr = vcpu->arch.interrupt.nr;
5585 events->interrupt.shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
5587 events->nmi.injected = vcpu->arch.nmi_injected;
5588 events->nmi.pending = kvm_get_nr_pending_nmis(vcpu);
5589 events->nmi.masked = kvm_x86_call(get_nmi_mask)(vcpu);
5591 /* events->sipi_vector is never valid when reporting to user space */
5594 events->smi.smm = is_smm(vcpu);
5595 events->smi.pending = vcpu->arch.smi_pending;
5596 events->smi.smm_inside_nmi =
5597 !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
5599 events->smi.latched_init = kvm_lapic_latched_init(vcpu);
5601 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
5604 if (vcpu->kvm->arch.exception_payload_enabled)
5605 events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
5606 if (vcpu->kvm->arch.triple_fault_event) {
5607 events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5608 events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
5615 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
5621 return -EINVAL;
5623 if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
5624 if (!vcpu->kvm->arch.exception_payload_enabled)
5625 return -EINVAL;
5626 if (events->exception.pending)
5627 events->exception.injected = 0;
5629 events->exception_has_payload = 0;
5631 events->exception.pending = 0;
5632 events->exception_has_payload = 0;
5635 if ((events->exception.injected || events->exception.pending) &&
5636 (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
5637 return -EINVAL;
5643 * morph the exception to a VM-Exit if appropriate. Do this only for
5644 * pending exceptions, already-injected exceptions are not subject to
5647 * pending exception, which in turn may cause a spurious VM-Exit.
5649 vcpu->arch.exception_from_userspace = events->exception.pending;
5651 vcpu->arch.exception_vmexit.pending = false;
5653 vcpu->arch.exception.injected = events->exception.injected;
5654 vcpu->arch.exception.pending = events->exception.pending;
5655 vcpu->arch.exception.vector = events->exception.nr;
5656 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
5657 vcpu->arch.exception.error_code = events->exception.error_code;
5658 vcpu->arch.exception.has_payload = events->exception_has_payload;
5659 vcpu->arch.exception.payload = events->exception_payload;
5661 vcpu->arch.interrupt.injected = events->interrupt.injected;
5662 vcpu->arch.interrupt.nr = events->interrupt.nr;
5663 vcpu->arch.interrupt.soft = events->interrupt.soft;
5664 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
5666 events->interrupt.shadow);
5668 vcpu->arch.nmi_injected = events->nmi.injected;
5669 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
5670 vcpu->arch.nmi_pending = 0;
5671 atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
5672 if (events->nmi.pending)
5675 kvm_x86_call(set_nmi_mask)(vcpu, events->nmi.masked);
5677 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
5679 vcpu->arch.apic->sipi_vector = events->sipi_vector;
5681 if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
5683 if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
5685 kvm_smm_changed(vcpu, events->smi.smm);
5688 vcpu->arch.smi_pending = events->smi.pending;
5690 if (events->smi.smm) {
5691 if (events->smi.smm_inside_nmi)
5692 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
5694 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
5698 if (events->smi.smm || events->smi.pending ||
5699 events->smi.smm_inside_nmi)
5700 return -EINVAL;
5704 if (events->smi.latched_init)
5705 set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
5707 clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
5711 if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
5712 if (!vcpu->kvm->arch.triple_fault_event)
5713 return -EINVAL;
5714 if (events->triple_fault.pending)
5730 if (vcpu->kvm->arch.has_protected_state &&
5731 vcpu->arch.guest_state_protected)
5732 return -EINVAL;
5736 BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
5737 for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
5738 dbgregs->db[i] = vcpu->arch.db[i];
5740 dbgregs->dr6 = vcpu->arch.dr6;
5741 dbgregs->dr7 = vcpu->arch.dr7;
5750 if (vcpu->kvm->arch.has_protected_state &&
5751 vcpu->arch.guest_state_protected)
5752 return -EINVAL;
5754 if (dbgregs->flags)
5755 return -EINVAL;
5757 if (!kvm_dr6_valid(dbgregs->dr6))
5758 return -EINVAL;
5759 if (!kvm_dr7_valid(dbgregs->dr7))
5760 return -EINVAL;
5762 for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
5763 vcpu->arch.db[i] = dbgregs->db[i];
5766 vcpu->arch.dr6 = dbgregs->dr6;
5767 vcpu->arch.dr7 = dbgregs->dr7;
5789 u64 supported_xcr0 = vcpu->arch.guest_supported_xcr0 |
5792 if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
5793 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
5795 fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size,
5796 supported_xcr0, vcpu->arch.pkru);
5803 return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
5804 sizeof(guest_xsave->region));
5810 if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
5811 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
5813 return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
5814 guest_xsave->region,
5816 &vcpu->arch.pkru);
5822 if (vcpu->kvm->arch.has_protected_state &&
5823 vcpu->arch.guest_state_protected)
5824 return -EINVAL;
5827 guest_xcrs->nr_xcrs = 0;
5831 guest_xcrs->nr_xcrs = 1;
5832 guest_xcrs->flags = 0;
5833 guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
5834 guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
5843 if (vcpu->kvm->arch.has_protected_state &&
5844 vcpu->arch.guest_state_protected)
5845 return -EINVAL;
5848 return -EINVAL;
5850 if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
5851 return -EINVAL;
5853 for (i = 0; i < guest_xcrs->nr_xcrs; i++)
5855 if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
5857 guest_xcrs->xcrs[i].value);
5861 r = -EINVAL;
5873 if (!vcpu->arch.pv_time.active)
5874 return -EINVAL;
5875 vcpu->arch.pvclock_set_guest_stopped_request = true;
5885 switch (attr->attr) {
5890 r = -ENXIO;
5899 u64 __user *uaddr = u64_to_user_ptr(attr->addr);
5902 switch (attr->attr) {
5904 r = -EFAULT;
5905 if (put_user(vcpu->arch.l1_tsc_offset, uaddr))
5910 r = -ENXIO;
5919 u64 __user *uaddr = u64_to_user_ptr(attr->addr);
5920 struct kvm *kvm = vcpu->kvm;
5923 switch (attr->attr) {
5929 r = -EFAULT;
5933 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
5935 matched = (vcpu->arch.virtual_tsc_khz &&
5936 kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
5937 kvm->arch.last_tsc_offset == offset);
5939 tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
5943 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
5949 r = -ENXIO;
5963 return -EFAULT;
5966 return -ENXIO;
5986 if (cap->flags)
5987 return -EINVAL;
5989 switch (cap->cap) {
5992 if (cap->args[0])
5993 return -EINVAL;
5997 if (!irqchip_in_kernel(vcpu->kvm))
5998 return -EINVAL;
5999 return kvm_hv_activate_synic(vcpu, cap->cap ==
6007 if (!kvm_x86_ops.nested_ops->enable_evmcs)
6008 return -ENOTTY;
6009 r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
6011 user_ptr = (void __user *)(uintptr_t)cap->args[0];
6014 r = -EFAULT;
6020 return -ENOTTY;
6025 return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
6029 vcpu->arch.pv_cpuid.enforce = cap->args[0];
6032 return -EINVAL;
6048 switch (reg->index) {
6051 * FIXME: If host-initiated accesses are ever exempted from
6057 return -EINVAL;
6059 reg->type = KVM_X86_REG_TYPE_MSR;
6060 reg->index = MSR_KVM_INTERNAL_GUEST_SSP;
6063 return -EINVAL;
6073 return -EINVAL;
6076 return -EFAULT;
6086 return -EFAULT;
6089 return -EINVAL;
6104 return -EFAULT;
6107 return -EINVAL;
6110 if (reg->rsvd1 || reg->rsvd2)
6111 return -EINVAL;
6113 if (reg->type == KVM_X86_REG_TYPE_KVM) {
6119 if (reg->type != KVM_X86_REG_TYPE_MSR)
6120 return -EINVAL;
6123 return -EINVAL;
6125 guard(srcu)(&vcpu->kvm->srcu);
6127 load_fpu = is_xstate_managed_msr(vcpu, reg->index);
6133 r = kvm_get_one_msr(vcpu, reg->index, user_val);
6135 r = kvm_set_one_msr(vcpu, reg->index, user_val);
6148 if (get_user(user_nr_regs, &user_list->n))
6149 return -EFAULT;
6151 if (put_user(nr_regs, &user_list->n))
6152 return -EFAULT;
6155 return -E2BIG;
6158 put_user(KVM_X86_REG_KVM(KVM_REG_GUEST_SSP), &user_list->reg[0]))
6159 return -EFAULT;
6167 struct kvm_vcpu *vcpu = filp->private_data;
6183 r = -EINVAL;
6188 r = -ENOMEM;
6194 r = -EFAULT;
6201 r = -EINVAL;
6216 r = -EFAULT;
6234 r = -EFAULT;
6237 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
6244 r = -EFAULT;
6248 cpuid_arg->entries);
6255 r = -EFAULT;
6259 cpuid_arg->entries);
6262 r = -EFAULT;
6269 int idx = srcu_read_lock(&vcpu->kvm->srcu);
6271 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6275 int idx = srcu_read_lock(&vcpu->kvm->srcu);
6277 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6290 r = -EFAULT;
6296 r = -EFAULT;
6306 r = -EINVAL;
6309 r = -EFAULT;
6312 idx = srcu_read_lock(&vcpu->kvm->srcu);
6314 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6320 r = -EFAULT;
6329 r = -EFAULT;
6340 r = -EFAULT;
6349 r = -EFAULT;
6365 r = -EFAULT;
6375 r = -EFAULT;
6384 r = -EINVAL;
6385 if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
6389 r = -ENOMEM;
6397 r = -EFAULT;
6404 int size = vcpu->arch.guest_fpu.uabi_size;
6417 int size = vcpu->arch.guest_fpu.uabi_size;
6420 r = -ENOMEM;
6428 r = -EFAULT;
6438 r = -ENOMEM;
6446 r = -EFAULT;
6466 r = -EINVAL;
6468 if (vcpu->arch.guest_tsc_protected)
6486 r = vcpu->arch.virtual_tsc_khz;
6496 r = -EFAULT;
6506 r = -EINVAL;
6507 if (!kvm_x86_ops.nested_ops->get_state)
6510 BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
6511 r = -EFAULT;
6512 if (get_user(user_data_size, &user_kvm_nested_state->size))
6515 r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
6521 if (put_user(r, &user_kvm_nested_state->size))
6522 r = -EFAULT;
6524 r = -E2BIG;
6536 r = -EINVAL;
6537 if (!kvm_x86_ops.nested_ops->set_state)
6540 r = -EFAULT;
6544 r = -EINVAL;
6559 idx = srcu_read_lock(&vcpu->kvm->srcu);
6560 r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
6561 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6573 r = -EFAULT;
6578 r = -EFAULT;
6584 r = -EFAULT;
6592 r = -EINVAL;
6593 if (vcpu->kvm->arch.has_protected_state &&
6594 vcpu->arch.guest_state_protected)
6598 r = -ENOMEM;
6602 r = -EFAULT;
6609 r = -EINVAL;
6610 if (vcpu->kvm->arch.has_protected_state &&
6611 vcpu->arch.guest_state_protected)
6629 r = -ENOTTY;
6635 r = -EINVAL;
6653 if (addr > (unsigned int)(-3 * PAGE_SIZE))
6654 return -EINVAL;
6669 return -EINVAL;
6671 mutex_lock(&kvm->slots_lock);
6674 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
6676 mutex_unlock(&kvm->slots_lock);
6686 * on all VM-Exits, thus we only need to kick running vCPUs to force a
6687 * VM-Exit.
6692 if (!kvm->arch.cpu_dirty_log_size)
6704 if (cap->flags)
6705 return -EINVAL;
6707 switch (cap->cap) {
6709 r = -EINVAL;
6710 if (cap->args[0] & ~kvm_caps.supported_quirks)
6714 kvm->arch.disabled_quirks |= cap->args[0] & kvm_caps.supported_quirks;
6718 mutex_lock(&kvm->lock);
6719 r = -EINVAL;
6720 if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
6722 r = -EEXIST;
6725 if (kvm->created_vcpus)
6729 kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
6730 kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
6734 mutex_unlock(&kvm->lock);
6738 r = -EINVAL;
6739 if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
6742 if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
6743 kvm->arch.x2apic_format = true;
6744 if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
6745 kvm->arch.x2apic_broadcast_quirk_disabled = true;
6750 r = -EINVAL;
6751 if (cap->args[0] & ~kvm_get_allowed_disable_exits())
6754 mutex_lock(&kvm->lock);
6755 if (kvm->created_vcpus)
6758 #define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \
6763 (cap->args[0] & ~(KVM_X86_DISABLE_EXITS_PAUSE |
6767 kvm_disable_exits(kvm, cap->args[0]);
6770 mutex_unlock(&kvm->lock);
6773 kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
6777 kvm->arch.exception_payload_enabled = cap->args[0];
6781 kvm->arch.triple_fault_event = cap->args[0];
6785 r = -EINVAL;
6786 if (cap->args[0] & ~KVM_MSR_EXIT_REASON_VALID_MASK)
6788 kvm->arch.user_space_msr_mask = cap->args[0];
6792 r = -EINVAL;
6793 if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE)
6796 if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) &&
6797 (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT))
6801 cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)
6802 kvm->arch.bus_lock_detection_enabled = true;
6809 r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
6816 kvm->arch.sgx_provisioning_allowed = true;
6818 r = -EINVAL;
6823 r = -EINVAL;
6827 r = kvm_x86_call(vm_copy_enc_context_from)(kvm, cap->args[0]);
6830 r = -EINVAL;
6834 r = kvm_x86_call(vm_move_enc_context_from)(kvm, cap->args[0]);
6837 if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
6838 r = -EINVAL;
6841 kvm->arch.hypercall_exit_enabled = cap->args[0];
6845 r = -EINVAL;
6846 if (cap->args[0] & ~1)
6848 kvm->arch.exit_on_emulation_error = cap->args[0];
6852 r = -EINVAL;
6853 if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK))
6856 mutex_lock(&kvm->lock);
6857 if (!kvm->created_vcpus) {
6858 kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE);
6861 mutex_unlock(&kvm->lock);
6864 r = -EINVAL;
6865 if (cap->args[0] > KVM_MAX_VCPU_IDS)
6868 mutex_lock(&kvm->lock);
6869 if (kvm->arch.bsp_vcpu_id > cap->args[0]) {
6871 } else if (kvm->arch.max_vcpu_ids == cap->args[0]) {
6873 } else if (!kvm->arch.max_vcpu_ids) {
6874 kvm->arch.max_vcpu_ids = cap->args[0];
6877 mutex_unlock(&kvm->lock);
6880 r = -EINVAL;
6881 if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS)
6885 if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED))
6887 mutex_lock(&kvm->lock);
6888 if (!kvm->created_vcpus) {
6889 kvm->arch.notify_window = cap->args[0] >> 32;
6890 kvm->arch.notify_vmexit_flags = (u32)cap->args[0];
6893 mutex_unlock(&kvm->lock);
6896 r = -EINVAL;
6907 * this must use capable(), not ns_capable().
6910 r = -EPERM;
6914 if (cap->args[0])
6917 mutex_lock(&kvm->lock);
6918 if (!kvm->created_vcpus) {
6919 kvm->arch.disable_nx_huge_pages = true;
6922 mutex_unlock(&kvm->lock);
6925 u64 bus_cycle_ns = cap->args[0];
6932 r = -EINVAL;
6938 mutex_lock(&kvm->lock);
6940 r = -ENXIO;
6941 else if (kvm->created_vcpus)
6942 r = -EINVAL;
6944 kvm->arch.apic_bus_cycle_ns = bus_cycle_ns;
6945 mutex_unlock(&kvm->lock);
6949 r = -EINVAL;
6963 msr_filter->default_allow = default_allow;
6974 for (i = 0; i < msr_filter->count; i++)
6975 kfree(msr_filter->ranges[i].bitmap);
6986 if (!user_range->nmsrs)
6989 if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK)
6990 return -EINVAL;
6992 if (!user_range->flags)
6993 return -EINVAL;
6995 bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
6997 return -EINVAL;
6999 bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
7003 msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
7004 .flags = user_range->flags,
7005 .base = user_range->base,
7006 .nmsrs = user_range->nmsrs,
7010 msr_filter->count++;
7023 if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK)
7024 return -EINVAL;
7026 for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
7027 empty &= !filter->ranges[i].nmsrs;
7029 default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY);
7031 return -EINVAL;
7035 return -ENOMEM;
7037 for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) {
7038 r = kvm_add_msr_filter(new_filter, &filter->ranges[i]);
7045 mutex_lock(&kvm->lock);
7046 old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter,
7047 mutex_is_locked(&kvm->lock));
7048 mutex_unlock(&kvm->lock);
7049 synchronize_srcu(&kvm->srcu);
7082 struct kvm *kvm = filp->private_data;
7083 long r = -ENOTTY;
7094 return -EFAULT;
7102 .flags = cr->flags,
7103 .nmsrs = cr->nmsrs,
7104 .base = cr->base,
7105 .bitmap = (__u8 *)(ulong)cr->bitmap,
7152 return -EFAULT;
7159 struct kvm_arch *ka = &kvm->arch;
7164 return -EFAULT;
7171 return -EINVAL;
7179 * in use, we use master_kernel_ns + kvmclock_offset to set
7180 * unsigned 'system_time' so if we use get_kvmclock_ns() (which
7191 data.clock += now_real_ns - data.realtime;
7194 if (ka->use_master_clock)
7195 now_raw_ns = ka->master_kernel_ns;
7198 ka->kvmclock_offset = data.clock - now_raw_ns;
7206 struct kvm_vcpu *vcpu = filp->private_data;
7213 return -ENOIOCTLCMD;
7218 struct kvm *kvm = filp->private_data;
7220 int r = -ENOTTY;
7224 * This union makes it completely explicit to gcc-3.x
7242 mutex_lock(&kvm->lock);
7243 r = -EINVAL;
7244 if (kvm->created_vcpus)
7246 r = -EFAULT;
7251 mutex_unlock(&kvm->lock);
7259 mutex_lock(&kvm->lock);
7261 r = -EEXIST;
7266 * Disallow an in-kernel I/O APIC if the VM has protected EOIs,
7268 * emulate level-triggered interrupts.
7270 r = -ENOTTY;
7271 if (kvm->arch.has_protected_eoi)
7274 r = -EINVAL;
7275 if (kvm->created_vcpus)
7294 /* Write kvm->irq_routing before enabling irqchip_in_kernel. */
7296 kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
7299 mutex_unlock(&kvm->lock);
7306 r = -EFAULT;
7311 mutex_lock(&kvm->lock);
7312 r = -EEXIST;
7313 if (kvm->arch.vpit)
7315 r = -ENOENT;
7318 r = -ENOMEM;
7319 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
7320 if (kvm->arch.vpit)
7323 mutex_unlock(&kvm->lock);
7335 r = -ENXIO;
7341 r = -EFAULT;
7359 r = -ENXIO;
7368 r = -EFAULT;
7371 r = -ENXIO;
7372 if (!kvm->arch.vpit)
7377 r = -EFAULT;
7384 r = -EFAULT;
7387 mutex_lock(&kvm->lock);
7388 r = -ENXIO;
7389 if (!kvm->arch.vpit)
7393 mutex_unlock(&kvm->lock);
7397 r = -ENXIO;
7398 if (!kvm->arch.vpit)
7403 r = -EFAULT;
7410 r = -EFAULT;
7413 mutex_lock(&kvm->lock);
7414 r = -ENXIO;
7415 if (!kvm->arch.vpit)
7419 mutex_unlock(&kvm->lock);
7424 r = -EFAULT;
7427 r = -ENXIO;
7428 if (!kvm->arch.vpit)
7436 mutex_lock(&kvm->lock);
7437 if (kvm->created_vcpus)
7438 r = -EBUSY;
7440 (kvm->arch.max_vcpu_ids && arg > kvm->arch.max_vcpu_ids))
7441 r = -EINVAL;
7443 kvm->arch.bsp_vcpu_id = arg;
7444 mutex_unlock(&kvm->lock);
7449 r = -EFAULT;
7458 r = -EFAULT;
7463 r = -EFAULT;
7469 r = -EFAULT;
7478 r = -EFAULT;
7494 r = -EINVAL;
7504 mutex_lock(&kvm->lock);
7505 if (!kvm->created_vcpus) {
7506 WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
7509 mutex_unlock(&kvm->lock);
7513 r = READ_ONCE(kvm->arch.default_tsc_khz);
7517 r = -ENOTTY;
7526 r = -EFAULT;
7530 r = -ENOTTY;
7540 r = -EFAULT;
7544 r = -ENOTTY;
7555 r = -EFAULT;
7570 return -EFAULT;
7576 r = -ENOTTY;
7636 (msr_index - MSR_IA32_RTIT_ADDR0_A >=
7641 MSR_ARCH_PERFMON_PERFCTR0 + KVM_MAX_NR_GP_COUNTERS - 1:
7642 if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >=
7647 MSR_ARCH_PERFMON_EVENTSEL0 + KVM_MAX_NR_GP_COUNTERS - 1:
7648 if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >=
7653 MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_MAX_NR_FIXED_COUNTERS - 1:
7654 if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >=
7742 !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
7747 len -= n;
7762 !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
7769 len -= n;
7791 struct kvm_mmu *mmu = vcpu->arch.mmu;
7796 /* NPT walks are always user-walks */
7798 t_gpa = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
7806 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7809 return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
7816 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7820 return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
7828 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7830 return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception);
7837 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7842 gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
7843 unsigned offset = addr & (PAGE_SIZE-1);
7844 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
7856 bytes -= toread;
7870 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7876 gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK,
7881 offset = addr & (PAGE_SIZE-1);
7883 bytes = (unsigned)PAGE_SIZE - offset;
7929 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7934 gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
7935 unsigned offset = addr & (PAGE_SIZE-1);
7936 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
7947 bytes -= towrite;
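
/*
 * Illustrative sketch, not part of x86.c: the page-at-a-time copy pattern
 * used by the read/write helpers above, reduced to ordinary memory.  Each
 * iteration handles only up to the next page boundary, because every page
 * of the virtual range may translate to a different physical page.  The
 * identity-mapped translate() helper and the names are invented for the
 * example.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define SKETCH_PAGE_SIZE 4096u

static void *translate(uint8_t *backing, uint64_t addr)
{
	return backing + addr;	/* stands in for gva_to_gpa() */
}

static void copy_from_virt(uint8_t *backing, uint64_t addr, void *dst,
			   size_t bytes)
{
	uint8_t *out = dst;

	while (bytes) {
		size_t offset = addr & (SKETCH_PAGE_SIZE - 1);
		size_t chunk = SKETCH_PAGE_SIZE - offset;

		if (chunk > bytes)
			chunk = bytes;

		memcpy(out, translate(backing, addr), chunk);

		addr += chunk;
		out += chunk;
		bytes -= chunk;
	}
}
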
8035 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
8045 !permission_fault(vcpu, vcpu->arch.walk_mmu,
8046 vcpu->arch.mmio_access, 0, access))) {
8047 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
8048 (gva & (PAGE_SIZE - 1));
8053 *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
8056 return -1;
8087 if (vcpu->mmio_read_completed) {
8089 vcpu->mmio_fragments[0].gpa, val);
8090 vcpu->mmio_read_completed = 0;
8125 struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
8127 memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
8153 bool write = ops->write;
8155 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8159 * If the GPA is present, use it to avoid the GVA to GPA table walk.
8164 if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
8165 (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
8166 gpa = ctxt->gpa_val;
8174 if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
8180 handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
8185 bytes -= handled;
8188 WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
8189 frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
8190 frag->gpa = gpa;
8191 frag->data = val;
8192 frag->len = bytes;
8206 if (ops->read_write_prepare &&
8207 ops->read_write_prepare(vcpu, val, bytes))
8210 vcpu->mmio_nr_fragments = 0;
8213 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
8216 now = -addr & ~PAGE_MASK;
8223 if (ctxt->mode != X86EMUL_MODE_PROT64)
8226 bytes -= now;
8234 if (!vcpu->mmio_nr_fragments)
8237 gpa = vcpu->mmio_fragments[0].gpa;
8239 vcpu->mmio_needed = 1;
8240 vcpu->mmio_cur_fragment = 0;
8242 vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
8243 vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
8244 vcpu->run->exit_reason = KVM_EXIT_MMIO;
8245 vcpu->run->mmio.phys_addr = gpa;
8247 return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
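
/*
 * Illustrative sketch, not part of x86.c: how an access that straddles a
 * page boundary can be cut into per-page pieces, the way the emulator above
 * fills its mmio_fragments array before exiting to userspace.  The fragment
 * cap and all names are invented for the example.
 */
#include <stddef.h>
#include <stdint.h>

#define SKETCH_PAGE_SIZE 4096u
#define MAX_FRAGS 2

struct frag {
	uint64_t gpa;
	uint8_t *data;
	size_t len;
};

/* Returns how many fragments were written, at most MAX_FRAGS. */
static int split_into_fragments(uint64_t gpa, uint8_t *data, size_t bytes,
				struct frag *frags)
{
	int n = 0;

	while (bytes && n < MAX_FRAGS) {
		/*
		 * Bytes left in the current page; the code above computes the
		 * first piece as "-addr & ~PAGE_MASK" once it knows the
		 * access crosses a page boundary.
		 */
		size_t in_page = SKETCH_PAGE_SIZE - (gpa & (SKETCH_PAGE_SIZE - 1));
		size_t now = bytes < in_page ? bytes : in_page;

		frags[n].gpa = gpa;
		frags[n].data = data;
		frags[n].len = now;
		n++;

		gpa += now;
		data += now;
		bytes -= now;
	}
	return n;
}
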
8287 if (bytes > 8 || (bytes & (bytes - 1)))
8301 page_line_mask = ~(cache_line_size() - 1);
8305 if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
8363 WARN_ON_ONCE(vcpu->arch.pio.count);
8379 memset(data, 0, size * (count - i));
8388 vcpu->arch.pio.port = port;
8389 vcpu->arch.pio.in = in;
8390 vcpu->arch.pio.count = count;
8391 vcpu->arch.pio.size = size;
8394 memset(vcpu->arch.pio_data, 0, size * count);
8396 memcpy(vcpu->arch.pio_data, data, size * count);
8398 vcpu->run->exit_reason = KVM_EXIT_IO;
8399 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
8400 vcpu->run->io.size = size;
8401 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
8402 vcpu->run->io.count = count;
8403 vcpu->run->io.port = port;
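
/*
 * Illustrative sketch, not part of x86.c: how a user-space VMM typically
 * consumes the KVM_EXIT_IO state populated above.  The data for all "count"
 * repetitions lives inside the shared run page at io.data_offset.  This is
 * only an outline, assuming <linux/kvm.h>; the device model behind the
 * comments is left out.
 */
#include <stdint.h>
#include <linux/kvm.h>

static void handle_io_exit(struct kvm_run *run)
{
	uint8_t *data = (uint8_t *)run + run->io.data_offset;
	uint32_t i;

	for (i = 0; i < run->io.count; i++, data += run->io.size) {
		if (run->io.direction == KVM_EXIT_IO_OUT) {
			/* forward run->io.size bytes at 'data' to run->io.port */
		} else {
			/* fill run->io.size bytes at 'data' from run->io.port */
		}
	}
	/* The VMM then calls KVM_RUN again and KVM completes the instruction. */
}
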
8419 int size = vcpu->arch.pio.size;
8420 unsigned int count = vcpu->arch.pio.count;
8421 memcpy(val, vcpu->arch.pio_data, size * count);
8422 trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data);
8423 vcpu->arch.pio.count = 0;
8431 if (vcpu->arch.pio.count) {
8479 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
8480 wbinvd_on_cpus_mask(vcpu->arch.wbinvd_dirty_mask);
8482 cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
8516 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
8529 value = vcpu->arch.cr2;
8558 vcpu->arch.cr2 = val;
8571 res = -1;
8632 desc->type = var.type;
8633 desc->s = var.s;
8634 desc->dpl = var.dpl;
8635 desc->p = var.present;
8636 desc->avl = var.avl;
8637 desc->l = var.l;
8638 desc->d = var.db;
8639 desc->g = var.g;
8657 if (desc->g)
8659 var.type = desc->type;
8660 var.dpl = desc->dpl;
8661 var.db = desc->d;
8662 var.s = desc->s;
8663 var.l = desc->l;
8664 var.g = desc->g;
8665 var.avl = desc->avl;
8666 var.present = desc->p;
8724 * Treat emulator accesses to the current shadow stack pointer as host-
8727 * so the index is fully KVM-controlled.
8748 emul_to_vcpu(ctxt)->arch.halt_request = 1;
8756 &ctxt->exception);
8823 *xcr = emul_to_vcpu(ctxt)->arch.xcr0;
8834 struct kvm *kvm = emul_to_vcpu(ctxt)->kvm;
8836 if (!kvm->vm_bugged)
8926 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8928 if (ctxt->exception.vector == PF_VECTOR)
8929 kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
8930 else if (ctxt->exception.error_code_valid)
8931 kvm_queue_exception_e(vcpu, ctxt->exception.vector,
8932 ctxt->exception.error_code);
8934 kvm_queue_exception(vcpu, ctxt->exception.vector);
8947 ctxt->vcpu = vcpu;
8948 ctxt->ops = &emulate_ops;
8949 vcpu->arch.emulate_ctxt = ctxt;
8956 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8961 ctxt->gpa_available = false;
8962 ctxt->eflags = kvm_get_rflags(vcpu);
8963 ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
8965 ctxt->eip = kvm_rip_read(vcpu);
8966 ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
8967 (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
8971 ctxt->interruptibility = 0;
8972 ctxt->have_exception = false;
8973 ctxt->exception.vector = -1;
8974 ctxt->perm_ok = false;
8977 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
8982 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8987 ctxt->op_bytes = 2;
8988 ctxt->ad_bytes = 2;
8989 ctxt->_eip = ctxt->eip + inc_eip;
8995 ctxt->eip = ctxt->_eip;
8996 kvm_rip_write(vcpu, ctxt->eip);
8997 kvm_set_rflags(vcpu, ctxt->eflags);
9005 struct kvm_run *run = vcpu->run;
9018 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
9019 run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
9031 run->emulation_failure.flags = 0;
9034 BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) +
9035 sizeof(run->emulation_failure.insn_bytes) != 16));
9037 run->emulation_failure.flags |=
9039 run->emulation_failure.insn_size = insn_size;
9040 memset(run->emulation_failure.insn_bytes, 0x90,
9041 sizeof(run->emulation_failure.insn_bytes));
9042 memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
9045 memcpy(&run->internal.data[info_start], info, sizeof(info));
9046 memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data,
9049 run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata;
9054 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
9056 prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data,
9057 ctxt->fetch.end - ctxt->fetch.data);
9076 struct kvm_run *run = vcpu->run;
9083 run->internal.data[ndata++] = info2;
9084 run->internal.data[ndata++] = reason;
9085 run->internal.data[ndata++] = info1;
9086 run->internal.data[ndata++] = gpa;
9087 run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
9089 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
9090 run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
9091 run->internal.ndata = ndata;
9099 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
9100 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
9101 vcpu->run->internal.ndata = 2;
9102 vcpu->run->internal.data[0] = exit_reason;
9103 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
9109 struct kvm *kvm = vcpu->kvm;
9111 ++vcpu->stat.insn_emulation_fail;
9119 if (kvm->arch.exit_on_emulation_error ||
9148 * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and
9157 * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
9158 * guest to let the CPU re-execute the instruction in the hope that the
9193 struct kvm_run *kvm_run = vcpu->run;
9195 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
9196 kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW;
9197 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
9198 kvm_run->debug.arch.exception = DB_VECTOR;
9199 kvm_run->exit_reason = KVM_EXIT_DEBUG;
9269 if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
9270 (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
9271 struct kvm_run *kvm_run = vcpu->run;
9274 vcpu->arch.guest_debug_dr7,
9275 vcpu->arch.eff_db);
9278 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
9279 kvm_run->debug.arch.pc = eip;
9280 kvm_run->debug.arch.exception = DB_VECTOR;
9281 kvm_run->exit_reason = KVM_EXIT_DEBUG;
9287 if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
9291 vcpu->arch.dr7,
9292 vcpu->arch.db);
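
/*
 * Illustrative sketch, not part of x86.c: the DR7 matching that the
 * breakpoint checks above rely on, reduced to plain integers.  For
 * breakpoint i, DR7 bits 2*i and 2*i+1 are the local/global enables and
 * bits 16+4*i..17+4*i select the R/W type (0 means instruction fetch).
 * The helper is invented for the example and models exact-address matches
 * only, ignoring the length field.
 */
#include <stdint.h>

/* Returns a DR6-style bitmask of the breakpoints that hit. */
static uint32_t hw_bp_hits(uint64_t addr, uint32_t type, uint32_t dr7,
			   const uint64_t db[4])
{
	uint32_t hits = 0;

	for (int i = 0; i < 4; i++) {
		uint32_t enable = (dr7 >> (i * 2)) & 0x3;	/* L/G enables */
		uint32_t rw = (dr7 >> (16 + i * 4)) & 0x3;	/* access type */

		if (enable && rw == type && db[i] == addr)
			hits |= 1u << i;
	}
	return hits;
}
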
9306 switch (ctxt->opcode_len) {
9308 switch (ctxt->b) {
9325 switch (ctxt->b) {
9340 switch (ctxt->b) {
9344 return vector == ctxt->src.val;
9355 * (and wrong) when emulating on an intercepted fault-like exception[*], as
9365 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
9373 ++vcpu->stat.insn_emulation;
9383 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
9416 * are fault-like and are higher priority than any faults on
9434 if (ctxt->have_exception &&
9437 * #UD should result in just EMULATION_FAILED, and trap-like
9440 WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
9441 exception_type(ctxt->exception.vector) == EXCPT_TRAP);
9457 * use *only* by vendor callbacks for kvm_skip_emulated_instruction().
9459 * injecting single-step #DBs.
9466 if (ctxt->mode != X86EMUL_MODE_PROT64)
9467 ctxt->eip = (u32)ctxt->_eip;
9469 ctxt->eip = ctxt->_eip;
9476 kvm_rip_write(vcpu, ctxt->eip);
9477 if (ctxt->eflags & X86_EFLAGS_RF)
9478 kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
9483 * If emulation was caused by a write-protection #PF on a non-page_table
9495 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
9496 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
9503 ctxt->exception.address = cr2_or_gpa;
9506 if (vcpu->arch.mmu->root_role.direct) {
9507 ctxt->gpa_available = true;
9508 ctxt->gpa_val = cr2_or_gpa;
9512 ctxt->exception.address = 0;
9517 * L2, unless KVM is re-emulating a previously decoded instruction,
9535 if (ctxt->have_exception) {
9536 WARN_ON_ONCE(vcpu->mmio_needed && !vcpu->mmio_is_write);
9537 vcpu->mmio_needed = false;
9540 } else if (vcpu->arch.pio.count) {
9541 if (!vcpu->arch.pio.in) {
9542 /* FIXME: return into emulator if single-stepping. */
9543 vcpu->arch.pio.count = 0;
9546 vcpu->arch.complete_userspace_io = complete_emulated_pio;
9549 } else if (vcpu->mmio_needed) {
9550 ++vcpu->stat.mmio_exits;
9552 if (!vcpu->mmio_is_write)
9555 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
9556 } else if (vcpu->arch.complete_userspace_io) {
9567 toggle_interruptibility(vcpu, ctxt->interruptibility);
9568 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
9571 * Note, EXCPT_DB is assumed to be fault-like as the emulator
9573 * of which are fault-like.
9575 if (!ctxt->have_exception ||
9576 exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
9578 if (ctxt->is_branch)
9580 kvm_rip_write(vcpu, ctxt->eip);
9581 if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
9584 __kvm_set_rflags(vcpu, ctxt->eflags);
9593 if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
9596 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
9616 vcpu->arch.pio.count = 0;
9622 vcpu->arch.pio.count = 0;
9624 if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)))
9644 kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
9645 vcpu->arch.complete_userspace_io =
9649 vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
9650 vcpu->arch.complete_userspace_io = complete_fast_pio_out;
9660 BUG_ON(vcpu->arch.pio.count != 1);
9662 if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) {
9663 vcpu->arch.pio.count = 0;
9668 val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
9691 vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
9692 vcpu->arch.complete_userspace_io = complete_fast_pio_in;
9723 khz = freq->new;
9744 /* TSC frequency always matches when on Hyper-V */
9812 if (vcpu->cpu != cpu)
9815 if (vcpu->cpu != raw_smp_processor_id())
9821 if (freq->old < freq->new && send_ipi) {
9844 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
9846 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
9849 for_each_cpu(cpu, freq->policy->cpus)
9877 if (policy->cpuinfo.max_freq)
9878 max_tsc_khz = policy->cpuinfo.max_freq;
9932 * Disable master clock if host does not trust, or does not use,
9936 if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
9949 memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
9959 #include <asm/kvm-x86-ops.h>
9962 kvm_pmu_ops_update(ops->pmu_ops);
9980 return -EIO;
9999 return -EEXIST;
10009 return -EOPNOTSUPP;
10014 return -EOPNOTSUPP;
10027 return -EIO;
10039 return -EIO;
10047 return -ENOMEM;
10072 kvm_init_pmu_capability(ops->pmu_ops);
10079 r = ops->hardware_setup();
10101 if (pi_inject_timer == -1)
10110 kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
10203 return -KVM_EOPNOTSUPP;
10206 * When tsc is in permanent catchup mode guests won't be able to use
10209 if (vcpu->arch.tsc_always_catchup)
10210 return -KVM_EOPNOTSUPP;
10213 return -KVM_EOPNOTSUPP;
10222 if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
10224 ret = -KVM_EFAULT;
10233 * @apicid - apicid of vcpu to be kicked.
10253 return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
10259 ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
10287 set_or_clear_apicv_inhibit(&kvm->arch.apicv_inhibit_reasons, reason, true);
10289 init_rwsem(&kvm->arch.apicv_update_lock);
10297 vcpu->stat.directed_yield_attempted++;
10303 map = rcu_dereference(vcpu->kvm->arch.apic_map);
10305 if (likely(map) && dest_id <= map->max_apic_id) {
10306 dest_id = array_index_nospec(dest_id, map->max_apic_id + 1);
10307 if (map->phys_map[dest_id])
10308 target = map->phys_map[dest_id]->vcpu;
10313 if (!target || !READ_ONCE(target->ready))
10323 vcpu->stat.directed_yield_successful++;
10331 u64 ret = vcpu->run->hypercall.ret;
10350 ++vcpu->stat.hypercalls;
10363 ret = -KVM_EPERM;
10367 ret = -KVM_ENOSYS;
10377 kvm_pv_kick_cpu_op(vcpu->kvm, a1);
10390 ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
10402 ret = -KVM_ENOSYS;
10403 if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE))
10408 ret = -KVM_EINVAL;
10412 vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
10413 vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
10415 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
10416 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
10418 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
10420 vcpu->run->hypercall.ret = 0;
10421 vcpu->run->hypercall.args[0] = gpa;
10422 vcpu->run->hypercall.args[1] = npages;
10423 vcpu->run->hypercall.args[2] = attrs;
10424 vcpu->run->hypercall.flags = 0;
10426 vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE;
10428 WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
10429 vcpu->arch.complete_userspace_io = complete_hypercall;
10433 ret = -KVM_ENOSYS;
10438 vcpu->run->hypercall.ret = ret;
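
/*
 * Illustrative sketch, not part of x86.c: the user-space side of the
 * KVM_EXIT_HYPERCALL path set up above for KVM_HC_MAP_GPA_RANGE.  A VMM is
 * expected to act on nr/args, store its result in run->hypercall.ret and
 * call KVM_RUN again, at which point the completion callback installed
 * above propagates the value back to the guest.  This assumes the UAPI
 * headers below; map_gpa_range() is invented for the example.
 */
#include <stdint.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>

static uint64_t map_gpa_range(uint64_t gpa, uint64_t npages, uint64_t attrs)
{
	/* Update the VMM's shared/private view of [gpa, gpa + npages); elided. */
	(void)gpa; (void)npages; (void)attrs;
	return 0;
}

static void handle_hypercall_exit(struct kvm_run *run)
{
	if (run->hypercall.nr == KVM_HC_MAP_GPA_RANGE)
		run->hypercall.ret = map_gpa_range(run->hypercall.args[0],
						   run->hypercall.args[1],
						   run->hypercall.args[2]);
}
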
10445 if (kvm_xen_hypercall_enabled(vcpu->kvm))
10466 if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
10467 ctxt->exception.error_code_valid = false;
10468 ctxt->exception.vector = UD_VECTOR;
10469 ctxt->have_exception = true;
10476 &ctxt->exception);
10481 return vcpu->run->request_interrupt_window &&
10482 likely(!pic_in_kernel(vcpu->kvm));
10485 /* Called within kvm->srcu read side. */
10488 struct kvm_run *kvm_run = vcpu->run;
10490 kvm_run->if_flag = kvm_x86_call(get_if_flag)(vcpu);
10491 kvm_run->cr8 = kvm_get_cr8(vcpu);
10492 kvm_run->apic_base = vcpu->arch.apic_base;
10494 kvm_run->ready_for_interrupt_injection =
10495 pic_in_kernel(vcpu->kvm) ||
10499 kvm_run->flags |= KVM_RUN_X86_SMM;
10501 kvm_run->flags |= KVM_RUN_X86_GUEST_MODE;
10514 if (vcpu->arch.apic->apicv_active)
10517 if (!vcpu->arch.apic->vapic_addr)
10520 max_irr = -1;
10522 if (max_irr != -1)
10534 kvm_x86_ops.nested_ops->triple_fault(vcpu);
10538 return kvm_x86_ops.nested_ops->check_events(vcpu);
10547 * is injected as intercepted #PF VM-Exits for AMD's Paged Real Mode do
10550 vcpu->arch.exception.has_error_code &= is_protmode(vcpu);
10552 trace_kvm_inj_exception(vcpu->arch.exception.vector,
10553 vcpu->arch.exception.has_error_code,
10554 vcpu->arch.exception.error_code,
10555 vcpu->arch.exception.injected);
10565 * injected as part of a previous VM-Enter, but weren't successfully delivered
10566 * and need to be re-injected.
10571 * also be able to re-inject NMIs and IRQs in the middle of an instruction.
10572 * I.e. for exceptions and re-injected events, NOT invoking this on instruction
10577 * instruction boundaries for asynchronous events. However, because VM-Exits
10583 * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip
10606 * Process nested events first, as nested VM-Exit supersedes event
10607 * re-injection. If there's an event queued for re-injection, it will
10608 * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit.
10616 * Re-inject exceptions and events *especially* if immediate entry+exit
10620 * Don't re-inject an NMI or interrupt if there is a pending exception.
10629 * as the exception "occurred" before the exit to userspace. Trap-like
10631 * And while fault-like exceptions, e.g. #GP and #PF, are the lowest
10634 * Thus a pending fault-like exception means the fault occurred on the
10638 if (vcpu->arch.exception.injected)
10642 else if (vcpu->arch.nmi_injected)
10644 else if (vcpu->arch.interrupt.injected)
10648 * Exceptions that morph to VM-Exits are handled above, and pending
10649 * exceptions on top of injected exceptions that do not VM-Exit should
10652 WARN_ON_ONCE(vcpu->arch.exception.injected &&
10653 vcpu->arch.exception.pending);
10657 * nested VM-Enter or event re-injection so that a different pending
10660 * Otherwise, continue processing events even if VM-Exit occurred. The
10661 * VM-Exit will have cleared exceptions that were meant for L2, but
10668 * A pending exception VM-Exit should either result in nested VM-Exit
10669 * or force an immediate re-entry and exit to/from L2, and exception
10670 * VM-Exits cannot be injected (flag should _never_ be set).
10672 WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected ||
10673 vcpu->arch.exception_vmexit.pending);
10677 * to re-inject a previous event. See above comments on re-injecting
10682 if (vcpu->arch.exception.pending) {
10684 * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
10685 * value pushed on the stack. Trap-like exception and all #DBs
10686 * leave RF as-is (KVM follows Intel's behavior in this regard;
10691 * fault-like. They do _not_ set RF, a la code breakpoints.
10693 if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT)
10697 if (vcpu->arch.exception.vector == DB_VECTOR) {
10698 kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception);
10699 if (vcpu->arch.dr7 & DR7_GD) {
10700 vcpu->arch.dr7 &= ~DR7_GD;
10707 vcpu->arch.exception.pending = false;
10708 vcpu->arch.exception.injected = true;
10714 if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ)
10719 * due to architectural conditions (e.g. IF=0) a window-open exit
10720 * will re-request KVM_REQ_EVENT. Sometimes however an event is pending
10726 * The kvm_x86_ops hooks communicate this by returning -EBUSY.
10729 if (vcpu->arch.smi_pending) {
10731 -EBUSY;
10735 vcpu->arch.smi_pending = false;
10736 ++vcpu->arch.smi_count;
10744 if (vcpu->arch.nmi_pending) {
10746 -EBUSY;
10750 --vcpu->arch.nmi_pending;
10751 vcpu->arch.nmi_injected = true;
10756 if (vcpu->arch.nmi_pending)
10762 -EBUSY;
10768 if (!WARN_ON_ONCE(irq == -1)) {
10779 kvm_x86_ops.nested_ops->has_events &&
10780 kvm_x86_ops.nested_ops->has_events(vcpu, true))
10785 * is done emulating and should only propagate the to-be-injected event
10787 * infinite loop as KVM will bail from VM-Enter to inject the pending
10795 WARN_ON_ONCE(vcpu->arch.exception.pending ||
10796 vcpu->arch.exception_vmexit.pending);
10800 if (r == -EBUSY) {
10821 if (kvm_x86_call(get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
10828 * tracked in vcpu->arch.nmi_pending.
10831 limit--;
10833 vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
10834 vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
10836 if (vcpu->arch.nmi_pending &&
10838 vcpu->arch.nmi_pending--;
10840 if (vcpu->arch.nmi_pending)
10847 return vcpu->arch.nmi_pending +
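
/*
 * Illustrative sketch, not part of x86.c: the NMI collapsing rule applied
 * above, without the atomics.  Real hardware can hold one NMI in service
 * plus one latched, so at most two survive; if one is already masked or
 * mid-injection, only one more may stay pending.  The helper name is
 * invented for the example.
 */
static unsigned int collapse_nmis(unsigned int pending, unsigned int queued,
				  int nmi_masked_or_injected)
{
	unsigned int limit = nmi_masked_or_injected ? 1 : 2;

	pending += queued;	/* drain the producer-side counter */
	return pending < limit ? pending : limit;
}
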
10864 struct kvm_lapic *apic = vcpu->arch.apic;
10870 down_read(&vcpu->kvm->arch.apicv_update_lock);
10877 if (apic->apicv_active == activate)
10880 apic->apicv_active = activate;
10890 if (!apic->apicv_active)
10895 up_read(&vcpu->kvm->arch.apicv_update_lock);
10909 * this case so that KVM can use the AVIC doorbell to inject interrupts
10915 if (apic_x2apic_mode(vcpu->arch.apic) &&
10927 lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
10932 old = new = kvm->arch.apicv_inhibit_reasons;
10950 kvm->arch.apicv_inhibit_reasons = new;
10953 int idx = srcu_read_lock(&kvm->srcu);
10956 srcu_read_unlock(&kvm->srcu, idx);
10959 kvm->arch.apicv_inhibit_reasons = new;
10969 down_write(&kvm->arch.apicv_update_lock);
10971 up_write(&kvm->arch.apicv_update_lock);
10980 bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
10981 vcpu->arch.highest_stale_pending_ioapic_eoi = -1;
10985 if (irqchip_split(vcpu->kvm))
10986 kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
10988 else if (ioapic_in_kernel(vcpu->kvm))
10989 kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
10993 vcpu->arch.load_eoi_exitmap_pending = true;
11000 if (!kvm_apic_hw_enabled(vcpu->arch.apic))
11008 vcpu->arch.ioapic_handled_vectors,
11009 to_hv_synic(vcpu)->vec_bitmap, 256);
11015 vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
11032 * Called within kvm->srcu read side.
11050 r = -EIO;
11060 if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
11070 kvm_update_masterclock(vcpu->kvm);
11094 * Fall back to a "full" guest flush if Hyper-V's precise
11095 * flushing fails. Note, Hyper-V's flushing is per-vCPU, but
11106 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
11112 kvm_x86_ops.nested_ops->triple_fault(vcpu);
11115 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
11116 vcpu->mmio_needed = 0;
11123 vcpu->arch.apf.halted = true;
11140 BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
11141 if (test_bit(vcpu->arch.pending_ioapic_eoi,
11142 vcpu->arch.ioapic_handled_vectors)) {
11143 vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
11144 vcpu->run->eoi.vector =
11145 vcpu->arch.pending_ioapic_eoi;
11158 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
11159 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
11160 vcpu->run->system_event.ndata = 0;
11165 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
11166 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
11167 vcpu->run->system_event.ndata = 0;
11174 vcpu->run->exit_reason = KVM_EXIT_HYPERV;
11175 vcpu->run->hyperv = hv_vcpu->exit;
11182 * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
11183 * depend on the guest clock being up-to-date
11201 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) {
11210 ++vcpu->stat.req_event;
11216 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
11251 /* Store vcpu->apicv_active before vcpu->mode. */
11252 smp_store_release(&vcpu->mode, IN_GUEST_MODE);
11257 * 1) We should set ->mode before checking ->requests. Please see
11260 * 2) For APICv, we should set ->mode before checking PID.ON. This
11281 vcpu->mode = OUTSIDE_GUEST_MODE;
11300 if (vcpu->arch.guest_fpu.xfd_err)
11301 wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
11305 if (unlikely(vcpu->arch.switch_db_regs &&
11306 !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) {
11308 set_debugreg(vcpu->arch.eff_db[0], 0);
11309 set_debugreg(vcpu->arch.eff_db[1], 1);
11310 set_debugreg(vcpu->arch.eff_db[2], 2);
11311 set_debugreg(vcpu->arch.eff_db[3], 3);
11313 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
11322 * vendor code if any host-owned bits were changed, e.g. so that the
11326 if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL &&
11327 !vcpu->arch.guest_state_protected)
11329 vcpu->arch.host_debugctl = debug_ctl;
11335 * of flows where non-KVM code can run with guest state loaded.
11343 * per-VM state, and responding vCPUs must wait for the update
11363 /* Note, VM-Exits that go down the "slow" path are accounted below. */
11364 ++vcpu->stat.exits;
11375 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
11376 WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
11377 WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH);
11393 vcpu->arch.last_vmentry_cpu = vcpu->cpu;
11394 vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
11396 vcpu->mode = OUTSIDE_GUEST_MODE;
11403 * rely on the fact that guest_fpu::xfd is up-to-date (e.g.
11406 if (vcpu->arch.xfd_no_write_intercept)
11411 if (vcpu->arch.guest_fpu.xfd_err)
11425 * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
11432 ++vcpu->stat.exits;
11441 * acceptable for all known use cases.
11460 !vcpu->arch.guest_state_protected)) {
11465 if (unlikely(vcpu->arch.tsc_always_catchup))
11468 if (vcpu->arch.apic_attention)
11481 if (unlikely(vcpu->arch.apic_attention))
11489 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
11490 !vcpu->arch.apf.halted);
11495 if (!list_empty_careful(&vcpu->async_pf.done))
11506 (vcpu->arch.nmi_pending &&
11512 (vcpu->arch.smi_pending &&
11530 kvm_x86_ops.nested_ops->has_events &&
11531 kvm_x86_ops.nested_ops->has_events(vcpu, false))
11543 return kvm_vcpu_running(vcpu) || vcpu->arch.pv.pv_unhalted ||
11547 /* Called within kvm->srcu read side. */
11554 * Switch to the software timer before halt-polling/blocking as
11557 * Switch before halt-polling so that KVM recognizes an expired
11565 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
11586 * state field (AMD does not have a similar field and a VM-Exit always
11592 WARN_ON_ONCE(r == -EBUSY);
11599 switch (vcpu->arch.mp_state) {
11605 vcpu->arch.apf.halted = false;
11616 /* Called within kvm->srcu read side. */
11621 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
11627 * use a stale page translation. Assume that any code after
11630 vcpu->arch.at_instruction_boundary = false;
11650 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
11651 ++vcpu->stat.request_irq_exits;
11671 * local APIC is in-kernel, the run loop will detect the non-runnable
11676 ++vcpu->stat.halt_exits;
11678 if (kvm_vcpu_has_events(vcpu) || vcpu->arch.pv.pv_unhalted)
11683 vcpu->run->exit_reason = reason;
11698 * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
11734 return vcpu->arch.preempted_in_kernel;
11739 if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
11759 BUG_ON(!vcpu->arch.pio.count);
11784 struct kvm_run *run = vcpu->run;
11788 BUG_ON(!vcpu->mmio_needed);
11791 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
11792 len = min(8u, frag->len);
11793 if (!vcpu->mmio_is_write)
11794 memcpy(frag->data, run->mmio.data, len);
11796 if (frag->len <= 8) {
11799 vcpu->mmio_cur_fragment++;
11802 frag->data += len;
11803 frag->gpa += len;
11804 frag->len -= len;
11807 if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
11808 vcpu->mmio_needed = 0;
11810 /* FIXME: return into emulator if single-stepping. */
11811 if (vcpu->mmio_is_write)
11813 vcpu->mmio_read_completed = 1;
11817 run->exit_reason = KVM_EXIT_MMIO;
11818 run->mmio.phys_addr = frag->gpa;
11819 if (vcpu->mmio_is_write)
11820 memcpy(run->mmio.data, frag->data, min(8u, frag->len));
11821 run->mmio.len = min(8u, frag->len);
11822 run->mmio.is_write = vcpu->mmio_is_write;
11823 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
11830 if (KVM_BUG_ON(vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
11833 /* Exclude PKRU, it's restored separately immediately after VM-Exit. */
11834 fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
11841 if (KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
11844 fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
11845 ++vcpu->stat.fpu_reload;
11852 * SIPI_RECEIVED is obsolete; KVM leaves the vCPU in Wait-For-SIPI and
11857 if (WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED))
11858 return -EINVAL;
11864 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED &&
11866 return -EINVAL;
11873 struct kvm_queued_exception *ex = &vcpu->arch.exception;
11874 struct kvm_run *kvm_run = vcpu->run;
11878 r = kvm_mmu_post_init_vm(vcpu->kvm);
11884 kvm_run->flags = 0;
11888 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
11889 if (!vcpu->wants_to_run) {
11890 r = -EINTR;
11912 r = -EAGAIN;
11914 r = -EINTR;
11915 kvm_run->exit_reason = KVM_EXIT_INTR;
11916 ++vcpu->stat.signal_exits;
11921 sync_valid_fields = kvm_sync_valid_fields(vcpu->kvm);
11922 if ((kvm_run->kvm_valid_regs & ~sync_valid_fields) ||
11923 (kvm_run->kvm_dirty_regs & ~sync_valid_fields)) {
11924 r = -EINVAL;
11928 if (kvm_run->kvm_dirty_regs) {
11934 /* re-sync apic's tpr */
11936 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
11937 r = -EINVAL;
11944 * a pending VM-Exit if L1 wants to intercept the exception.
11946 if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) &&
11947 kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector,
11948 ex->error_code)) {
11949 kvm_queue_exception_vmexit(vcpu, ex->vector,
11950 ex->has_error_code, ex->error_code,
11951 ex->has_payload, ex->payload);
11952 ex->injected = false;
11953 ex->pending = false;
11955 vcpu->arch.exception_from_userspace = false;
11957 if (unlikely(vcpu->arch.complete_userspace_io)) {
11958 int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
11959 vcpu->arch.complete_userspace_io = NULL;
11964 WARN_ON_ONCE(vcpu->arch.pio.count);
11965 WARN_ON_ONCE(vcpu->mmio_needed);
11968 if (!vcpu->wants_to_run) {
11969 r = -EINTR;
11981 if (kvm_run->kvm_valid_regs && likely(!vcpu->arch.guest_state_protected))
11993 if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
12001 emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
12002 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
12004 regs->rax = kvm_rax_read(vcpu);
12005 regs->rbx = kvm_rbx_read(vcpu);
12006 regs->rcx = kvm_rcx_read(vcpu);
12007 regs->rdx = kvm_rdx_read(vcpu);
12008 regs->rsi = kvm_rsi_read(vcpu);
12009 regs->rdi = kvm_rdi_read(vcpu);
12010 regs->rsp = kvm_rsp_read(vcpu);
12011 regs->rbp = kvm_rbp_read(vcpu);
12013 regs->r8 = kvm_r8_read(vcpu);
12014 regs->r9 = kvm_r9_read(vcpu);
12015 regs->r10 = kvm_r10_read(vcpu);
12016 regs->r11 = kvm_r11_read(vcpu);
12017 regs->r12 = kvm_r12_read(vcpu);
12018 regs->r13 = kvm_r13_read(vcpu);
12019 regs->r14 = kvm_r14_read(vcpu);
12020 regs->r15 = kvm_r15_read(vcpu);
12023 regs->rip = kvm_rip_read(vcpu);
12024 regs->rflags = kvm_get_rflags(vcpu);
12029 if (vcpu->kvm->arch.has_protected_state &&
12030 vcpu->arch.guest_state_protected)
12031 return -EINVAL;
12041 vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
12042 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
12044 kvm_rax_write(vcpu, regs->rax);
12045 kvm_rbx_write(vcpu, regs->rbx);
12046 kvm_rcx_write(vcpu, regs->rcx);
12047 kvm_rdx_write(vcpu, regs->rdx);
12048 kvm_rsi_write(vcpu, regs->rsi);
12049 kvm_rdi_write(vcpu, regs->rdi);
12050 kvm_rsp_write(vcpu, regs->rsp);
12051 kvm_rbp_write(vcpu, regs->rbp);
12053 kvm_r8_write(vcpu, regs->r8);
12054 kvm_r9_write(vcpu, regs->r9);
12055 kvm_r10_write(vcpu, regs->r10);
12056 kvm_r11_write(vcpu, regs->r11);
12057 kvm_r12_write(vcpu, regs->r12);
12058 kvm_r13_write(vcpu, regs->r13);
12059 kvm_r14_write(vcpu, regs->r14);
12060 kvm_r15_write(vcpu, regs->r15);
12063 kvm_rip_write(vcpu, regs->rip);
12064 kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
12066 vcpu->arch.exception.pending = false;
12067 vcpu->arch.exception_vmexit.pending = false;
12074 if (vcpu->kvm->arch.has_protected_state &&
12075 vcpu->arch.guest_state_protected)
12076 return -EINVAL;
12088 if (vcpu->arch.guest_state_protected)
12091 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
12092 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
12093 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
12094 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
12095 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
12096 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
12098 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
12099 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
12102 sregs->idt.limit = dt.size;
12103 sregs->idt.base = dt.address;
12105 sregs->gdt.limit = dt.size;
12106 sregs->gdt.base = dt.address;
12108 sregs->cr2 = vcpu->arch.cr2;
12109 sregs->cr3 = kvm_read_cr3(vcpu);
12112 sregs->cr0 = kvm_read_cr0(vcpu);
12113 sregs->cr4 = kvm_read_cr4(vcpu);
12114 sregs->cr8 = kvm_get_cr8(vcpu);
12115 sregs->efer = vcpu->arch.efer;
12116 sregs->apic_base = vcpu->arch.apic_base;
12123 if (vcpu->arch.guest_state_protected)
12126 if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
12127 set_bit(vcpu->arch.interrupt.nr,
12128 (unsigned long *)sregs->interrupt_bitmap);
12137 if (vcpu->arch.guest_state_protected)
12142 sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
12143 sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
12150 if (vcpu->kvm->arch.has_protected_state &&
12151 vcpu->arch.guest_state_protected)
12152 return -EINVAL;
12173 if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED ||
12174 vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) &&
12175 vcpu->arch.pv.pv_unhalted)
12176 mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
12178 mp_state->mp_state = vcpu->arch.mp_state;
12189 int ret = -EINVAL;
12193 switch (mp_state->mp_state) {
12212 * leaves the vCPU in INIT_RECEIVED (Wait-For-SIPI) and pends the SIPI.
12215 if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
12216 mp_state->mp_state = KVM_MP_STATE_INIT_RECEIVED;
12217 set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
12220 kvm_set_mp_state(vcpu, mp_state->mp_state);
12232 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
12239 * Check both User and Supervisor on task switches as inter-
12263 if (ret || vcpu->mmio_needed)
12266 kvm_rip_write(vcpu, ctxt->eip);
12267 kvm_set_rflags(vcpu, ctxt->eflags);
12271 vcpu->mmio_needed = false;
12272 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
12273 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
12274 vcpu->run->internal.ndata = 0;
12281 if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
12284 * 64-bit mode (though maybe in a 32-bit code segment).
12287 if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
12289 if (!kvm_vcpu_is_legal_cr3(vcpu, sregs->cr3))
12293 * Not in 64-bit mode: EFER.LMA is clear and the code
12294 * segment cannot be 64-bit.
12296 if (sregs->efer & EFER_LMA || sregs->cs.l)
12300 return kvm_is_valid_cr4(vcpu, sregs->cr4) &&
12301 kvm_is_valid_cr0(vcpu, sregs->cr0);
12311 return -EINVAL;
12313 if (kvm_apic_set_base(vcpu, sregs->apic_base, true))
12314 return -EINVAL;
12316 if (vcpu->arch.guest_state_protected)
12319 dt.size = sregs->idt.limit;
12320 dt.address = sregs->idt.base;
12322 dt.size = sregs->gdt.limit;
12323 dt.address = sregs->gdt.base;
12326 vcpu->arch.cr2 = sregs->cr2;
12327 *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
12328 vcpu->arch.cr3 = sregs->cr3;
12330 kvm_x86_call(post_set_cr3)(vcpu, sregs->cr3);
12332 kvm_set_cr8(vcpu, sregs->cr8);
12334 *mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
12335 kvm_x86_call(set_efer)(vcpu, sregs->efer);
12337 *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
12338 kvm_x86_call(set_cr0)(vcpu, sregs->cr0);
12340 *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
12341 kvm_x86_call(set_cr4)(vcpu, sregs->cr4);
12344 idx = srcu_read_lock(&vcpu->kvm->srcu);
12349 srcu_read_unlock(&vcpu->kvm->srcu, idx);
12352 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
12353 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
12354 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
12355 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
12356 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
12357 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
12359 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
12360 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
12366 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
12389 (const unsigned long *)sregs->interrupt_bitmap, max_bits);
12402 bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
12403 bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) &&
12404 !(sregs2->efer & EFER_LMA);
12407 if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
12408 return -EINVAL;
12410 if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected))
12411 return -EINVAL;
12420 kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
12424 vcpu->arch.pdptrs_from_userspace = true;
12438 if (vcpu->kvm->arch.has_protected_state &&
12439 vcpu->arch.guest_state_protected)
12440 return -EINVAL;
12457 down_write(&kvm->arch.apicv_update_lock);
12460 if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) {
12466 up_write(&kvm->arch.apicv_update_lock);
12475 if (vcpu->arch.guest_state_protected)
12476 return -EINVAL;
12480 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
12481 r = -EBUSY;
12484 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
12496 vcpu->guest_debug = dbg->control;
12497 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
12498 vcpu->guest_debug = 0;
12500 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
12502 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
12503 vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
12506 vcpu->arch.eff_db[i] = vcpu->arch.db[i];
12510 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
12511 vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu);
12521 kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm);
12536 unsigned long vaddr = tr->linear_address;
12542 idx = srcu_read_lock(&vcpu->kvm->srcu);
12544 srcu_read_unlock(&vcpu->kvm->srcu, idx);
12545 tr->physical_address = gpa;
12546 tr->valid = gpa != INVALID_GPA;
12547 tr->writeable = 1;
12548 tr->usermode = 0;
12558 if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
12559 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
12563 fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
12564 memcpy(fpu->fpr, fxsave->st_space, 128);
12565 fpu->fcw = fxsave->cwd;
12566 fpu->fsw = fxsave->swd;
12567 fpu->ftwx = fxsave->twd;
12568 fpu->last_opcode = fxsave->fop;
12569 fpu->last_ip = fxsave->rip;
12570 fpu->last_dp = fxsave->rdp;
12571 memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
12581 if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
12582 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
12586 fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
12588 memcpy(fxsave->st_space, fpu->fpr, 128);
12589 fxsave->cwd = fpu->fcw;
12590 fxsave->swd = fpu->fsw;
12591 fxsave->twd = fpu->ftwx;
12592 fxsave->fop = fpu->last_opcode;
12593 fxsave->rip = fpu->last_ip;
12594 fxsave->rdp = fpu->last_dp;
12595 memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
12605 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
12606 __get_regs(vcpu, &vcpu->run->s.regs.regs);
12608 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
12609 __get_sregs(vcpu, &vcpu->run->s.regs.sregs);
12611 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
12613 vcpu, &vcpu->run->s.regs.events);
12618 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
12619 __set_regs(vcpu, &vcpu->run->s.regs.regs);
12620 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
12623 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
12624 struct kvm_sregs sregs = vcpu->run->s.regs.sregs;
12627 return -EINVAL;
12629 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
12632 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
12633 struct kvm_vcpu_events events = vcpu->run->s.regs.events;
12636 return -EINVAL;
12638 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
12646 if (kvm_check_tsc_unstable() && kvm->created_vcpus)
12650 if (!kvm->arch.max_vcpu_ids)
12651 kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS;
12653 if (id >= kvm->arch.max_vcpu_ids)
12654 return -EINVAL;
12664 vcpu->arch.last_vmentry_cpu = -1;
12665 vcpu->arch.regs_avail = ~0;
12666 vcpu->arch.regs_dirty = ~0;
12668 kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
12670 if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
12683 r = -ENOMEM;
12688 vcpu->arch.pio_data = page_address(page);
12690 vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
12692 vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
12694 if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks)
12696 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
12698 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
12705 if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
12712 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) {
12713 vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
12714 vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
12715 vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap;
12719 vcpu->arch.pending_external_vector = -1;
12720 vcpu->arch.preempted_in_kernel = false;
12723 vcpu->arch.hv_root_tdp = INVALID_PAGE;
12733 kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
12740 fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
12742 kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
12744 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
12746 kfree(vcpu->arch.mce_banks);
12747 kfree(vcpu->arch.mci_ctl2_banks);
12748 free_page((unsigned long)vcpu->arch.pio_data);
12758 if (mutex_lock_killable(&vcpu->mutex))
12765 vcpu->arch.msr_kvm_poll_control = 1;
12767 mutex_unlock(&vcpu->mutex);
12784 kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
12785 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
12786 fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
12791 kfree(vcpu->arch.mce_banks);
12792 kfree(vcpu->arch.mci_ctl2_banks);
12794 idx = srcu_read_lock(&vcpu->kvm->srcu);
12796 srcu_read_unlock(&vcpu->kvm->srcu, idx);
12797 free_page((unsigned long)vcpu->arch.pio_data);
12798 kvfree(vcpu->arch.cpuid_entries);
12803 struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
12834 * doesn't match the loaded/in-use state of the FPU, as KVM_RUN is the
12838 WARN_ON_ONCE(vcpu->wants_to_run != fpstate->in_use);
12839 fpu_in_use = fpstate->in_use;
12855 * Several of the "set" flows, e.g. ->set_cr0(), read other registers
12858 * into hardware, to be zeroed at vCPU creation. Use CRs as a sentinel
12865 * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's
12876 vcpu->arch.hflags = 0;
12878 vcpu->arch.smi_pending = 0;
12879 vcpu->arch.smi_count = 0;
12880 atomic_set(&vcpu->arch.nmi_queued, 0);
12881 vcpu->arch.nmi_pending = 0;
12882 vcpu->arch.nmi_injected = false;
12886 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
12888 vcpu->arch.dr6 = DR6_ACTIVE_LOW;
12889 vcpu->arch.dr7 = DR7_FIXED_1;
12892 vcpu->arch.cr2 = 0;
12895 vcpu->arch.apf.msr_en_val = 0;
12896 vcpu->arch.apf.msr_int_val = 0;
12897 vcpu->arch.st.msr_val = 0;
12903 vcpu->arch.apf.halted = false;
12908 vcpu->arch.smbase = 0x30000;
12910 vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
12912 vcpu->arch.msr_misc_features_enables = 0;
12913 vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
12921 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
12932 kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
12939 vcpu->arch.cr3 = 0;
12961 * which PCIDs have to be flushed. However, CR0.WP and the paging-related
13031 if (!stable && vcpu->cpu == smp_processor_id())
13033 if (stable && vcpu->arch.last_host_tsc > local_tsc) {
13035 if (vcpu->arch.last_host_tsc > max_tsc)
13036 max_tsc = vcpu->arch.last_host_tsc;
13066 * N.B. - this code below runs only on platforms with reliable TSC,
13080 u64 delta_cyc = max_tsc - local_tsc;
13082 kvm->arch.backwards_tsc_observed = true;
13084 vcpu->arch.tsc_offset_adjustment += delta_cyc;
13085 vcpu->arch.last_host_tsc = local_tsc;
13095 kvm->arch.last_tsc_nsec = 0;
13096 kvm->arch.last_tsc_write = 0;
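
/*
 * Illustrative sketch, not part of x86.c: the compensation applied above
 * when the host TSC appears to go backwards across suspend/resume.  The
 * guest observes (host TSC + offset), so raising every vCPU's offset by the
 * largest gap seen keeps all guest TSC values monotonic.  The structure and
 * names are invented for the example.
 */
#include <stdint.h>

struct sketch_vcpu {
	uint64_t last_host_tsc;	/* host TSC recorded when the vCPU last ran */
	int64_t tsc_offset;	/* what the guest adds to the host TSC */
};

static void compensate_backwards_tsc(struct sketch_vcpu *vcpus, int n,
				     uint64_t host_tsc_now)
{
	uint64_t max_seen = 0;

	for (int i = 0; i < n; i++)
		if (vcpus[i].last_host_tsc > max_seen)
			max_seen = vcpus[i].last_host_tsc;

	if (max_seen <= host_tsc_now)
		return;		/* nothing went backwards */

	for (int i = 0; i < n; i++) {
		vcpus[i].tsc_offset += (int64_t)(max_seen - host_tsc_now);
		vcpus[i].last_host_tsc = host_tsc_now;
	}
}
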
13108 * Leave the user-return notifiers as-is when disabling virtualization
13110 * pin kvm.ko (if it's a module) to defend against use-after-free (in
13113 * could be actively modifying user-return MSR state when the IPI to
13125 return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
13131 return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
13137 kfree(kvm->arch.hv_pa_pg);
13149 return -EINVAL;
13151 kvm->arch.vm_type = type;
13152 kvm->arch.has_private_mem =
13155 kvm->arch.pre_fault_allowed =
13157 kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks & kvm_caps.supported_quirks;
13171 atomic_set(&kvm->arch.noncoherent_dma_count, 0);
13173 raw_spin_lock_init(&kvm->arch.tsc_write_lock);
13174 mutex_init(&kvm->arch.apic_map_lock);
13175 seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
13176 kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
13178 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
13180 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
13182 kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
13183 kvm->arch.apic_bus_cycle_ns = APIC_BUS_CYCLE_NS_DEFAULT;
13184 kvm->arch.guest_can_read_msr_platform_info = true;
13185 kvm->arch.enable_pmu = enable_pmu;
13188 spin_lock_init(&kvm->arch.hv_root_tdp_lock);
13189 kvm->arch.hv_root_tdp = INVALID_PAGE;
13203 once_init(&kvm->arch.nx_once);
13227 * -errno: on error
13229 * The caller should always use IS_ERR() to check the return value
13230 * before use. Note, the KVM internal memory slots are guaranteed to
13232 * GPA->HVA translation will not change. However, the HVA is a user
13244 lockdep_assert_held(&kvm->slots_lock);
13247 return ERR_PTR_USR(-EINVAL);
13251 if (slot && slot->npages)
13252 return ERR_PTR_USR(-EEXIST);
13263 if (!slot || !slot->npages)
13266 old_npages = slot->npages;
13267 hva = slot->userspace_addr;
13295 * is unsafe, i.e. will lead to use-after-free. The PIT also needs to
13308 if (current->mm == kvm->mm) {
13314 mutex_lock(&kvm->slots_lock);
13320 mutex_unlock(&kvm->slots_lock);
13323 kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
13328 kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
13329 kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
13342 vfree(slot->arch.rmap[i]);
13343 slot->arch.rmap[i] = NULL;
13354 vfree(slot->arch.lpage_info[i - 1]);
13355 slot->arch.lpage_info[i - 1] = NULL;
13363 const int sz = sizeof(*slot->arch.rmap[0]);
13370 if (slot->arch.rmap[i])
13373 slot->arch.rmap[i] = __vcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
13374 if (!slot->arch.rmap[i]) {
13376 return -ENOMEM;
13386 unsigned long npages = slot->npages;
13394 memset(&slot->arch, 0, sizeof(slot->arch));
13414 slot->arch.lpage_info[i - 1] = linfo;
13416 if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
13418 if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
13419 linfo[lpages - 1].disallow_lpage = 1;
13420 ugfn = slot->userspace_addr >> PAGE_SHIFT;
13425 if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
13446 vfree(slot->arch.lpage_info[i - 1]);
13447 slot->arch.lpage_info[i - 1] = NULL;
13449 return -ENOMEM;
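
/*
 * Illustrative sketch, not part of x86.c: the alignment rules behind the
 * disallow_lpage marking above, on plain integers.  A slot whose start or
 * end is not aligned to the huge-page size cannot map its first or last
 * huge page as huge, and a mismatch between guest-physical and host-virtual
 * alignment rules huge pages out for the whole slot.  All names here are
 * invented for the example.
 */
#include <stdbool.h>
#include <stdint.h>

/* pages_per_hpage is e.g. 512 for 2M huge pages over 4K base pages. */
static void mark_disallowed_lpages(uint64_t base_gfn, uint64_t npages,
				   uint64_t ugfn, uint64_t pages_per_hpage,
				   bool *head_disallowed, bool *tail_disallowed,
				   bool *all_disallowed)
{
	uint64_t mask = pages_per_hpage - 1;

	*head_disallowed = (base_gfn & mask) != 0;
	*tail_disallowed = ((base_gfn + npages) & mask) != 0;
	/* Guest-physical and host-virtual offsets within a huge page must agree. */
	*all_disallowed = ((base_gfn ^ ugfn) & mask) != 0;
}
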
13458 * memslots->generation has been incremented.
13463 /* Force re-initialization of steal_time cache */
13475 * trackers attached to the VM, i.e. if KVMGT is in use.
13478 return -EINVAL;
13481 if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
13482 return -EINVAL;
13484 if (kvm_is_gfn_alias(kvm, new->base_gfn + new->npages - 1))
13485 return -EINVAL;
13491 memcpy(&new->arch, &old->arch, sizeof(old->arch));
13493 return -EIO;
13503 if (!kvm->arch.cpu_dirty_log_size)
13506 nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging);
13516 u32 old_flags = old ? old->flags : 0;
13517 u32 new_flags = new ? new->flags : 0;
13537 * CREATE: No shadow pages exist, thus nothing to write-protect
13546 * READONLY and non-flags changes were filtered out above, and the only
13566 * Initially-all-set does not require write protecting any page,
13575 if (kvm->arch.cpu_dirty_log_size) {
13591 * write-protected before returning to userspace, i.e. before
13598 * Specifically, KVM also write-protects guest page tables to
13607 * To handle these scenarios, KVM uses a separate software-only
13608 * bit (MMU-writable) to track if a SPTE is !writable due to
13609 * a guest page table being write-protected (KVM clears the
13610 * MMU-writable flag when write-protecting for shadow paging).
13612 * The use of MMU-writable is also the primary motivation for
13615 * !MMU-writable SPTE, KVM must flush if it encounters any
13616 * MMU-writable SPTE regardless of whether the actual hardware
13619 * write access" helpers to ignore MMU-writable entirely.
13622 * access-tracked SPTEs is particularly relevant).
13636 if (!kvm->arch.n_requested_mmu_pages &&
13640 nr_mmu_pages = kvm->nr_memslot_pages / KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO;
13656 if (vcpu->arch.guest_state_protected)
13666 if (vcpu->arch.guest_state_protected)
13685 if (vcpu->arch.guest_state_protected)
13706 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
13714 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
13715 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
13736 return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
13743 while (vcpu->arch.apf.gfns[key] != ~0)
13746 vcpu->arch.apf.gfns[key] = gfn;
13755 (vcpu->arch.apf.gfns[key] != gfn &&
13756 vcpu->arch.apf.gfns[key] != ~0); i++)
13764 return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
13773 if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
13777 vcpu->arch.apf.gfns[i] = ~0;
13780 if (vcpu->arch.apf.gfns[j] == ~0)
13782 k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
13789 vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
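
/*
 * Illustrative sketch, not part of x86.c: the open-addressing table behind
 * the async-#PF gfn tracking above, shrunk to a standalone example.  ~0
 * marks an empty slot, collisions probe linearly, and deletion pulls later
 * chain members back so lookups never cross a hole.  The table size and the
 * hash are invented for the example; the sketch assumes the table is sized
 * so it never fills.
 */
#include <stdbool.h>
#include <stdint.h>

#define TABLE_SIZE 64u			/* must be a power of two */

static uint64_t table[TABLE_SIZE];

static void table_init(void)
{
	for (unsigned int i = 0; i < TABLE_SIZE; i++)
		table[i] = ~0ull;
}

static unsigned int hash_gfn(uint64_t gfn)
{
	/* Stand-in for hash_32(); any reasonable mix works here. */
	return (unsigned int)((gfn * 0x9E3779B97F4A7C15ull) >> 58);
}

static unsigned int next_probe(unsigned int key)
{
	return (key + 1) & (TABLE_SIZE - 1);
}

static void table_add(uint64_t gfn)
{
	unsigned int key = hash_gfn(gfn);

	while (table[key] != ~0ull)
		key = next_probe(key);
	table[key] = gfn;
}

static unsigned int table_slot(uint64_t gfn)
{
	unsigned int key = hash_gfn(gfn);

	while (table[key] != gfn && table[key] != ~0ull)
		key = next_probe(key);
	return key;
}

static bool table_contains(uint64_t gfn)
{
	return table[table_slot(gfn)] == gfn;
}

static void table_del(uint64_t gfn)
{
	unsigned int i, j, k;

	i = j = table_slot(gfn);
	if (table[i] != gfn)
		return;

	for (;;) {
		table[i] = ~0ull;
		do {
			j = next_probe(j);
			if (table[j] == ~0ull)
				return;
			k = hash_gfn(table[j]);
			/* keep walking while k lies cyclically in (i, j] */
		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
		/* table[j] could live at or before i, so it fills the hole */
		table[i] = table[j];
		i = j;
	}
}
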
13798 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
13806 return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
13815 if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
13828 if (!vcpu->arch.apf.send_always &&
13829 (vcpu->arch.guest_state_protected || !kvm_x86_call(get_cpl)(vcpu)))
13837 return vcpu->arch.apf.delivery_as_pf_vmexit;
13855 if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
13859 * If interrupts are off we cannot even use an artificial
13870 trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
13871 kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
13879 fault.address = work->arch.token;
13902 .vector = vcpu->arch.apf.vec
13905 if (work->wakeup_all)
13906 work->arch.token = ~0; /* broadcast wakeup */
13908 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
13909 trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
13911 if ((work->wakeup_all || work->notpresent_injected) &&
13913 !apf_put_user_ready(vcpu, work->arch.token)) {
13914 WRITE_ONCE(vcpu->arch.apf.pageready_pending, true);
13918 vcpu->arch.apf.halted = false;
13929 if (!READ_ONCE(vcpu->arch.apf.pageready_pending))
13944 * Non-coherent DMA assignment and de-assignment may affect whether or
13947 * (or last) non-coherent device is (un)registered to so that new SPTEs
13958 if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1)
13964 if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count))
13970 return atomic_read(&kvm->arch.noncoherent_dma_count);
13976 return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
14033 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
14039 mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) {
14041 * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
14052 vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
14065 if (KVM_BUG_ON(!e, vcpu->kvm))
14066 return -EIO;
14076 * doesn't seem to be a real use-case behind such requests, just return
14132 * page tables, so a non-global flush just degenerates to a
14151 struct kvm_run *run = vcpu->run;
14155 BUG_ON(!vcpu->mmio_needed);
14158 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
14159 len = min(8u, frag->len);
14160 if (!vcpu->mmio_is_write)
14161 memcpy(frag->data, run->mmio.data, len);
14163 if (frag->len <= 8) {
14166 vcpu->mmio_cur_fragment++;
14169 frag->data += len;
14170 frag->gpa += len;
14171 frag->len -= len;
14174 if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
14175 vcpu->mmio_needed = 0;
14183 run->mmio.phys_addr = frag->gpa;
14184 run->mmio.len = min(8u, frag->len);
14185 run->mmio.is_write = vcpu->mmio_is_write;
14186 if (run->mmio.is_write)
14187 memcpy(run->mmio.data, frag->data, min(8u, frag->len));
14188 run->exit_reason = KVM_EXIT_MMIO;
14190 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
14202 return -EINVAL;
14208 bytes -= handled;
14213 frag = vcpu->mmio_fragments;
14214 vcpu->mmio_nr_fragments = 1;
14215 frag->len = bytes;
14216 frag->gpa = gpa;
14217 frag->data = data;
14219 vcpu->mmio_needed = 1;
14220 vcpu->mmio_cur_fragment = 0;
14222 vcpu->run->mmio.phys_addr = gpa;
14223 vcpu->run->mmio.len = min(8u, frag->len);
14224 vcpu->run->mmio.is_write = 1;
14225 memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
14226 vcpu->run->exit_reason = KVM_EXIT_MMIO;
14228 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
14241 return -EINVAL;
14247 bytes -= handled;
14252 frag = vcpu->mmio_fragments;
14253 vcpu->mmio_nr_fragments = 1;
14254 frag->len = bytes;
14255 frag->gpa = gpa;
14256 frag->data = data;
14258 vcpu->mmio_needed = 1;
14259 vcpu->mmio_cur_fragment = 0;
14261 vcpu->run->mmio.phys_addr = gpa;
14262 vcpu->run->mmio.len = min(8u, frag->len);
14263 vcpu->run->mmio.is_write = 0;
14264 vcpu->run->exit_reason = KVM_EXIT_MMIO;
14266 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
14274 vcpu->arch.sev_pio_count -= count;
14275 vcpu->arch.sev_pio_data += count * size;
14283 int size = vcpu->arch.pio.size;
14284 int port = vcpu->arch.pio.port;
14286 vcpu->arch.pio.count = 0;
14287 if (vcpu->arch.sev_pio_count)
14297 min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
14298 int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count);
14306 if (!vcpu->arch.sev_pio_count)
14310 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs;
14319 unsigned count = vcpu->arch.pio.count;
14320 int size = vcpu->arch.pio.size;
14321 int port = vcpu->arch.pio.port;
14323 complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
14325 if (vcpu->arch.sev_pio_count)
14335 min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
14336 if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count))
14341 if (!vcpu->arch.sev_pio_count)
14345 vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
14353 vcpu->arch.sev_pio_data = data;
14354 vcpu->arch.sev_pio_count = count;