xref: /linux/arch/x86/kvm/svm/avic.c (revision 256e3417065b2721f77bcd37331796b59483ef3b)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * AMD SVM support
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9  *
10  * Authors:
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *   Avi Kivity   <avi@qumranet.com>
13  */
14 
15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16 
17 #include <linux/kvm_types.h>
18 #include <linux/hashtable.h>
19 #include <linux/amd-iommu.h>
20 #include <linux/kvm_host.h>
21 #include <linux/kvm_irqfd.h>
22 
23 #include <asm/irq_remapping.h>
24 #include <asm/msr.h>
25 
26 #include "trace.h"
27 #include "lapic.h"
28 #include "x86.h"
29 #include "irq.h"
30 #include "svm.h"
31 
32 /*
33  * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that
34  * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't
35  * be delivered, e.g. because the vCPU isn't running.  Use the vCPU's index
36  * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast
37  * lookup on the index, where as vCPUs whose index doesn't match their ID need
38  * to walk the entire xarray of vCPUs in the worst case scenario.
39  *
40  * For the vCPU index, use however many bits are currently allowed for the max
41  * guest physical APIC ID (limited by the size of the physical ID table), and
42  * use whatever bits remain to assign arbitrary AVIC IDs to VMs.  Note, the
43  * size of the GATag is defined by hardware (32 bits), but is an opaque value
44  * as far as hardware is concerned.
45  */
46 #define AVIC_VCPU_IDX_MASK		AVIC_PHYSICAL_MAX_INDEX_MASK
47 
48 #define AVIC_VM_ID_SHIFT		HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
49 #define AVIC_VM_ID_MASK			(GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)
50 
51 #define AVIC_GATAG_TO_VMID(x)		((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
52 #define AVIC_GATAG_TO_VCPUIDX(x)	(x & AVIC_VCPU_IDX_MASK)
53 
54 #define __AVIC_GATAG(vm_id, vcpu_idx)	((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
55 					 ((vcpu_idx) & AVIC_VCPU_IDX_MASK))
56 #define AVIC_GATAG(vm_id, vcpu_idx)					\
57 ({									\
58 	u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx);			\
59 									\
60 	WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx));	\
61 	WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id));		\
62 	ga_tag;								\
63 })
64 
65 static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u);
66 
67 #define AVIC_AUTO_MODE -1
68 
avic_param_set(const char * val,const struct kernel_param * kp)69 static int avic_param_set(const char *val, const struct kernel_param *kp)
70 {
71 	if (val && sysfs_streq(val, "auto")) {
72 		*(int *)kp->arg = AVIC_AUTO_MODE;
73 		return 0;
74 	}
75 
76 	return param_set_bint(val, kp);
77 }
78 
79 static const struct kernel_param_ops avic_ops = {
80 	.flags = KERNEL_PARAM_OPS_FL_NOARG,
81 	.set = avic_param_set,
82 	.get = param_get_bool,
83 };
84 
85 /*
86  * Enable / disable AVIC.  In "auto" mode (default behavior), AVIC is enabled
87  * for Zen4+ CPUs with x2AVIC (and all other criteria for enablement are met).
88  */
89 static int avic = AVIC_AUTO_MODE;
90 module_param_cb(avic, &avic_ops, &avic, 0444);
91 __MODULE_PARM_TYPE(avic, "bool");
92 
93 module_param(enable_ipiv, bool, 0444);
94 
95 static bool force_avic;
96 module_param_unsafe(force_avic, bool, 0444);
97 
98 /* Note:
99  * This hash table is used to map VM_ID to a struct kvm_svm,
100  * when handling AMD IOMMU GALOG notification to schedule in
101  * a particular vCPU.
102  */
103 #define SVM_VM_DATA_HASH_BITS	8
104 static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
105 static u32 next_vm_id = 0;
106 static bool next_vm_id_wrapped = 0;
107 static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
108 static bool x2avic_enabled;
109 
110 
avic_set_x2apic_msr_interception(struct vcpu_svm * svm,bool intercept)111 static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm,
112 					     bool intercept)
113 {
114 	static const u32 x2avic_passthrough_msrs[] = {
115 		X2APIC_MSR(APIC_ID),
116 		X2APIC_MSR(APIC_LVR),
117 		X2APIC_MSR(APIC_TASKPRI),
118 		X2APIC_MSR(APIC_ARBPRI),
119 		X2APIC_MSR(APIC_PROCPRI),
120 		X2APIC_MSR(APIC_EOI),
121 		X2APIC_MSR(APIC_RRR),
122 		X2APIC_MSR(APIC_LDR),
123 		X2APIC_MSR(APIC_DFR),
124 		X2APIC_MSR(APIC_SPIV),
125 		X2APIC_MSR(APIC_ISR),
126 		X2APIC_MSR(APIC_TMR),
127 		X2APIC_MSR(APIC_IRR),
128 		X2APIC_MSR(APIC_ESR),
129 		X2APIC_MSR(APIC_ICR),
130 		X2APIC_MSR(APIC_ICR2),
131 
132 		/*
133 		 * Note!  Always intercept LVTT, as TSC-deadline timer mode
134 		 * isn't virtualized by hardware, and the CPU will generate a
135 		 * #GP instead of a #VMEXIT.
136 		 */
137 		X2APIC_MSR(APIC_LVTTHMR),
138 		X2APIC_MSR(APIC_LVTPC),
139 		X2APIC_MSR(APIC_LVT0),
140 		X2APIC_MSR(APIC_LVT1),
141 		X2APIC_MSR(APIC_LVTERR),
142 		X2APIC_MSR(APIC_TMICT),
143 		X2APIC_MSR(APIC_TMCCT),
144 		X2APIC_MSR(APIC_TDCR),
145 	};
146 	int i;
147 
148 	if (intercept == svm->x2avic_msrs_intercepted)
149 		return;
150 
151 	if (!x2avic_enabled)
152 		return;
153 
154 	for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++)
155 		svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i],
156 					  MSR_TYPE_RW, intercept);
157 
158 	svm->x2avic_msrs_intercepted = intercept;
159 }
160 
avic_activate_vmcb(struct vcpu_svm * svm)161 static void avic_activate_vmcb(struct vcpu_svm *svm)
162 {
163 	struct vmcb *vmcb = svm->vmcb01.ptr;
164 
165 	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
166 	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
167 
168 	vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
169 
170 	/*
171 	 * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
172 	 * accesses, while interrupt injection to a running vCPU can be
173 	 * achieved using AVIC doorbell.  KVM disables the APIC access page
174 	 * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
175 	 * AVIC in hybrid mode activates only the doorbell mechanism.
176 	 */
177 	if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
178 		vmcb->control.int_ctl |= X2APIC_MODE_MASK;
179 		vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
180 		/* Disabling MSR intercept for x2APIC registers */
181 		avic_set_x2apic_msr_interception(svm, false);
182 	} else {
183 		/*
184 		 * Flush the TLB, the guest may have inserted a non-APIC
185 		 * mapping into the TLB while AVIC was disabled.
186 		 */
187 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
188 
189 		/* For xAVIC and hybrid-xAVIC modes */
190 		vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
191 		/* Enabling MSR intercept for x2APIC registers */
192 		avic_set_x2apic_msr_interception(svm, true);
193 	}
194 }
195 
avic_deactivate_vmcb(struct vcpu_svm * svm)196 static void avic_deactivate_vmcb(struct vcpu_svm *svm)
197 {
198 	struct vmcb *vmcb = svm->vmcb01.ptr;
199 
200 	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
201 	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
202 
203 	/*
204 	 * If running nested and the guest uses its own MSR bitmap, there
205 	 * is no need to update L0's msr bitmap
206 	 */
207 	if (is_guest_mode(&svm->vcpu) &&
208 	    vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))
209 		return;
210 
211 	/* Enabling MSR intercept for x2APIC registers */
212 	avic_set_x2apic_msr_interception(svm, true);
213 }
214 
215 /* Note:
216  * This function is called from IOMMU driver to notify
217  * SVM to schedule in a particular vCPU of a particular VM.
218  */
avic_ga_log_notifier(u32 ga_tag)219 int avic_ga_log_notifier(u32 ga_tag)
220 {
221 	unsigned long flags;
222 	struct kvm_svm *kvm_svm;
223 	struct kvm_vcpu *vcpu = NULL;
224 	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
225 	u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag);
226 
227 	pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx);
228 	trace_kvm_avic_ga_log(vm_id, vcpu_idx);
229 
230 	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
231 	hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
232 		if (kvm_svm->avic_vm_id != vm_id)
233 			continue;
234 		vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx);
235 		break;
236 	}
237 	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
238 
239 	/* Note:
240 	 * At this point, the IOMMU should have already set the pending
241 	 * bit in the vAPIC backing page. So, we just need to schedule
242 	 * in the vcpu.
243 	 */
244 	if (vcpu)
245 		kvm_vcpu_wake_up(vcpu);
246 
247 	return 0;
248 }
249 
avic_vm_destroy(struct kvm * kvm)250 void avic_vm_destroy(struct kvm *kvm)
251 {
252 	unsigned long flags;
253 	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
254 
255 	if (!enable_apicv)
256 		return;
257 
258 	free_page((unsigned long)kvm_svm->avic_logical_id_table);
259 	free_page((unsigned long)kvm_svm->avic_physical_id_table);
260 
261 	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
262 	hash_del(&kvm_svm->hnode);
263 	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
264 }
265 
avic_vm_init(struct kvm * kvm)266 int avic_vm_init(struct kvm *kvm)
267 {
268 	unsigned long flags;
269 	int err = -ENOMEM;
270 	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
271 	struct kvm_svm *k2;
272 	u32 vm_id;
273 
274 	if (!enable_apicv)
275 		return 0;
276 
277 	kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
278 	if (!kvm_svm->avic_physical_id_table)
279 		goto free_avic;
280 
281 	kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
282 	if (!kvm_svm->avic_logical_id_table)
283 		goto free_avic;
284 
285 	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
286  again:
287 	vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
288 	if (vm_id == 0) { /* id is 1-based, zero is not okay */
289 		next_vm_id_wrapped = 1;
290 		goto again;
291 	}
292 	/* Is it still in use? Only possible if wrapped at least once */
293 	if (next_vm_id_wrapped) {
294 		hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
295 			if (k2->avic_vm_id == vm_id)
296 				goto again;
297 		}
298 	}
299 	kvm_svm->avic_vm_id = vm_id;
300 	hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
301 	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
302 
303 	return 0;
304 
305 free_avic:
306 	avic_vm_destroy(kvm);
307 	return err;
308 }
309 
avic_get_backing_page_address(struct vcpu_svm * svm)310 static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
311 {
312 	return __sme_set(__pa(svm->vcpu.arch.apic->regs));
313 }
314 
avic_init_vmcb(struct vcpu_svm * svm,struct vmcb * vmcb)315 void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
316 {
317 	struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
318 
319 	vmcb->control.avic_backing_page = avic_get_backing_page_address(svm);
320 	vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table));
321 	vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table));
322 	vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE;
323 
324 	if (kvm_apicv_activated(svm->vcpu.kvm))
325 		avic_activate_vmcb(svm);
326 	else
327 		avic_deactivate_vmcb(svm);
328 }
329 
avic_init_backing_page(struct kvm_vcpu * vcpu)330 static int avic_init_backing_page(struct kvm_vcpu *vcpu)
331 {
332 	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
333 	struct vcpu_svm *svm = to_svm(vcpu);
334 	u32 id = vcpu->vcpu_id;
335 	u64 new_entry;
336 
337 	/*
338 	 * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC
339 	 * hardware.  Immediately clear apicv_active, i.e. don't wait until the
340 	 * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as
341 	 * avic_vcpu_load() expects to be called if and only if the vCPU has
342 	 * fully initialized AVIC.
343 	 */
344 	if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
345 	    (id > X2AVIC_MAX_PHYSICAL_ID)) {
346 		kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
347 		vcpu->arch.apic->apicv_active = false;
348 		return 0;
349 	}
350 
351 	BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE ||
352 		     (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE);
353 
354 	if (WARN_ON_ONCE(!vcpu->arch.apic->regs))
355 		return -EINVAL;
356 
357 	if (kvm_apicv_activated(vcpu->kvm)) {
358 		int ret;
359 
360 		/*
361 		 * Note, AVIC hardware walks the nested page table to check
362 		 * permissions, but does not use the SPA address specified in
363 		 * the leaf SPTE since it uses address in the AVIC_BACKING_PAGE
364 		 * pointer field of the VMCB.
365 		 */
366 		ret = kvm_alloc_apic_access_page(vcpu->kvm);
367 		if (ret)
368 			return ret;
369 	}
370 
371 	/* Note, fls64() returns the bit position, +1. */
372 	BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT >
373 		     fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK));
374 
375 	/* Setting AVIC backing page address in the phy APIC ID table */
376 	new_entry = avic_get_backing_page_address(svm) |
377 		    AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
378 	svm->avic_physical_id_entry = new_entry;
379 
380 	/*
381 	 * Initialize the real table, as vCPUs must have a valid entry in order
382 	 * for broadcast IPIs to function correctly (broadcast IPIs ignore
383 	 * invalid entries, i.e. aren't guaranteed to generate a VM-Exit).
384 	 */
385 	WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry);
386 
387 	return 0;
388 }
389 
avic_ring_doorbell(struct kvm_vcpu * vcpu)390 void avic_ring_doorbell(struct kvm_vcpu *vcpu)
391 {
392 	/*
393 	 * Note, the vCPU could get migrated to a different pCPU at any point,
394 	 * which could result in signalling the wrong/previous pCPU.  But if
395 	 * that happens the vCPU is guaranteed to do a VMRUN (after being
396 	 * migrated) and thus will process pending interrupts, i.e. a doorbell
397 	 * is not needed (and the spurious one is harmless).
398 	 */
399 	int cpu = READ_ONCE(vcpu->cpu);
400 
401 	if (cpu != get_cpu()) {
402 		wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
403 		trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu));
404 	}
405 	put_cpu();
406 }
407 
408 
avic_kick_vcpu(struct kvm_vcpu * vcpu,u32 icrl)409 static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
410 {
411 	vcpu->arch.apic->irr_pending = true;
412 	svm_complete_interrupt_delivery(vcpu,
413 					icrl & APIC_MODE_MASK,
414 					icrl & APIC_INT_LEVELTRIG,
415 					icrl & APIC_VECTOR_MASK);
416 }
417 
avic_kick_vcpu_by_physical_id(struct kvm * kvm,u32 physical_id,u32 icrl)418 static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
419 					  u32 icrl)
420 {
421 	/*
422 	 * KVM inhibits AVIC if any vCPU ID diverges from the vCPUs APIC ID,
423 	 * i.e. APIC ID == vCPU ID.
424 	 */
425 	struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);
426 
427 	/* Once again, nothing to do if the target vCPU doesn't exist. */
428 	if (unlikely(!target_vcpu))
429 		return;
430 
431 	avic_kick_vcpu(target_vcpu, icrl);
432 }
433 
avic_kick_vcpu_by_logical_id(struct kvm * kvm,u32 * avic_logical_id_table,u32 logid_index,u32 icrl)434 static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
435 					 u32 logid_index, u32 icrl)
436 {
437 	u32 physical_id;
438 
439 	if (avic_logical_id_table) {
440 		u32 logid_entry = avic_logical_id_table[logid_index];
441 
442 		/* Nothing to do if the logical destination is invalid. */
443 		if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
444 			return;
445 
446 		physical_id = logid_entry &
447 			      AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
448 	} else {
449 		/*
450 		 * For x2APIC, the logical APIC ID is a read-only value that is
451 		 * derived from the x2APIC ID, thus the x2APIC ID can be found
452 		 * by reversing the calculation (stored in logid_index).  Note,
453 		 * bits 31:20 of the x2APIC ID aren't propagated to the logical
454 		 * ID, but KVM limits the x2APIC ID limited to KVM_MAX_VCPU_IDS.
455 		 */
456 		physical_id = logid_index;
457 	}
458 
459 	avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
460 }
461 
462 /*
463  * A fast-path version of avic_kick_target_vcpus(), which attempts to match
464  * destination APIC ID to vCPU without looping through all vCPUs.
465  */
avic_kick_target_vcpus_fast(struct kvm * kvm,struct kvm_lapic * source,u32 icrl,u32 icrh,u32 index)466 static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
467 				       u32 icrl, u32 icrh, u32 index)
468 {
469 	int dest_mode = icrl & APIC_DEST_MASK;
470 	int shorthand = icrl & APIC_SHORT_MASK;
471 	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
472 	u32 dest;
473 
474 	if (shorthand != APIC_DEST_NOSHORT)
475 		return -EINVAL;
476 
477 	if (apic_x2apic_mode(source))
478 		dest = icrh;
479 	else
480 		dest = GET_XAPIC_DEST_FIELD(icrh);
481 
482 	if (dest_mode == APIC_DEST_PHYSICAL) {
483 		/* broadcast destination, use slow path */
484 		if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
485 			return -EINVAL;
486 		if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
487 			return -EINVAL;
488 
489 		if (WARN_ON_ONCE(dest != index))
490 			return -EINVAL;
491 
492 		avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
493 	} else {
494 		u32 *avic_logical_id_table;
495 		unsigned long bitmap, i;
496 		u32 cluster;
497 
498 		if (apic_x2apic_mode(source)) {
499 			/* 16 bit dest mask, 16 bit cluster id */
500 			bitmap = dest & 0xFFFF;
501 			cluster = (dest >> 16) << 4;
502 		} else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
503 			/* 8 bit dest mask*/
504 			bitmap = dest;
505 			cluster = 0;
506 		} else {
507 			/* 4 bit desk mask, 4 bit cluster id */
508 			bitmap = dest & 0xF;
509 			cluster = (dest >> 4) << 2;
510 		}
511 
512 		/* Nothing to do if there are no destinations in the cluster. */
513 		if (unlikely(!bitmap))
514 			return 0;
515 
516 		if (apic_x2apic_mode(source))
517 			avic_logical_id_table = NULL;
518 		else
519 			avic_logical_id_table = kvm_svm->avic_logical_id_table;
520 
521 		/*
522 		 * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
523 		 * IDs, thus each bit in the destination is guaranteed to map
524 		 * to at most one vCPU.
525 		 */
526 		for_each_set_bit(i, &bitmap, 16)
527 			avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
528 						     cluster + i, icrl);
529 	}
530 
531 	return 0;
532 }
533 
avic_kick_target_vcpus(struct kvm * kvm,struct kvm_lapic * source,u32 icrl,u32 icrh,u32 index)534 static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
535 				   u32 icrl, u32 icrh, u32 index)
536 {
537 	u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
538 	unsigned long i;
539 	struct kvm_vcpu *vcpu;
540 
541 	if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
542 		return;
543 
544 	trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);
545 
546 	/*
547 	 * Wake any target vCPUs that are blocking, i.e. waiting for a wake
548 	 * event.  There's no need to signal doorbells, as hardware has handled
549 	 * vCPUs that were in guest at the time of the IPI, and vCPUs that have
550 	 * since entered the guest will have processed pending IRQs at VMRUN.
551 	 */
552 	kvm_for_each_vcpu(i, vcpu, kvm) {
553 		if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
554 					dest, icrl & APIC_DEST_MASK))
555 			avic_kick_vcpu(vcpu, icrl);
556 	}
557 }
558 
avic_incomplete_ipi_interception(struct kvm_vcpu * vcpu)559 int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
560 {
561 	struct vcpu_svm *svm = to_svm(vcpu);
562 	u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
563 	u32 icrl = svm->vmcb->control.exit_info_1;
564 	u32 id = svm->vmcb->control.exit_info_2 >> 32;
565 	u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
566 	struct kvm_lapic *apic = vcpu->arch.apic;
567 
568 	trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
569 
570 	switch (id) {
571 	case AVIC_IPI_FAILURE_INVALID_TARGET:
572 	case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
573 		/*
574 		 * Emulate IPIs that are not handled by AVIC hardware, which
575 		 * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
576 		 * if _any_ targets are invalid, e.g. if the logical mode mask
577 		 * is a superset of running vCPUs.
578 		 *
579 		 * The exit is a trap, e.g. ICR holds the correct value and RIP
580 		 * has been advanced, KVM is responsible only for emulating the
581 		 * IPI.  Sadly, hardware may sometimes leave the BUSY flag set,
582 		 * in which case KVM needs to emulate the ICR write as well in
583 		 * order to clear the BUSY flag.
584 		 */
585 		if (icrl & APIC_ICR_BUSY)
586 			kvm_apic_write_nodecode(vcpu, APIC_ICR);
587 		else
588 			kvm_apic_send_ipi(apic, icrl, icrh);
589 		break;
590 	case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
591 		/*
592 		 * At this point, we expect that the AVIC HW has already
593 		 * set the appropriate IRR bits on the valid target
594 		 * vcpus. So, we just need to kick the appropriate vcpu.
595 		 */
596 		avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
597 		break;
598 	case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
599 		WARN_ONCE(1, "Invalid backing page\n");
600 		break;
601 	case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR:
602 		/* Invalid IPI with vector < 16 */
603 		break;
604 	default:
605 		vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n");
606 	}
607 
608 	return 1;
609 }
610 
avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu * vcpu)611 unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
612 {
613 	if (is_guest_mode(vcpu))
614 		return APICV_INHIBIT_REASON_NESTED;
615 	return 0;
616 }
617 
avic_get_logical_id_entry(struct kvm_vcpu * vcpu,u32 ldr,bool flat)618 static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
619 {
620 	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
621 	u32 cluster, index;
622 
623 	ldr = GET_APIC_LOGICAL_ID(ldr);
624 
625 	if (flat) {
626 		cluster = 0;
627 	} else {
628 		cluster = (ldr >> 4);
629 		if (cluster >= 0xf)
630 			return NULL;
631 		ldr &= 0xf;
632 	}
633 	if (!ldr || !is_power_of_2(ldr))
634 		return NULL;
635 
636 	index = __ffs(ldr);
637 	if (WARN_ON_ONCE(index > 7))
638 		return NULL;
639 	index += (cluster << 2);
640 
641 	return &kvm_svm->avic_logical_id_table[index];
642 }
643 
avic_ldr_write(struct kvm_vcpu * vcpu,u8 g_physical_id,u32 ldr)644 static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
645 {
646 	bool flat;
647 	u32 *entry, new_entry;
648 
649 	flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
650 	entry = avic_get_logical_id_entry(vcpu, ldr, flat);
651 	if (!entry)
652 		return;
653 
654 	new_entry = READ_ONCE(*entry);
655 	new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
656 	new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
657 	new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
658 	WRITE_ONCE(*entry, new_entry);
659 }
660 
avic_invalidate_logical_id_entry(struct kvm_vcpu * vcpu)661 static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
662 {
663 	struct vcpu_svm *svm = to_svm(vcpu);
664 	bool flat = svm->dfr_reg == APIC_DFR_FLAT;
665 	u32 *entry;
666 
667 	/* Note: x2AVIC does not use logical APIC ID table */
668 	if (apic_x2apic_mode(vcpu->arch.apic))
669 		return;
670 
671 	entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
672 	if (entry)
673 		clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
674 }
675 
avic_handle_ldr_update(struct kvm_vcpu * vcpu)676 static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
677 {
678 	struct vcpu_svm *svm = to_svm(vcpu);
679 	u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
680 	u32 id = kvm_xapic_id(vcpu->arch.apic);
681 
682 	/* AVIC does not support LDR update for x2APIC */
683 	if (apic_x2apic_mode(vcpu->arch.apic))
684 		return;
685 
686 	if (ldr == svm->ldr_reg)
687 		return;
688 
689 	avic_invalidate_logical_id_entry(vcpu);
690 
691 	svm->ldr_reg = ldr;
692 	avic_ldr_write(vcpu, id, ldr);
693 }
694 
avic_handle_dfr_update(struct kvm_vcpu * vcpu)695 static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
696 {
697 	struct vcpu_svm *svm = to_svm(vcpu);
698 	u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
699 
700 	if (svm->dfr_reg == dfr)
701 		return;
702 
703 	avic_invalidate_logical_id_entry(vcpu);
704 	svm->dfr_reg = dfr;
705 }
706 
avic_unaccel_trap_write(struct kvm_vcpu * vcpu)707 static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
708 {
709 	u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
710 				AVIC_UNACCEL_ACCESS_OFFSET_MASK;
711 
712 	switch (offset) {
713 	case APIC_LDR:
714 		avic_handle_ldr_update(vcpu);
715 		break;
716 	case APIC_DFR:
717 		avic_handle_dfr_update(vcpu);
718 		break;
719 	case APIC_RRR:
720 		/* Ignore writes to Read Remote Data, it's read-only. */
721 		return 1;
722 	default:
723 		break;
724 	}
725 
726 	kvm_apic_write_nodecode(vcpu, offset);
727 	return 1;
728 }
729 
is_avic_unaccelerated_access_trap(u32 offset)730 static bool is_avic_unaccelerated_access_trap(u32 offset)
731 {
732 	bool ret = false;
733 
734 	switch (offset) {
735 	case APIC_ID:
736 	case APIC_EOI:
737 	case APIC_RRR:
738 	case APIC_LDR:
739 	case APIC_DFR:
740 	case APIC_SPIV:
741 	case APIC_ESR:
742 	case APIC_ICR:
743 	case APIC_LVTT:
744 	case APIC_LVTTHMR:
745 	case APIC_LVTPC:
746 	case APIC_LVT0:
747 	case APIC_LVT1:
748 	case APIC_LVTERR:
749 	case APIC_TMICT:
750 	case APIC_TDCR:
751 		ret = true;
752 		break;
753 	default:
754 		break;
755 	}
756 	return ret;
757 }
758 
avic_unaccelerated_access_interception(struct kvm_vcpu * vcpu)759 int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
760 {
761 	struct vcpu_svm *svm = to_svm(vcpu);
762 	int ret = 0;
763 	u32 offset = svm->vmcb->control.exit_info_1 &
764 		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
765 	u32 vector = svm->vmcb->control.exit_info_2 &
766 		     AVIC_UNACCEL_ACCESS_VECTOR_MASK;
767 	bool write = (svm->vmcb->control.exit_info_1 >> 32) &
768 		     AVIC_UNACCEL_ACCESS_WRITE_MASK;
769 	bool trap = is_avic_unaccelerated_access_trap(offset);
770 
771 	trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
772 					    trap, write, vector);
773 	if (trap) {
774 		/* Handling Trap */
775 		WARN_ONCE(!write, "svm: Handling trap read.\n");
776 		ret = avic_unaccel_trap_write(vcpu);
777 	} else {
778 		/* Handling Fault */
779 		ret = kvm_emulate_instruction(vcpu, 0);
780 	}
781 
782 	return ret;
783 }
784 
avic_init_vcpu(struct vcpu_svm * svm)785 int avic_init_vcpu(struct vcpu_svm *svm)
786 {
787 	int ret;
788 	struct kvm_vcpu *vcpu = &svm->vcpu;
789 
790 	INIT_LIST_HEAD(&svm->ir_list);
791 	spin_lock_init(&svm->ir_list_lock);
792 
793 	if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
794 		return 0;
795 
796 	ret = avic_init_backing_page(vcpu);
797 	if (ret)
798 		return ret;
799 
800 	svm->dfr_reg = APIC_DFR_FLAT;
801 
802 	return ret;
803 }
804 
avic_apicv_post_state_restore(struct kvm_vcpu * vcpu)805 void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
806 {
807 	avic_handle_dfr_update(vcpu);
808 	avic_handle_ldr_update(vcpu);
809 }
810 
svm_ir_list_del(struct kvm_kernel_irqfd * irqfd)811 static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
812 {
813 	struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu;
814 	unsigned long flags;
815 
816 	if (!vcpu)
817 		return;
818 
819 	spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
820 	list_del(&irqfd->vcpu_list);
821 	spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
822 }
823 
avic_pi_update_irte(struct kvm_kernel_irqfd * irqfd,struct kvm * kvm,unsigned int host_irq,uint32_t guest_irq,struct kvm_vcpu * vcpu,u32 vector)824 int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
825 			unsigned int host_irq, uint32_t guest_irq,
826 			struct kvm_vcpu *vcpu, u32 vector)
827 {
828 	/*
829 	 * If the IRQ was affined to a different vCPU, remove the IRTE metadata
830 	 * from the *previous* vCPU's list.
831 	 */
832 	svm_ir_list_del(irqfd);
833 
834 	if (vcpu) {
835 		/*
836 		 * Try to enable guest_mode in IRTE, unless AVIC is inhibited,
837 		 * in which case configure the IRTE for legacy mode, but track
838 		 * the IRTE metadata so that it can be converted to guest mode
839 		 * if AVIC is enabled/uninhibited in the future.
840 		 */
841 		struct amd_iommu_pi_data pi_data = {
842 			.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
843 					     vcpu->vcpu_idx),
844 			.is_guest_mode = kvm_vcpu_apicv_active(vcpu),
845 			.vapic_addr = avic_get_backing_page_address(to_svm(vcpu)),
846 			.vector = vector,
847 		};
848 		struct vcpu_svm *svm = to_svm(vcpu);
849 		u64 entry;
850 		int ret;
851 
852 		/*
853 		 * Prevent the vCPU from being scheduled out or migrated until
854 		 * the IRTE is updated and its metadata has been added to the
855 		 * list of IRQs being posted to the vCPU, to ensure the IRTE
856 		 * isn't programmed with stale pCPU/IsRunning information.
857 		 */
858 		guard(spinlock_irqsave)(&svm->ir_list_lock);
859 
860 		/*
861 		 * Update the target pCPU for IOMMU doorbells if the vCPU is
862 		 * running.  If the vCPU is NOT running, i.e. is blocking or
863 		 * scheduled out, KVM will update the pCPU info when the vCPU
864 		 * is awakened and/or scheduled in.  See also avic_vcpu_load().
865 		 */
866 		entry = svm->avic_physical_id_entry;
867 		if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
868 			pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
869 		} else {
870 			pi_data.cpu = -1;
871 			pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
872 		}
873 
874 		ret = irq_set_vcpu_affinity(host_irq, &pi_data);
875 		if (ret)
876 			return ret;
877 
878 		/*
879 		 * Revert to legacy mode if the IOMMU didn't provide metadata
880 		 * for the IRTE, which KVM needs to keep the IRTE up-to-date,
881 		 * e.g. if the vCPU is migrated or AVIC is disabled.
882 		 */
883 		if (WARN_ON_ONCE(!pi_data.ir_data)) {
884 			irq_set_vcpu_affinity(host_irq, NULL);
885 			return -EIO;
886 		}
887 
888 		irqfd->irq_bypass_data = pi_data.ir_data;
889 		list_add(&irqfd->vcpu_list, &svm->ir_list);
890 		return 0;
891 	}
892 	return irq_set_vcpu_affinity(host_irq, NULL);
893 }
894 
895 enum avic_vcpu_action {
896 	/*
897 	 * There is no need to differentiate between activate and deactivate,
898 	 * as KVM only refreshes AVIC state when the vCPU is scheduled in and
899 	 * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is
900 	 * being (de)activated.
901 	 */
902 	AVIC_TOGGLE_ON_OFF	= BIT(0),
903 	AVIC_ACTIVATE		= AVIC_TOGGLE_ON_OFF,
904 	AVIC_DEACTIVATE		= AVIC_TOGGLE_ON_OFF,
905 
906 	/*
907 	 * No unique action is required to deal with a vCPU that stops/starts
908 	 * running.  A vCPU that starts running by definition stops blocking as
909 	 * well, and a vCPU that stops running can't have been blocking, i.e.
910 	 * doesn't need to toggle GALogIntr.
911 	 */
912 	AVIC_START_RUNNING	= 0,
913 	AVIC_STOP_RUNNING	= 0,
914 
915 	/*
916 	 * When a vCPU starts blocking, KVM needs to set the GALogIntr flag
917 	 * int all associated IRTEs so that KVM can wake the vCPU if an IRQ is
918 	 * sent to the vCPU.
919 	 */
920 	AVIC_START_BLOCKING	= BIT(1),
921 };
922 
avic_update_iommu_vcpu_affinity(struct kvm_vcpu * vcpu,int cpu,enum avic_vcpu_action action)923 static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu,
924 					    enum avic_vcpu_action action)
925 {
926 	bool ga_log_intr = (action & AVIC_START_BLOCKING);
927 	struct vcpu_svm *svm = to_svm(vcpu);
928 	struct kvm_kernel_irqfd *irqfd;
929 
930 	lockdep_assert_held(&svm->ir_list_lock);
931 
932 	/*
933 	 * Here, we go through the per-vcpu ir_list to update all existing
934 	 * interrupt remapping table entry targeting this vcpu.
935 	 */
936 	if (list_empty(&svm->ir_list))
937 		return;
938 
939 	list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
940 		void *data = irqfd->irq_bypass_data;
941 
942 		if (!(action & AVIC_TOGGLE_ON_OFF))
943 			WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr));
944 		else if (cpu >= 0)
945 			WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr));
946 		else
947 			WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data));
948 	}
949 }
950 
__avic_vcpu_load(struct kvm_vcpu * vcpu,int cpu,enum avic_vcpu_action action)951 static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
952 			     enum avic_vcpu_action action)
953 {
954 	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
955 	int h_physical_id = kvm_cpu_get_apicid(cpu);
956 	struct vcpu_svm *svm = to_svm(vcpu);
957 	unsigned long flags;
958 	u64 entry;
959 
960 	lockdep_assert_preemption_disabled();
961 
962 	if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
963 		return;
964 
965 	if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
966 		return;
967 
968 	/*
969 	 * Grab the per-vCPU interrupt remapping lock even if the VM doesn't
970 	 * _currently_ have assigned devices, as that can change.  Holding
971 	 * ir_list_lock ensures that either svm_ir_list_add() will consume
972 	 * up-to-date entry information, or that this task will wait until
973 	 * svm_ir_list_add() completes to set the new target pCPU.
974 	 */
975 	spin_lock_irqsave(&svm->ir_list_lock, flags);
976 
977 	entry = svm->avic_physical_id_entry;
978 	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
979 
980 	entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK |
981 		   AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
982 	entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
983 	entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
984 
985 	svm->avic_physical_id_entry = entry;
986 
987 	/*
988 	 * If IPI virtualization is disabled, clear IsRunning when updating the
989 	 * actual Physical ID table, so that the CPU never sees IsRunning=1.
990 	 * Keep the APIC ID up-to-date in the entry to minimize the chances of
991 	 * things going sideways if hardware peeks at the ID.
992 	 */
993 	if (!enable_ipiv)
994 		entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
995 
996 	WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
997 
998 	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);
999 
1000 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
1001 }
1002 
avic_vcpu_load(struct kvm_vcpu * vcpu,int cpu)1003 void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1004 {
1005 	/*
1006 	 * No need to update anything if the vCPU is blocking, i.e. if the vCPU
1007 	 * is being scheduled in after being preempted.  The CPU entries in the
1008 	 * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
1009 	 * If the vCPU was migrated, its new CPU value will be stuffed when the
1010 	 * vCPU unblocks.
1011 	 */
1012 	if (kvm_vcpu_is_blocking(vcpu))
1013 		return;
1014 
1015 	__avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING);
1016 }
1017 
__avic_vcpu_put(struct kvm_vcpu * vcpu,enum avic_vcpu_action action)1018 static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
1019 {
1020 	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
1021 	struct vcpu_svm *svm = to_svm(vcpu);
1022 	unsigned long flags;
1023 	u64 entry = svm->avic_physical_id_entry;
1024 
1025 	lockdep_assert_preemption_disabled();
1026 
1027 	if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
1028 		return;
1029 
1030 	/*
1031 	 * Take and hold the per-vCPU interrupt remapping lock while updating
1032 	 * the Physical ID entry even though the lock doesn't protect against
1033 	 * multiple writers (see above).  Holding ir_list_lock ensures that
1034 	 * either svm_ir_list_add() will consume up-to-date entry information,
1035 	 * or that this task will wait until svm_ir_list_add() completes to
1036 	 * mark the vCPU as not running.
1037 	 */
1038 	spin_lock_irqsave(&svm->ir_list_lock, flags);
1039 
1040 	avic_update_iommu_vcpu_affinity(vcpu, -1, action);
1041 
1042 	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
1043 
1044 	/*
1045 	 * Keep the previous APIC ID in the entry so that a rogue doorbell from
1046 	 * hardware is at least restricted to a CPU associated with the vCPU.
1047 	 */
1048 	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1049 
1050 	if (enable_ipiv)
1051 		WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
1052 
1053 	/*
1054 	 * Note!  Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as
1055 	 * it's a synthetic flag that usurps an unused should-be-zero bit.
1056 	 */
1057 	if (action & AVIC_START_BLOCKING)
1058 		entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
1059 
1060 	svm->avic_physical_id_entry = entry;
1061 
1062 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
1063 }
1064 
avic_vcpu_put(struct kvm_vcpu * vcpu)1065 void avic_vcpu_put(struct kvm_vcpu *vcpu)
1066 {
1067 	/*
1068 	 * Note, reading the Physical ID entry outside of ir_list_lock is safe
1069 	 * as only the pCPU that has loaded (or is loading) the vCPU is allowed
1070 	 * to modify the entry, and preemption is disabled.  I.e. the vCPU
1071 	 * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
1072 	 * recursively.
1073 	 */
1074 	u64 entry = to_svm(vcpu)->avic_physical_id_entry;
1075 
1076 	/*
1077 	 * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the
1078 	 * vCPU is preempted while its in the process of blocking.  WARN if the
1079 	 * vCPU wasn't running and isn't blocking, KVM shouldn't attempt to put
1080 	 * the AVIC if it wasn't previously loaded.
1081 	 */
1082 	if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) {
1083 		if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu)))
1084 			return;
1085 
1086 		/*
1087 		 * The vCPU was preempted while blocking, ensure its IRTEs are
1088 		 * configured to generate GA Log Interrupts.
1089 		 */
1090 		if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR))))
1091 			return;
1092 	}
1093 
1094 	__avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING :
1095 							   AVIC_STOP_RUNNING);
1096 }
1097 
avic_refresh_virtual_apic_mode(struct kvm_vcpu * vcpu)1098 void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
1099 {
1100 	struct vcpu_svm *svm = to_svm(vcpu);
1101 	struct vmcb *vmcb = svm->vmcb01.ptr;
1102 
1103 	if (!lapic_in_kernel(vcpu) || !enable_apicv)
1104 		return;
1105 
1106 	if (kvm_vcpu_apicv_active(vcpu)) {
1107 		/**
1108 		 * During AVIC temporary deactivation, guest could update
1109 		 * APIC ID, DFR and LDR registers, which would not be trapped
1110 		 * by avic_unaccelerated_access_interception(). In this case,
1111 		 * we need to check and update the AVIC logical APIC ID table
1112 		 * accordingly before re-activating.
1113 		 */
1114 		avic_apicv_post_state_restore(vcpu);
1115 		avic_activate_vmcb(svm);
1116 	} else {
1117 		avic_deactivate_vmcb(svm);
1118 	}
1119 	vmcb_mark_dirty(vmcb, VMCB_AVIC);
1120 }
1121 
avic_refresh_apicv_exec_ctrl(struct kvm_vcpu * vcpu)1122 void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
1123 {
1124 	if (!enable_apicv)
1125 		return;
1126 
1127 	/* APICv should only be toggled on/off while the vCPU is running. */
1128 	WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu));
1129 
1130 	avic_refresh_virtual_apic_mode(vcpu);
1131 
1132 	if (kvm_vcpu_apicv_active(vcpu))
1133 		__avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE);
1134 	else
1135 		__avic_vcpu_put(vcpu, AVIC_DEACTIVATE);
1136 }
1137 
avic_vcpu_blocking(struct kvm_vcpu * vcpu)1138 void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
1139 {
1140 	if (!kvm_vcpu_apicv_active(vcpu))
1141 		return;
1142 
1143 	/*
1144 	 * Unload the AVIC when the vCPU is about to block, _before_ the vCPU
1145 	 * actually blocks.
1146 	 *
1147 	 * Note, any IRQs that arrive before IsRunning=0 will not cause an
1148 	 * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles
1149 	 * this by checking vIRR one last time before blocking.  The memory
1150 	 * barrier implicit in set_current_state orders writing IsRunning=0
1151 	 * before reading the vIRR.  The processor needs a matching memory
1152 	 * barrier on interrupt delivery between writing IRR and reading
1153 	 * IsRunning; the lack of this barrier might be the cause of errata #1235).
1154 	 *
1155 	 * Clear IsRunning=0 even if guest IRQs are disabled, i.e. even if KVM
1156 	 * doesn't need to detect events for scheduling purposes.  The doorbell
1157 	 * used to signal running vCPUs cannot be blocked, i.e. will perturb the
1158 	 * CPU and cause noisy neighbor problems if the VM is sending interrupts
1159 	 * to the vCPU while it's scheduled out.
1160 	 */
1161 	__avic_vcpu_put(vcpu, AVIC_START_BLOCKING);
1162 }
1163 
avic_vcpu_unblocking(struct kvm_vcpu * vcpu)1164 void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
1165 {
1166 	if (!kvm_vcpu_apicv_active(vcpu))
1167 		return;
1168 
1169 	avic_vcpu_load(vcpu, vcpu->cpu);
1170 }
1171 
avic_want_avic_enabled(void)1172 static bool __init avic_want_avic_enabled(void)
1173 {
1174 	/*
1175 	 * In "auto" mode, enable AVIC by default for Zen4+ if x2AVIC is
1176 	 * supported (to avoid enabling partial support by default, and because
1177 	 * x2AVIC should be supported by all Zen4+ CPUs).  Explicitly check for
1178 	 * family 0x19 and later (Zen5+), as the kernel's synthetic ZenX flags
1179 	 * aren't inclusive of previous generations, i.e. the kernel will set
1180 	 * at most one ZenX feature flag.
1181 	 */
1182 	if (avic == AVIC_AUTO_MODE)
1183 		avic = boot_cpu_has(X86_FEATURE_X2AVIC) &&
1184 		       (boot_cpu_data.x86 > 0x19 || cpu_feature_enabled(X86_FEATURE_ZEN4));
1185 
1186 	if (!avic || !npt_enabled)
1187 		return false;
1188 
1189 	/* AVIC is a prerequisite for x2AVIC. */
1190 	if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
1191 		if (boot_cpu_has(X86_FEATURE_X2AVIC))
1192 			pr_warn(FW_BUG "Cannot enable x2AVIC, AVIC is unsupported\n");
1193 		return false;
1194 	}
1195 
1196 	if (cc_platform_has(CC_ATTR_HOST_SEV_SNP) &&
1197 	    !boot_cpu_has(X86_FEATURE_HV_INUSE_WR_ALLOWED)) {
1198 		pr_warn("AVIC disabled: missing HvInUseWrAllowed on SNP-enabled system\n");
1199 		return false;
1200 	}
1201 
1202 	/*
1203 	 * Print a scary message if AVIC is force enabled to make it abundantly
1204 	 * clear that ignoring CPUID could have repercussions.  See Revision
1205 	 * Guide for specific AMD processor for more details.
1206 	 */
1207 	if (!boot_cpu_has(X86_FEATURE_AVIC))
1208 		pr_warn("AVIC unsupported in CPUID but force enabled, your system might crash and burn\n");
1209 
1210 	return true;
1211 }
1212 
1213 /*
1214  * Note:
1215  * - The module param avic enable both xAPIC and x2APIC mode.
1216  * - Hypervisor can support both xAVIC and x2AVIC in the same guest.
1217  * - The mode can be switched at run-time.
1218  */
avic_hardware_setup(void)1219 bool __init avic_hardware_setup(void)
1220 {
1221 	avic = avic_want_avic_enabled();
1222 	if (!avic)
1223 		return false;
1224 
1225 	pr_info("AVIC enabled\n");
1226 
1227 	/* AVIC is a prerequisite for x2AVIC. */
1228 	x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
1229 	if (x2avic_enabled)
1230 		pr_info("x2AVIC enabled\n");
1231 	else
1232 		svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
1233 
1234 	/*
1235 	 * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
1236 	 * due to erratum 1235, which results in missed VM-Exits on the sender
1237 	 * and thus missed wake events for blocking vCPUs due to the CPU
1238 	 * failing to see a software update to clear IsRunning.
1239 	 */
1240 	enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17;
1241 
1242 	amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
1243 
1244 	return true;
1245 }
1246