1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * AMD SVM support
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9  *
10  * Authors:
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *   Avi Kivity   <avi@qumranet.com>
13  */
14 
15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16 
17 #include <linux/kvm_types.h>
18 #include <linux/hashtable.h>
19 #include <linux/amd-iommu.h>
20 #include <linux/kvm_host.h>
21 #include <linux/kvm_irqfd.h>
22 #include <linux/sysfs.h>
23 
24 #include <asm/irq_remapping.h>
25 #include <asm/msr.h>
26 
27 #include "trace.h"
28 #include "lapic.h"
29 #include "x86.h"
30 #include "irq.h"
31 #include "svm.h"
32 
33 /*
34  * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that
35  * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't
36  * be delivered, e.g. because the vCPU isn't running.  Use the vCPU's index
37  * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast
38  * lookup on the index, whereas looking up a vCPU whose index doesn't match
39  * its ID may need to walk the entire xarray of vCPUs in the worst case.
40  *
41  * For the vCPU index, use however many bits are currently allowed for the max
42  * guest physical APIC ID (limited by the size of the physical ID table), and
43  * use whatever bits remain to assign arbitrary AVIC IDs to VMs.  Note, the
44  * size of the GATag is defined by hardware (32 bits), but is an opaque value
45  * as far as hardware is concerned.
46  */
47 #define AVIC_VCPU_IDX_MASK		AVIC_PHYSICAL_MAX_INDEX_MASK
48 
49 #define AVIC_VM_ID_SHIFT		HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
50 #define AVIC_VM_ID_MASK			(GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)
51 
52 #define AVIC_GATAG_TO_VMID(x)		(((x) >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
53 #define AVIC_GATAG_TO_VCPUIDX(x)	((x) & AVIC_VCPU_IDX_MASK)
54 
55 #define __AVIC_GATAG(vm_id, vcpu_idx)	((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
56 					 ((vcpu_idx) & AVIC_VCPU_IDX_MASK))
57 #define AVIC_GATAG(vm_id, vcpu_idx)					\
58 ({									\
59 	u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx);			\
60 									\
61 	WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx));	\
62 	WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id));		\
63 	ga_tag;								\
64 })
65 
66 static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u);
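
/*
 * Worked example (illustrative, not part of the driver): assuming
 * AVIC_PHYSICAL_MAX_INDEX_MASK is the 9-bit GENMASK(8, 0), so that
 * AVIC_VM_ID_SHIFT == 9:
 *
 *	u32 tag = AVIC_GATAG(5, 3);	// (5 << 9) | 3 == 0xa03
 *	AVIC_GATAG_TO_VMID(tag);	// (0xa03 >> 9) & mask == 5
 *	AVIC_GATAG_TO_VCPUIDX(tag);	// 0xa03 & 0x1ff == 3
 */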
67 
68 #define AVIC_AUTO_MODE -1
69 
70 static int avic_param_set(const char *val, const struct kernel_param *kp)
71 {
72 	if (val && sysfs_streq(val, "auto")) {
73 		*(int *)kp->arg = AVIC_AUTO_MODE;
74 		return 0;
75 	}
76 
77 	return param_set_bint(val, kp);
78 }
79 
80 static int avic_param_get(char *buffer, const struct kernel_param *kp)
81 {
82 	int val = *(int *)kp->arg;
83 
84 	if (val == AVIC_AUTO_MODE)
85 		return sysfs_emit(buffer, "N\n");
86 
87 	return param_get_bool(buffer, kp);
88 }
89 
90 static const struct kernel_param_ops avic_ops = {
91 	.flags = KERNEL_PARAM_OPS_FL_NOARG,
92 	.set = avic_param_set,
93 	.get = avic_param_get,
94 };
95 
96 /*
97  * Enable / disable AVIC.  In "auto" mode (default behavior), AVIC is enabled
98  * for Zen4+ CPUs with x2AVIC, provided all other enablement criteria are met.
99  */
100 static int __ro_after_init avic = AVIC_AUTO_MODE;
101 module_param_cb(avic, &avic_ops, &avic, 0444);
102 __MODULE_PARM_TYPE(avic, "bool");
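
/*
 * Example usage (hypothetical command lines): in addition to the usual
 * boolean values, the custom ops above accept "auto":
 *
 *	modprobe kvm_amd avic=1		# user override: always try AVIC
 *	modprobe kvm_amd avic=auto	# default: Zen4+ with x2AVIC only
 *
 * Reading /sys/module/kvm_amd/parameters/avic reports "N" while the value
 * is still AVIC_AUTO_MODE, and "Y"/"N" once it has been resolved.
 */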
103 
104 module_param(enable_ipiv, bool, 0444);
105 
106 static bool __ro_after_init force_avic;
107 module_param_unsafe(force_avic, bool, 0444);
108 
109 /* Note:
110  * This hash table maps a VM ID to a struct kvm_svm when handling
111  * an AMD IOMMU GALog notification, in order to wake a particular
112  * vCPU.
113  */
114 #define SVM_VM_DATA_HASH_BITS	8
115 static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
116 static u32 next_vm_id = 0;
117 static bool next_vm_id_wrapped;
118 static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
119 static bool x2avic_enabled;
120 static u32 x2avic_max_physical_id;
121 
122 static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm,
123 					     bool intercept)
124 {
125 	static const u32 x2avic_passthrough_msrs[] = {
126 		X2APIC_MSR(APIC_ID),
127 		X2APIC_MSR(APIC_LVR),
128 		X2APIC_MSR(APIC_TASKPRI),
129 		X2APIC_MSR(APIC_ARBPRI),
130 		X2APIC_MSR(APIC_PROCPRI),
131 		X2APIC_MSR(APIC_EOI),
132 		X2APIC_MSR(APIC_RRR),
133 		X2APIC_MSR(APIC_LDR),
134 		X2APIC_MSR(APIC_DFR),
135 		X2APIC_MSR(APIC_SPIV),
136 		X2APIC_MSR(APIC_ISR),
137 		X2APIC_MSR(APIC_TMR),
138 		X2APIC_MSR(APIC_IRR),
139 		X2APIC_MSR(APIC_ESR),
140 		X2APIC_MSR(APIC_ICR),
141 		X2APIC_MSR(APIC_ICR2),
142 
143 		/*
144 		 * Note!  Always intercept LVTT, as TSC-deadline timer mode
145 		 * isn't virtualized by hardware, and the CPU will generate a
146 		 * #GP instead of a #VMEXIT.
147 		 */
148 		X2APIC_MSR(APIC_LVTTHMR),
149 		X2APIC_MSR(APIC_LVTPC),
150 		X2APIC_MSR(APIC_LVT0),
151 		X2APIC_MSR(APIC_LVT1),
152 		X2APIC_MSR(APIC_LVTERR),
153 		X2APIC_MSR(APIC_TMICT),
154 		X2APIC_MSR(APIC_TMCCT),
155 		X2APIC_MSR(APIC_TDCR),
156 	};
157 	int i;
158 
159 	if (intercept == svm->x2avic_msrs_intercepted)
160 		return;
161 
162 	if (!x2avic_enabled)
163 		return;
164 
165 	for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++)
166 		svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i],
167 					  MSR_TYPE_RW, intercept);
168 
169 	svm->x2avic_msrs_intercepted = intercept;
170 }
171 
172 static u32 __avic_get_max_physical_id(struct kvm *kvm, struct kvm_vcpu *vcpu)
173 {
174 	u32 arch_max;
175 
176 	/*
177 	 * Return the largest size (x2APIC) when querying without a vCPU, e.g.
178 	 * to allocate the per-VM table.
179 	 */
180 	if (x2avic_enabled && (!vcpu || apic_x2apic_mode(vcpu->arch.apic)))
181 		arch_max = x2avic_max_physical_id;
182 	else
183 		arch_max = AVIC_MAX_PHYSICAL_ID;
184 
185 	/*
186 	 * Despite its name, KVM_CAP_MAX_VCPU_ID represents the maximum APIC ID
187 	 * plus one, so the max possible APIC ID is one less than that.
188 	 */
189 	return min(kvm->arch.max_vcpu_ids - 1, arch_max);
190 }
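
/*
 * Userspace sketch (illustrative, hypothetical values): KVM_CAP_MAX_VCPU_ID
 * stores "max APIC ID + 1" in kvm->arch.max_vcpu_ids, so a VMM that will
 * never create APIC IDs above 63 can shrink the table via:
 *
 *	struct kvm_enable_cap cap = {
 *		.cap = KVM_CAP_MAX_VCPU_ID,
 *		.args = { 64 },		// max possible APIC ID is 63
 *	};
 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 */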
191 
192 static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu)
193 {
194 	return __avic_get_max_physical_id(vcpu->kvm, vcpu);
195 }
196 
197 static void avic_activate_vmcb(struct vcpu_svm *svm)
198 {
199 	struct vmcb *vmcb = svm->vmcb01.ptr;
200 	struct kvm_vcpu *vcpu = &svm->vcpu;
201 
202 	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
203 	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
204 	vmcb->control.avic_physical_id |= avic_get_max_physical_id(vcpu);
205 	vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
206 
207 	svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
208 
209 	/*
210 	 * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
211 	 * accesses, while interrupt injection to a running vCPU can be
212 	 * achieved using AVIC doorbell.  KVM disables the APIC access page
213 	 * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
214 	 * AVIC in hybrid mode activates only the doorbell mechanism.
215 	 */
216 	if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
217 		vmcb->control.int_ctl |= X2APIC_MODE_MASK;
218 
219 		/* Disable MSR intercepts for x2APIC registers. */
220 		avic_set_x2apic_msr_interception(svm, false);
221 	} else {
222 		/*
223 		 * Flush the TLB, the guest may have inserted a non-APIC
224 		 * mapping into the TLB while AVIC was disabled.
225 		 */
226 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
227 
228 		/* Enable MSR intercepts for x2APIC registers. */
229 		avic_set_x2apic_msr_interception(svm, true);
230 	}
231 }
232 
233 static void avic_deactivate_vmcb(struct vcpu_svm *svm)
234 {
235 	struct vmcb *vmcb = svm->vmcb01.ptr;
236 
237 	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
238 	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
239 
240 	if (!is_sev_es_guest(&svm->vcpu))
241 		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
242 
243 	/*
244 	 * If running nested and the guest uses its own MSR bitmap, there
245 	 * is no need to update L0's MSR bitmap.
246 	 */
247 	if (is_guest_mode(&svm->vcpu) &&
248 	    vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))
249 		return;
250 
251 	/* Enable MSR intercepts for x2APIC registers. */
252 	avic_set_x2apic_msr_interception(svm, true);
253 }
254 
255 /* Note:
256  * This function is called from the IOMMU driver to notify
257  * SVM to schedule in a particular vCPU of a particular VM.
258  */
259 static int avic_ga_log_notifier(u32 ga_tag)
260 {
261 	unsigned long flags;
262 	struct kvm_svm *kvm_svm;
263 	struct kvm_vcpu *vcpu = NULL;
264 	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
265 	u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag);
266 
267 	pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx);
268 	trace_kvm_avic_ga_log(vm_id, vcpu_idx);
269 
270 	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
271 	hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
272 		if (kvm_svm->avic_vm_id != vm_id)
273 			continue;
274 		vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx);
275 		break;
276 	}
277 	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
278 
279 	/* Note:
280 	 * At this point, the IOMMU should have already set the pending
281 	 * bit in the vAPIC backing page. So, we just need to wake
282 	 * the vCPU.
283 	 */
284 	if (vcpu)
285 		kvm_vcpu_wake_up(vcpu);
286 
287 	return 0;
288 }
289 
290 static int avic_get_physical_id_table_order(struct kvm *kvm)
291 {
292 	/* Provision for the maximum physical ID supported in x2AVIC mode. */
293 	return get_order((__avic_get_max_physical_id(kvm, NULL) + 1) * sizeof(u64));
294 }
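
/*
 * E.g. (illustrative): a 512-entry table (X2AVIC_MAX_PHYSICAL_ID == 0x1ff)
 * needs (511 + 1) * 8 == 4096 bytes, i.e. get_order() == 0, while a
 * 4096-entry table (assuming X2AVIC_4K_MAX_PHYSICAL_ID == 0xfff) needs
 * 32KiB, i.e. order 3.
 */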
295 
296 int avic_alloc_physical_id_table(struct kvm *kvm)
297 {
298 	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
299 
300 	if (!irqchip_in_kernel(kvm) || !enable_apicv)
301 		return 0;
302 
303 	if (kvm_svm->avic_physical_id_table)
304 		return 0;
305 
306 	kvm_svm->avic_physical_id_table = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
307 								   avic_get_physical_id_table_order(kvm));
308 	if (!kvm_svm->avic_physical_id_table)
309 		return -ENOMEM;
310 
311 	return 0;
312 }
313 
314 void avic_vm_destroy(struct kvm *kvm)
315 {
316 	unsigned long flags;
317 	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
318 
319 	if (!enable_apicv)
320 		return;
321 
322 	free_page((unsigned long)kvm_svm->avic_logical_id_table);
323 	free_pages((unsigned long)kvm_svm->avic_physical_id_table,
324 		   avic_get_physical_id_table_order(kvm));
325 
326 	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
327 	hash_del(&kvm_svm->hnode);
328 	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
329 }
330 
331 int avic_vm_init(struct kvm *kvm)
332 {
333 	unsigned long flags;
334 	int err = -ENOMEM;
335 	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
336 	struct kvm_svm *k2;
337 	u32 vm_id;
338 
339 	if (!enable_apicv)
340 		return 0;
341 
342 	kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
343 	if (!kvm_svm->avic_logical_id_table)
344 		goto free_avic;
345 
346 	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
347  again:
348 	vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
349 	if (vm_id == 0) { /* id is 1-based, zero is not okay */
350 		next_vm_id_wrapped = 1;
351 		goto again;
352 	}
353 	/* Is it still in use? Only possible if wrapped at least once */
354 	if (next_vm_id_wrapped) {
355 		hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
356 			if (k2->avic_vm_id == vm_id)
357 				goto again;
358 		}
359 	}
360 	kvm_svm->avic_vm_id = vm_id;
361 	hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
362 	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
363 
364 	return 0;
365 
366 free_avic:
367 	avic_vm_destroy(kvm);
368 	return err;
369 }
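
/*
 * Illustrative arithmetic: with a 9-bit vCPU index field, AVIC_VM_ID_MASK
 * leaves 23 bits for the VM ID, so ~8.4 million VM creations are needed
 * before next_vm_id wraps and the in-use check above starts walking the
 * hash bucket.
 */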
370 
371 static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
372 {
373 	return __sme_set(__pa(svm->vcpu.arch.apic->regs));
374 }
375 
376 void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
377 {
378 	struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
379 
380 	vmcb->control.avic_backing_page = avic_get_backing_page_address(svm);
381 	vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table));
382 	vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table));
383 	vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE;
384 
385 	if (kvm_vcpu_apicv_active(&svm->vcpu))
386 		avic_activate_vmcb(svm);
387 	else
388 		avic_deactivate_vmcb(svm);
389 }
390 
391 static int avic_init_backing_page(struct kvm_vcpu *vcpu)
392 {
393 	u32 max_id = x2avic_enabled ? x2avic_max_physical_id : AVIC_MAX_PHYSICAL_ID;
394 	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
395 	struct vcpu_svm *svm = to_svm(vcpu);
396 	u32 id = vcpu->vcpu_id;
397 	u64 new_entry;
398 
399 	/*
400 	 * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC
401 	 * hardware.  Immediately clear apicv_active, i.e. don't wait until the
402 	 * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as
403 	 * avic_vcpu_load() expects to be called if and only if the vCPU has
404 	 * fully initialized AVIC.
405 	 */
406 	if (id > max_id) {
407 		kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
408 		vcpu->arch.apic->apicv_active = false;
409 		return 0;
410 	}
411 
412 	BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE ||
413 		     (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE);
414 
415 	if (WARN_ON_ONCE(!vcpu->arch.apic->regs))
416 		return -EINVAL;
417 
418 	if (kvm_apicv_activated(vcpu->kvm)) {
419 		int ret;
420 
421 		/*
422 		 * Note, AVIC hardware walks the nested page table to check
423 		 * permissions, but does not use the SPA address specified in
424 		 * the leaf SPTE, since it uses the address in the AVIC_BACKING_PAGE
425 		 * pointer field of the VMCB.
426 		 */
427 		ret = kvm_alloc_apic_access_page(vcpu->kvm);
428 		if (ret)
429 			return ret;
430 	}
431 
432 	/* Note, fls64() returns the bit position, +1. */
433 	BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT >
434 		     fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK));
435 
436 	/* Set the AVIC backing page address in the physical APIC ID table. */
437 	new_entry = avic_get_backing_page_address(svm) |
438 		    AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
439 	svm->avic_physical_id_entry = new_entry;
440 
441 	/*
442 	 * Initialize the real table, as vCPUs must have a valid entry in order
443 	 * for broadcast IPIs to function correctly (broadcast IPIs ignore
444 	 * invalid entries, i.e. aren't guaranteed to generate a VM-Exit).
445 	 */
446 	WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry);
447 
448 	return 0;
449 }
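
/*
 * E.g. (illustrative, ignoring the SME C-bit): a backing page at physical
 * address 0x12345000 yields a table entry of 0x8000000012345000, i.e. the
 * page address plus AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (bit 63).
 */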
450 
451 void avic_ring_doorbell(struct kvm_vcpu *vcpu)
452 {
453 	/*
454 	 * Note, the vCPU could get migrated to a different pCPU at any point,
455 	 * which could result in signalling the wrong/previous pCPU.  But if
456 	 * that happens the vCPU is guaranteed to do a VMRUN (after being
457 	 * migrated) and thus will process pending interrupts, i.e. a doorbell
458 	 * is not needed (and the spurious one is harmless).
459 	 */
460 	int cpu = READ_ONCE(vcpu->cpu);
461 
462 	if (cpu != get_cpu()) {
463 		wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
464 		trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu));
465 	}
466 	put_cpu();
467 }
468 
469 
470 static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
471 {
472 	vcpu->arch.apic->irr_pending = true;
473 	svm_complete_interrupt_delivery(vcpu,
474 					icrl & APIC_MODE_MASK,
475 					icrl & APIC_INT_LEVELTRIG,
476 					icrl & APIC_VECTOR_MASK);
477 }
478 
479 static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
480 					  u32 icrl)
481 {
482 	/*
483 	 * KVM inhibits AVIC if any vCPU's ID diverges from its APIC ID, so
484 	 * here APIC ID == vCPU ID is guaranteed.
485 	 */
486 	struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);
487 
488 	/* Once again, nothing to do if the target vCPU doesn't exist. */
489 	if (unlikely(!target_vcpu))
490 		return;
491 
492 	avic_kick_vcpu(target_vcpu, icrl);
493 }
494 
495 static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
496 					 u32 logid_index, u32 icrl)
497 {
498 	u32 physical_id;
499 
500 	if (avic_logical_id_table) {
501 		u32 logid_entry = avic_logical_id_table[logid_index];
502 
503 		/* Nothing to do if the logical destination is invalid. */
504 		if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
505 			return;
506 
507 		physical_id = logid_entry &
508 			      AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
509 	} else {
510 		/*
511 		 * For x2APIC, the logical APIC ID is a read-only value that is
512 		 * derived from the x2APIC ID, thus the x2APIC ID can be found
513 		 * by reversing the calculation (stored in logid_index).  Note,
514 		 * bits 31:20 of the x2APIC ID aren't propagated to the logical
515 		 * ID, but KVM limits the x2APIC ID to KVM_MAX_VCPU_IDS.
516 		 */
517 		physical_id = logid_index;
518 	}
519 
520 	avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
521 }
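
/*
 * E.g. (illustrative): x2APIC ID 34 has the derived logical ID
 * (34 >> 4) << 16 | BIT(34 & 0xf) == 0x20004, so the caller computes
 * logid_index (2 << 4) + 2 == 34, which reverses directly to the
 * x2APIC ID.
 */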
522 
523 /*
524  * A fast-path version of avic_kick_target_vcpus(), which attempts to match
525  * destination APIC ID to vCPU without looping through all vCPUs.
526  */
527 static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
528 				       u32 icrl, u32 icrh, u32 index)
529 {
530 	int dest_mode = icrl & APIC_DEST_MASK;
531 	int shorthand = icrl & APIC_SHORT_MASK;
532 	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
533 	u32 dest;
534 
535 	if (shorthand != APIC_DEST_NOSHORT)
536 		return -EINVAL;
537 
538 	if (apic_x2apic_mode(source))
539 		dest = icrh;
540 	else
541 		dest = GET_XAPIC_DEST_FIELD(icrh);
542 
543 	if (dest_mode == APIC_DEST_PHYSICAL) {
544 		/* broadcast destination, use slow path */
545 		if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
546 			return -EINVAL;
547 		if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
548 			return -EINVAL;
549 
550 		if (WARN_ON_ONCE(dest != index))
551 			return -EINVAL;
552 
553 		avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
554 	} else {
555 		u32 *avic_logical_id_table;
556 		unsigned long bitmap, i;
557 		u32 cluster;
558 
559 		if (apic_x2apic_mode(source)) {
560 			/* 16 bit dest mask, 16 bit cluster id */
561 			bitmap = dest & 0xFFFF;
562 			cluster = (dest >> 16) << 4;
563 		} else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
564 			/* 8 bit dest mask */
565 			bitmap = dest;
566 			cluster = 0;
567 		} else {
568 			/* 4 bit dest mask, 4 bit cluster id */
569 			bitmap = dest & 0xF;
570 			cluster = (dest >> 4) << 2;
571 		}
572 
573 		/* Nothing to do if there are no destinations in the cluster. */
574 		if (unlikely(!bitmap))
575 			return 0;
576 
577 		if (apic_x2apic_mode(source))
578 			avic_logical_id_table = NULL;
579 		else
580 			avic_logical_id_table = kvm_svm->avic_logical_id_table;
581 
582 		/*
583 		 * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
584 		 * IDs, thus each bit in the destination is guaranteed to map
585 		 * to at most one vCPU.
586 		 */
587 		for_each_set_bit(i, &bitmap, 16)
588 			avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
589 						     cluster + i, icrl);
590 	}
591 
592 	return 0;
593 }
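
/*
 * Worked example for the logical-mode fast path above (illustrative): an
 * x2APIC destination of 0x00020005 yields bitmap == 0x0005 and cluster
 * base (2 << 4) == 32, so the vCPUs with x2APIC IDs 32 and 34 are kicked.
 */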
594 
595 static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
596 				   u32 icrl, u32 icrh, u32 index)
597 {
598 	u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
599 	unsigned long i;
600 	struct kvm_vcpu *vcpu;
601 
602 	if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
603 		return;
604 
605 	trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);
606 
607 	/*
608 	 * Wake any target vCPUs that are blocking, i.e. waiting for a wake
609 	 * event.  There's no need to signal doorbells, as hardware has handled
610 	 * vCPUs that were in guest at the time of the IPI, and vCPUs that have
611 	 * since entered the guest will have processed pending IRQs at VMRUN.
612 	 */
613 	kvm_for_each_vcpu(i, vcpu, kvm) {
614 		if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
615 					dest, icrl & APIC_DEST_MASK))
616 			avic_kick_vcpu(vcpu, icrl);
617 	}
618 }
619 
620 int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
621 {
622 	struct vcpu_svm *svm = to_svm(vcpu);
623 	u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
624 	u32 icrl = svm->vmcb->control.exit_info_1;
625 	u32 id = svm->vmcb->control.exit_info_2 >> 32;
626 	u32 index = svm->vmcb->control.exit_info_2 & AVIC_PHYSICAL_MAX_INDEX_MASK;
627 	struct kvm_lapic *apic = vcpu->arch.apic;
628 
629 	trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
630 
631 	switch (id) {
632 	case AVIC_IPI_FAILURE_INVALID_TARGET:
633 	case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
634 		/*
635 		 * Emulate IPIs that are not handled by AVIC hardware, which
636 		 * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
637 		 * if _any_ targets are invalid, e.g. if the logical mode mask
638 		 * is a superset of running vCPUs.
639 		 *
640 		 * The exit is a trap, i.e. ICR holds the correct value and RIP
641 		 * has been advanced; KVM is responsible only for emulating the
642 		 * IPI.  Sadly, hardware may sometimes leave the BUSY flag set,
643 		 * in which case KVM needs to emulate the ICR write as well in
644 		 * order to clear the BUSY flag.
645 		 */
646 		if (icrl & APIC_ICR_BUSY)
647 			kvm_apic_write_nodecode(vcpu, APIC_ICR);
648 		else
649 			kvm_apic_send_ipi(apic, icrl, icrh);
650 		break;
651 	case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
652 		/*
653 		 * At this point, we expect that the AVIC HW has already
654 		 * set the appropriate IRR bits on the valid target
655 		 * vCPUs. So, we just need to kick the appropriate vCPUs.
656 		 */
657 		avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
658 		break;
659 	case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
660 		WARN_ONCE(1, "Invalid backing page\n");
661 		break;
662 	case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR:
663 		/* Invalid IPI with vector < 16 */
664 		break;
665 	default:
666 		vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n");
667 	}
668 
669 	return 1;
670 }
671 
672 unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
673 {
674 	if (is_guest_mode(vcpu))
675 		return APICV_INHIBIT_REASON_NESTED;
676 	return 0;
677 }
678 
679 static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
680 {
681 	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
682 	u32 cluster, index;
683 
684 	ldr = GET_APIC_LOGICAL_ID(ldr);
685 
686 	if (flat) {
687 		cluster = 0;
688 	} else {
689 		cluster = (ldr >> 4);
690 		if (cluster >= 0xf)
691 			return NULL;
692 		ldr &= 0xf;
693 	}
694 	if (!ldr || !is_power_of_2(ldr))
695 		return NULL;
696 
697 	index = __ffs(ldr);
698 	if (WARN_ON_ONCE(index > 7))
699 		return NULL;
700 	index += (cluster << 2);
701 
702 	return &kvm_svm->avic_logical_id_table[index];
703 }
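
/*
 * Worked examples (illustrative): a flat-mode logical ID of 0x08 maps to
 * index __ffs(0x08) == 3; a cluster-mode logical ID of 0x22 has cluster 2
 * and in-cluster bit 0x2, i.e. index __ffs(0x2) + (2 << 2) == 9.
 */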
704 
705 static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
706 {
707 	bool flat;
708 	u32 *entry, new_entry;
709 
710 	flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
711 	entry = avic_get_logical_id_entry(vcpu, ldr, flat);
712 	if (!entry)
713 		return;
714 
715 	new_entry = READ_ONCE(*entry);
716 	new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
717 	new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
718 	new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
719 	WRITE_ONCE(*entry, new_entry);
720 }
721 
722 static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
723 {
724 	struct vcpu_svm *svm = to_svm(vcpu);
725 	bool flat = svm->dfr_reg == APIC_DFR_FLAT;
726 	u32 *entry;
727 
728 	/* Note: x2AVIC does not use the logical APIC ID table. */
729 	if (apic_x2apic_mode(vcpu->arch.apic))
730 		return;
731 
732 	entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
733 	if (entry)
734 		clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
735 }
736 
737 static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
738 {
739 	struct vcpu_svm *svm = to_svm(vcpu);
740 	u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
741 	u32 id = kvm_xapic_id(vcpu->arch.apic);
742 
743 	/* AVIC does not support LDR update for x2APIC */
744 	if (apic_x2apic_mode(vcpu->arch.apic))
745 		return;
746 
747 	if (ldr == svm->ldr_reg)
748 		return;
749 
750 	avic_invalidate_logical_id_entry(vcpu);
751 
752 	svm->ldr_reg = ldr;
753 	avic_ldr_write(vcpu, id, ldr);
754 }
755 
756 static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
757 {
758 	struct vcpu_svm *svm = to_svm(vcpu);
759 	u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
760 
761 	if (svm->dfr_reg == dfr)
762 		return;
763 
764 	avic_invalidate_logical_id_entry(vcpu);
765 	svm->dfr_reg = dfr;
766 }
767 
768 static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
769 {
770 	u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
771 				AVIC_UNACCEL_ACCESS_OFFSET_MASK;
772 
773 	switch (offset) {
774 	case APIC_LDR:
775 		avic_handle_ldr_update(vcpu);
776 		break;
777 	case APIC_DFR:
778 		avic_handle_dfr_update(vcpu);
779 		break;
780 	case APIC_RRR:
781 		/* Ignore writes to Read Remote Data, it's read-only. */
782 		return 1;
783 	default:
784 		break;
785 	}
786 
787 	kvm_apic_write_nodecode(vcpu, offset);
788 	return 1;
789 }
790 
791 static bool is_avic_unaccelerated_access_trap(u32 offset)
792 {
793 	bool ret = false;
794 
795 	switch (offset) {
796 	case APIC_ID:
797 	case APIC_EOI:
798 	case APIC_RRR:
799 	case APIC_LDR:
800 	case APIC_DFR:
801 	case APIC_SPIV:
802 	case APIC_ESR:
803 	case APIC_ICR:
804 	case APIC_LVTT:
805 	case APIC_LVTTHMR:
806 	case APIC_LVTPC:
807 	case APIC_LVT0:
808 	case APIC_LVT1:
809 	case APIC_LVTERR:
810 	case APIC_TMICT:
811 	case APIC_TDCR:
812 		ret = true;
813 		break;
814 	default:
815 		break;
816 	}
817 	return ret;
818 }
819 
820 int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
821 {
822 	struct vcpu_svm *svm = to_svm(vcpu);
823 	int ret = 0;
824 	u32 offset = svm->vmcb->control.exit_info_1 &
825 		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
826 	u32 vector = svm->vmcb->control.exit_info_2 &
827 		     AVIC_UNACCEL_ACCESS_VECTOR_MASK;
828 	bool write = (svm->vmcb->control.exit_info_1 >> 32) &
829 		     AVIC_UNACCEL_ACCESS_WRITE_MASK;
830 	bool trap = is_avic_unaccelerated_access_trap(offset);
831 
832 	trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
833 					    trap, write, vector);
834 	if (trap) {
835 		/* Handling Trap */
836 		WARN_ONCE(!write, "svm: Handling trap read.\n");
837 		ret = avic_unaccel_trap_write(vcpu);
838 	} else {
839 		/* Handling Fault */
840 		ret = kvm_emulate_instruction(vcpu, 0);
841 	}
842 
843 	return ret;
844 }
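
/*
 * Worked decode (illustrative, hypothetical EXITINFO values): exit_info_1
 * of 0x1000000d0 is a write (bit 32 set) to APIC offset 0xd0 (APIC_LDR),
 * which is_avic_unaccelerated_access_trap() classifies as a trap, so it
 * is handled via avic_unaccel_trap_write().
 */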
845 
846 int avic_init_vcpu(struct vcpu_svm *svm)
847 {
848 	int ret;
849 	struct kvm_vcpu *vcpu = &svm->vcpu;
850 
851 	INIT_LIST_HEAD(&svm->ir_list);
852 	raw_spin_lock_init(&svm->ir_list_lock);
853 
854 	if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
855 		return 0;
856 
857 	ret = avic_init_backing_page(vcpu);
858 	if (ret)
859 		return ret;
860 
861 	svm->dfr_reg = APIC_DFR_FLAT;
862 
863 	return ret;
864 }
865 
866 void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
867 {
868 	avic_handle_dfr_update(vcpu);
869 	avic_handle_ldr_update(vcpu);
870 }
871 
872 static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
873 {
874 	struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu;
875 	unsigned long flags;
876 
877 	if (!vcpu)
878 		return;
879 
880 	raw_spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
881 	list_del(&irqfd->vcpu_list);
882 	raw_spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
883 }
884 
885 int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
886 			unsigned int host_irq, uint32_t guest_irq,
887 			struct kvm_vcpu *vcpu, u32 vector)
888 {
889 	/*
890 	 * If the IRQ was affined to a different vCPU, remove the IRTE metadata
891 	 * from the *previous* vCPU's list.
892 	 */
893 	svm_ir_list_del(irqfd);
894 
895 	if (vcpu) {
896 		/*
897 		 * Try to enable guest_mode in IRTE, unless AVIC is inhibited,
898 		 * in which case configure the IRTE for legacy mode, but track
899 		 * the IRTE metadata so that it can be converted to guest mode
900 		 * if AVIC is enabled/uninhibited in the future.
901 		 */
902 		struct amd_iommu_pi_data pi_data = {
903 			.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
904 					     vcpu->vcpu_idx),
905 			.is_guest_mode = kvm_vcpu_apicv_active(vcpu),
906 			.vapic_addr = avic_get_backing_page_address(to_svm(vcpu)),
907 			.vector = vector,
908 		};
909 		struct vcpu_svm *svm = to_svm(vcpu);
910 		u64 entry;
911 		int ret;
912 
913 		/*
914 		 * Prevent the vCPU from being scheduled out or migrated until
915 		 * the IRTE is updated and its metadata has been added to the
916 		 * list of IRQs being posted to the vCPU, to ensure the IRTE
917 		 * isn't programmed with stale pCPU/IsRunning information.
918 		 */
919 		guard(raw_spinlock_irqsave)(&svm->ir_list_lock);
920 
921 		/*
922 		 * Update the target pCPU for IOMMU doorbells if the vCPU is
923 		 * running.  If the vCPU is NOT running, i.e. is blocking or
924 		 * scheduled out, KVM will update the pCPU info when the vCPU
925 		 * is awakened and/or scheduled in.  See also avic_vcpu_load().
926 		 */
927 		entry = svm->avic_physical_id_entry;
928 		if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
929 			pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
930 		} else {
931 			pi_data.cpu = -1;
932 			pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
933 		}
934 
935 		ret = irq_set_vcpu_affinity(host_irq, &pi_data);
936 		if (ret)
937 			return ret;
938 
939 		/*
940 		 * Revert to legacy mode if the IOMMU didn't provide metadata
941 		 * for the IRTE, which KVM needs to keep the IRTE up-to-date,
942 		 * e.g. if the vCPU is migrated or AVIC is disabled.
943 		 */
944 		if (WARN_ON_ONCE(!pi_data.ir_data)) {
945 			irq_set_vcpu_affinity(host_irq, NULL);
946 			return -EIO;
947 		}
948 
949 		irqfd->irq_bypass_data = pi_data.ir_data;
950 		list_add(&irqfd->vcpu_list, &svm->ir_list);
951 		return 0;
952 	}
953 	return irq_set_vcpu_affinity(host_irq, NULL);
954 }
955 
956 enum avic_vcpu_action {
957 	/*
958 	 * There is no need to differentiate between activate and deactivate,
959 	 * as KVM only refreshes AVIC state when the vCPU is scheduled in and
960 	 * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is
961 	 * being (de)activated.
962 	 */
963 	AVIC_TOGGLE_ON_OFF	= BIT(0),
964 	AVIC_ACTIVATE		= AVIC_TOGGLE_ON_OFF,
965 	AVIC_DEACTIVATE		= AVIC_TOGGLE_ON_OFF,
966 
967 	/*
968 	 * No unique action is required to deal with a vCPU that stops/starts
969 	 * running.  A vCPU that starts running by definition stops blocking as
970 	 * well, and a vCPU that stops running can't have been blocking, i.e.
971 	 * doesn't need to toggle GALogIntr.
972 	 */
973 	AVIC_START_RUNNING	= 0,
974 	AVIC_STOP_RUNNING	= 0,
975 
976 	/*
977 	 * When a vCPU starts blocking, KVM needs to set the GALogIntr flag
978 	 * in all associated IRTEs so that KVM can wake the vCPU if an IRQ is
979 	 * sent to the vCPU.
980 	 */
981 	AVIC_START_BLOCKING	= BIT(1),
982 };
983 
984 static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu,
985 					    enum avic_vcpu_action action)
986 {
987 	bool ga_log_intr = (action & AVIC_START_BLOCKING);
988 	struct vcpu_svm *svm = to_svm(vcpu);
989 	struct kvm_kernel_irqfd *irqfd;
990 
991 	lockdep_assert_held(&svm->ir_list_lock);
992 
993 	/*
994 	 * Go through the per-vCPU ir_list to update all existing interrupt
995 	 * remapping table entries targeting this vCPU.
996 	 */
997 	if (list_empty(&svm->ir_list))
998 		return;
999 
1000 	list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
1001 		void *data = irqfd->irq_bypass_data;
1002 
1003 		if (!(action & AVIC_TOGGLE_ON_OFF))
1004 			WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr));
1005 		else if (cpu >= 0)
1006 			WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr));
1007 		else
1008 			WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data));
1009 	}
1010 }
1011 
1012 static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
1013 			     enum avic_vcpu_action action)
1014 {
1015 	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
1016 	int h_physical_id = kvm_cpu_get_apicid(cpu);
1017 	struct vcpu_svm *svm = to_svm(vcpu);
1018 	unsigned long flags;
1019 	u64 entry;
1020 
1021 	lockdep_assert_preemption_disabled();
1022 
1023 	if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
1024 		return;
1025 
1026 	if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >=
1027 			 PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm)))
1028 		return;
1029 
1030 	/*
1031 	 * Grab the per-vCPU interrupt remapping lock even if the VM doesn't
1032 	 * _currently_ have assigned devices, as that can change.  Holding
1033 	 * ir_list_lock ensures that either svm_ir_list_add() will consume
1034 	 * up-to-date entry information, or that this task will wait until
1035 	 * svm_ir_list_add() completes to set the new target pCPU.
1036 	 */
1037 	raw_spin_lock_irqsave(&svm->ir_list_lock, flags);
1038 
1039 	entry = svm->avic_physical_id_entry;
1040 	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
1041 
1042 	entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK |
1043 		   AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
1044 	entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
1045 	entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1046 
1047 	svm->avic_physical_id_entry = entry;
1048 
1049 	/*
1050 	 * If IPI virtualization is disabled, clear IsRunning when updating the
1051 	 * actual Physical ID table, so that the CPU never sees IsRunning=1.
1052 	 * Keep the APIC ID up-to-date in the entry to minimize the chances of
1053 	 * things going sideways if hardware peeks at the ID.
1054 	 */
1055 	if (!enable_ipiv)
1056 		entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1057 
1058 	WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
1059 
1060 	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);
1061 
1062 	raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags);
1063 }
1064 
1065 void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1066 {
1067 	/*
1068 	 * No need to update anything if the vCPU is blocking, i.e. if the vCPU
1069 	 * is being scheduled in after being preempted.  The CPU entries in the
1070 	 * Physical APIC table and IRTE are consumed iff IsRunning is '1'.
1071 	 * If the vCPU was migrated, its new CPU value will be stuffed when the
1072 	 * vCPU unblocks.
1073 	 */
1074 	if (kvm_vcpu_is_blocking(vcpu))
1075 		return;
1076 
1077 	__avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING);
1078 }
1079 
1080 static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
1081 {
1082 	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
1083 	struct vcpu_svm *svm = to_svm(vcpu);
1084 	unsigned long flags;
1085 	u64 entry = svm->avic_physical_id_entry;
1086 
1087 	lockdep_assert_preemption_disabled();
1088 
1089 	if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >=
1090 			 PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm)))
1091 		return;
1092 
1093 	/*
1094 	 * Take and hold the per-vCPU interrupt remapping lock while updating
1095 	 * the Physical ID entry even though the lock doesn't protect against
1096 	 * multiple writers (see above).  Holding ir_list_lock ensures that
1097 	 * either svm_ir_list_add() will consume up-to-date entry information,
1098 	 * or that this task will wait until svm_ir_list_add() completes to
1099 	 * mark the vCPU as not running.
1100 	 */
1101 	raw_spin_lock_irqsave(&svm->ir_list_lock, flags);
1102 
1103 	avic_update_iommu_vcpu_affinity(vcpu, -1, action);
1104 
1105 	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
1106 
1107 	/*
1108 	 * Keep the previous APIC ID in the entry so that a rogue doorbell from
1109 	 * hardware is at least restricted to a CPU associated with the vCPU.
1110 	 */
1111 	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1112 
1113 	if (enable_ipiv)
1114 		WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
1115 
1116 	/*
1117 	 * Note!  Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as
1118 	 * it's a synthetic flag that usurps an unused should-be-zero bit.
1119 	 */
1120 	if (action & AVIC_START_BLOCKING)
1121 		entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
1122 
1123 	svm->avic_physical_id_entry = entry;
1124 
1125 	raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags);
1126 }
1127 
1128 void avic_vcpu_put(struct kvm_vcpu *vcpu)
1129 {
1130 	/*
1131 	 * Note, reading the Physical ID entry outside of ir_list_lock is safe
1132 	 * as only the pCPU that has loaded (or is loading) the vCPU is allowed
1133 	 * to modify the entry, and preemption is disabled.  I.e. the vCPU
1134 	 * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
1135 	 * recursively.
1136 	 */
1137 	u64 entry = to_svm(vcpu)->avic_physical_id_entry;
1138 
1139 	/*
1140 	 * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the
1141 	 * vCPU is preempted while it's in the process of blocking.  WARN if the
1142 	 * vCPU wasn't running and isn't blocking, as KVM shouldn't attempt to
1143 	 * put the AVIC if it wasn't previously loaded.
1144 	 */
1145 	if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) {
1146 		if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu)))
1147 			return;
1148 
1149 		/*
1150 		 * The vCPU was preempted while blocking, ensure its IRTEs are
1151 		 * configured to generate GA Log Interrupts.
1152 		 */
1153 		if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR))))
1154 			return;
1155 	}
1156 
1157 	__avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING :
1158 							   AVIC_STOP_RUNNING);
1159 }
1160 
1161 void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
1162 {
1163 	struct vcpu_svm *svm = to_svm(vcpu);
1164 	struct vmcb *vmcb = svm->vmcb01.ptr;
1165 
1166 	if (!lapic_in_kernel(vcpu) || !enable_apicv)
1167 		return;
1168 
1169 	if (kvm_vcpu_apicv_active(vcpu)) {
1170 		/*
1171 		 * During temporary AVIC deactivation, the guest could update
1172 		 * APIC ID, DFR and LDR registers, which would not be trapped
1173 		 * by avic_unaccelerated_access_interception(). In this case,
1174 		 * we need to check and update the AVIC logical APIC ID table
1175 		 * accordingly before re-activating.
1176 		 */
1177 		avic_apicv_post_state_restore(vcpu);
1178 		avic_activate_vmcb(svm);
1179 	} else {
1180 		avic_deactivate_vmcb(svm);
1181 	}
1182 	vmcb_mark_dirty(vmcb, VMCB_AVIC);
1183 }
1184 
1185 void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
1186 {
1187 	if (!enable_apicv)
1188 		return;
1189 
1190 	/* APICv should only be toggled on/off while the vCPU is running. */
1191 	WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu));
1192 
1193 	avic_refresh_virtual_apic_mode(vcpu);
1194 
1195 	if (kvm_vcpu_apicv_active(vcpu))
1196 		__avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE);
1197 	else
1198 		__avic_vcpu_put(vcpu, AVIC_DEACTIVATE);
1199 }
1200 
1201 void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
1202 {
1203 	if (!kvm_vcpu_apicv_active(vcpu))
1204 		return;
1205 
1206 	/*
1207 	 * Unload the AVIC when the vCPU is about to block, _before_ the vCPU
1208 	 * actually blocks.
1209 	 *
1210 	 * Note, any IRQs that arrive before IsRunning=0 will not cause an
1211 	 * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles
1212 	 * this by checking vIRR one last time before blocking.  The memory
1213 	 * barrier implicit in set_current_state orders writing IsRunning=0
1214 	 * before reading the vIRR.  The processor needs a matching memory
1215 	 * barrier on interrupt delivery between writing IRR and reading
1216 	 * IsRunning; the lack of this barrier might be the cause of erratum #1235.
1217 	 *
1218 	 * Clear IsRunning even if guest IRQs are disabled, i.e. even if KVM
1219 	 * doesn't need to detect events for scheduling purposes.  The doorbell
1220 	 * used to signal running vCPUs cannot be blocked, i.e. will perturb the
1221 	 * CPU and cause noisy neighbor problems if the VM is sending interrupts
1222 	 * to the vCPU while it's scheduled out.
1223 	 */
1224 	__avic_vcpu_put(vcpu, AVIC_START_BLOCKING);
1225 }
1226 
1227 void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
1228 {
1229 	if (!kvm_vcpu_apicv_active(vcpu))
1230 		return;
1231 
1232 	avic_vcpu_load(vcpu, vcpu->cpu);
1233 }
1234 
1235 static bool __init avic_want_avic_enabled(void)
1236 {
1237 	/*
1238 	 * In "auto" mode, enable AVIC by default for Zen4+ if x2AVIC is
1239 	 * supported (to avoid enabling partial support by default, and because
1240 	 * x2AVIC should be supported by all Zen4+ CPUs).  Explicitly check for
1241 	 * family 0x1A and later (Zen5+), as the kernel's synthetic ZenX flags
1242 	 * aren't inclusive of previous generations, i.e. the kernel will set
1243 	 * at most one ZenX feature flag.
1244 	 */
1245 	if (avic == AVIC_AUTO_MODE)
1246 		avic = boot_cpu_has(X86_FEATURE_X2AVIC) &&
1247 		       (cpu_feature_enabled(X86_FEATURE_ZEN4) || boot_cpu_data.x86 >= 0x1A);
1248 
1249 	if (!avic || !npt_enabled)
1250 		return false;
1251 
1252 	/* AVIC is a prerequisite for x2AVIC. */
1253 	if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
1254 		if (boot_cpu_has(X86_FEATURE_X2AVIC))
1255 			pr_warn(FW_BUG "Cannot enable x2AVIC, AVIC is unsupported\n");
1256 		return false;
1257 	}
1258 
1259 	if (cc_platform_has(CC_ATTR_HOST_SEV_SNP) &&
1260 	    !boot_cpu_has(X86_FEATURE_HV_INUSE_WR_ALLOWED)) {
1261 		pr_warn("AVIC disabled: missing HvInUseWrAllowed on SNP-enabled system\n");
1262 		return false;
1263 	}
1264 
1265 	/*
1266 	 * Print a scary message if AVIC is force enabled to make it abundantly
1267 	 * clear that ignoring CPUID could have repercussions.  See the Revision
1268 	 * Guide for the specific AMD processor for more details.
1269 	 */
1270 	if (!boot_cpu_has(X86_FEATURE_AVIC))
1271 		pr_warn("AVIC unsupported in CPUID but force enabled, your system might crash and burn\n");
1272 
1273 	return true;
1274 }
1275 
1276 /*
1277  * Note:
1278  * - The module param avic enables both xAPIC and x2APIC modes.
1279  * - The hypervisor can support both xAVIC and x2AVIC in the same guest.
1280  * - The mode can be switched at run-time.
1281  */
1282 bool __init avic_hardware_setup(void)
1283 {
1284 	avic = avic_want_avic_enabled();
1285 	if (!avic)
1286 		return false;
1287 
1288 	pr_info("AVIC enabled\n");
1289 
1290 	/* AVIC is a prerequisite for x2AVIC. */
1291 	x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
1292 	if (x2avic_enabled) {
1293 		if (cpu_feature_enabled(X86_FEATURE_X2AVIC_EXT))
1294 			x2avic_max_physical_id = X2AVIC_4K_MAX_PHYSICAL_ID;
1295 		else
1296 			x2avic_max_physical_id = X2AVIC_MAX_PHYSICAL_ID;
1297 		pr_info("x2AVIC enabled (max %u vCPUs)\n", x2avic_max_physical_id + 1);
1298 	} else {
1299 		svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
1300 	}
1301 
1302 	/*
1303 	 * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
1304 	 * due to erratum 1235, which results in missed VM-Exits on the sender
1305 	 * and thus missed wake events for blocking vCPUs due to the CPU
1306 	 * failing to see a software update to clear IsRunning.
1307 	 */
1308 	enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17;
1309 
1310 	amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
1311 
1312 	return true;
1313 }
1314 
1315 void avic_hardware_unsetup(void)
1316 {
1317 	if (avic)
1318 		amd_iommu_register_ga_log_notifier(NULL);
1319 }
1320