// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * AMD SVM support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_types.h>
#include <linux/hashtable.h>
#include <linux/amd-iommu.h>
#include <linux/kvm_host.h>
#include <linux/kvm_irqfd.h>
#include <linux/sysfs.h>

#include <asm/irq_remapping.h>
#include <asm/msr.h>

#include "trace.h"
#include "lapic.h"
#include "x86.h"
#include "irq.h"
#include "svm.h"

/*
 * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that
 * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't
 * be delivered, e.g. because the vCPU isn't running. Use the vCPU's index
 * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast
 * lookup on the index, whereas vCPUs whose index doesn't match their ID need
 * to walk the entire xarray of vCPUs in the worst case scenario.
 *
 * For the vCPU index, use however many bits are currently allowed for the max
 * guest physical APIC ID (limited by the size of the physical ID table), and
 * use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the
 * size of the GATag is defined by hardware (32 bits), but is an opaque value
 * as far as hardware is concerned.
 */
#define AVIC_VCPU_IDX_MASK		AVIC_PHYSICAL_MAX_INDEX_MASK

#define AVIC_VM_ID_SHIFT		HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
#define AVIC_VM_ID_MASK			(GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)

#define AVIC_GATAG_TO_VMID(x)		((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUIDX(x)	(x & AVIC_VCPU_IDX_MASK)

#define __AVIC_GATAG(vm_id, vcpu_idx)	((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
					 ((vcpu_idx) & AVIC_VCPU_IDX_MASK))
#define AVIC_GATAG(vm_id, vcpu_idx)					\
({									\
	u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx);			\
									\
	WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx));	\
	WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id));		\
	ga_tag;								\
})

static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u);
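
/*
 * Illustrative example (assuming a 9-bit vCPU index field): vm_id 0x2a and
 * vcpu_idx 3 encode as (0x2a << 9) | 3 = 0x5403, and AVIC_GATAG_TO_VMID() /
 * AVIC_GATAG_TO_VCPUIDX() recover 0x2a and 3 respectively.
 */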

#define AVIC_AUTO_MODE	-1

static int avic_param_set(const char *val, const struct kernel_param *kp)
{
	if (val && sysfs_streq(val, "auto")) {
		*(int *)kp->arg = AVIC_AUTO_MODE;
		return 0;
	}

	return param_set_bint(val, kp);
}

static int avic_param_get(char *buffer, const struct kernel_param *kp)
{
	int val = *(int *)kp->arg;

	if (val == AVIC_AUTO_MODE)
		return sysfs_emit(buffer, "N\n");

	return param_get_bool(buffer, kp);
}

static const struct kernel_param_ops avic_ops = {
	.flags = KERNEL_PARAM_OPS_FL_NOARG,
	.set = avic_param_set,
	.get = avic_param_get,
};

/*
 * Enable / disable AVIC. In "auto" mode (the default behavior), AVIC is
 * enabled for Zen4+ CPUs with x2AVIC, assuming all other criteria for
 * enablement are met.
 */
static int __ro_after_init avic = AVIC_AUTO_MODE;
module_param_cb(avic, &avic_ops, &avic, 0444);
__MODULE_PARM_TYPE(avic, "bool");

module_param(enable_ipiv, bool, 0444);

static bool __ro_after_init force_avic;
module_param_unsafe(force_avic, bool, 0444);

/*
 * Note:
 * This hash table is used to map VM_ID to a struct kvm_svm when handling an
 * AMD IOMMU GALog notification, in order to schedule in a particular vCPU.
 */
#define SVM_VM_DATA_HASH_BITS	8
static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
static u32 next_vm_id = 0;
static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
static bool x2avic_enabled;
static u32 x2avic_max_physical_id;

static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm,
					     bool intercept)
{
	static const u32 x2avic_passthrough_msrs[] = {
		X2APIC_MSR(APIC_ID),
		X2APIC_MSR(APIC_LVR),
		X2APIC_MSR(APIC_TASKPRI),
		X2APIC_MSR(APIC_ARBPRI),
		X2APIC_MSR(APIC_PROCPRI),
		X2APIC_MSR(APIC_EOI),
		X2APIC_MSR(APIC_RRR),
		X2APIC_MSR(APIC_LDR),
		X2APIC_MSR(APIC_DFR),
		X2APIC_MSR(APIC_SPIV),
		X2APIC_MSR(APIC_ISR),
		X2APIC_MSR(APIC_TMR),
		X2APIC_MSR(APIC_IRR),
		X2APIC_MSR(APIC_ESR),
		X2APIC_MSR(APIC_ICR),
		X2APIC_MSR(APIC_ICR2),

		/*
		 * Note! Always intercept LVTT, as TSC-deadline timer mode
		 * isn't virtualized by hardware, and the CPU will generate a
		 * #GP instead of a #VMEXIT.
		 */
		X2APIC_MSR(APIC_LVTTHMR),
		X2APIC_MSR(APIC_LVTPC),
		X2APIC_MSR(APIC_LVT0),
		X2APIC_MSR(APIC_LVT1),
		X2APIC_MSR(APIC_LVTERR),
		X2APIC_MSR(APIC_TMICT),
		X2APIC_MSR(APIC_TMCCT),
		X2APIC_MSR(APIC_TDCR),
	};
	int i;

	if (intercept == svm->x2avic_msrs_intercepted)
		return;

	if (!x2avic_enabled)
		return;

	for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++)
		svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i],
					  MSR_TYPE_RW, intercept);

	svm->x2avic_msrs_intercepted = intercept;
}

static u32 __avic_get_max_physical_id(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
	u32 arch_max;

	/*
	 * Return the largest size (x2APIC) when querying without a vCPU, e.g.
	 * to allocate the per-VM table.
	 */
	if (x2avic_enabled && (!vcpu || apic_x2apic_mode(vcpu->arch.apic)))
		arch_max = x2avic_max_physical_id;
	else
		arch_max = AVIC_MAX_PHYSICAL_ID;

	/*
	 * Despite its name, KVM_CAP_MAX_VCPU_ID represents the maximum APIC ID
	 * plus one, so the max possible APIC ID is one less than that.
	 */
	return min(kvm->arch.max_vcpu_ids - 1, arch_max);
}
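
/*
 * For example, if userspace caps KVM_CAP_MAX_VCPU_ID at 64, the max possible
 * APIC ID is 63, and the min() above returns 63 even when x2AVIC would allow
 * a larger physical ID.
 */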

static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu)
{
	return __avic_get_max_physical_id(vcpu->kvm, vcpu);
}

static void avic_activate_vmcb(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;
	struct kvm_vcpu *vcpu = &svm->vcpu;

	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
	vmcb->control.avic_physical_id |= avic_get_max_physical_id(vcpu);
	vmcb->control.int_ctl |= AVIC_ENABLE_MASK;

	svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);

	/*
	 * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
	 * accesses, while interrupt injection to a running vCPU can be
	 * achieved using the AVIC doorbell. KVM disables the APIC access page
	 * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
	 * AVIC in hybrid mode activates only the doorbell mechanism.
	 */
	if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
		vmcb->control.int_ctl |= X2APIC_MODE_MASK;

		/* Disable MSR intercepts for the x2APIC registers. */
		avic_set_x2apic_msr_interception(svm, false);
	} else {
		/*
		 * Flush the TLB, the guest may have inserted a non-APIC
		 * mapping into the TLB while AVIC was disabled.
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);

		/* Enable MSR intercepts for the x2APIC registers. */
		avic_set_x2apic_msr_interception(svm, true);
	}
}

static void avic_deactivate_vmcb(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;

	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;

	if (!is_sev_es_guest(&svm->vcpu))
		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);

	/*
	 * If running nested and the guest uses its own MSR bitmap, there is
	 * no need to update L0's MSR bitmap.
	 */
	if (is_guest_mode(&svm->vcpu) &&
	    vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))
		return;

	/* Enable MSR intercepts for the x2APIC registers. */
	avic_set_x2apic_msr_interception(svm, true);
}

/*
 * Note:
 * This function is called from the IOMMU driver to notify SVM to schedule in
 * a particular vCPU of a particular VM.
 */
static int avic_ga_log_notifier(u32 ga_tag)
{
	unsigned long flags;
	struct kvm_svm *kvm_svm;
	struct kvm_vcpu *vcpu = NULL;
	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
	u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag);

	pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx);
	trace_kvm_avic_ga_log(vm_id, vcpu_idx);

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
		if (kvm_svm->avic_vm_id != vm_id)
			continue;
		vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx);
		break;
	}
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	/*
	 * Note:
	 * At this point, the IOMMU should have already set the pending bit in
	 * the vAPIC backing page. So, we just need to schedule in the vCPU.
	 */
	if (vcpu)
		kvm_vcpu_wake_up(vcpu);

	return 0;
}

static int avic_get_physical_id_table_order(struct kvm *kvm)
{
	/* Provision for the maximum physical ID supported in x2AVIC mode. */
	return get_order((__avic_get_max_physical_id(kvm, NULL) + 1) * sizeof(u64));
}
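
/*
 * For example, assuming 8-byte entries and max physical IDs of 511 (x2AVIC)
 * or 4095 (x2AVIC with the 4K extension), the table spans 4 KiB (order 0) or
 * 32 KiB (order 3) respectively.
 */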

int avic_alloc_physical_id_table(struct kvm *kvm)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);

	if (!irqchip_in_kernel(kvm) || !enable_apicv)
		return 0;

	if (kvm_svm->avic_physical_id_table)
		return 0;

	kvm_svm->avic_physical_id_table = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
								   avic_get_physical_id_table_order(kvm));
	if (!kvm_svm->avic_physical_id_table)
		return -ENOMEM;

	return 0;
}

void avic_vm_destroy(struct kvm *kvm)
{
	unsigned long flags;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);

	if (!enable_apicv)
		return;

	free_page((unsigned long)kvm_svm->avic_logical_id_table);
	free_pages((unsigned long)kvm_svm->avic_physical_id_table,
		   avic_get_physical_id_table_order(kvm));

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_del(&kvm_svm->hnode);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
}

int avic_vm_init(struct kvm *kvm)
{
	unsigned long flags;
	int err = -ENOMEM;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	struct kvm_svm *k2;
	u32 vm_id;

	if (!enable_apicv)
		return 0;

	kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
	if (!kvm_svm->avic_logical_id_table)
		goto free_avic;

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
again:
	vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
	if (vm_id == 0) { /* id is 1-based, zero is not okay */
		next_vm_id_wrapped = 1;
		goto again;
	}
	/* Is it still in use? Only possible if wrapped at least once. */
	if (next_vm_id_wrapped) {
		hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
			if (k2->avic_vm_id == vm_id)
				goto again;
		}
	}
	kvm_svm->avic_vm_id = vm_id;
	hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	return 0;

free_avic:
	avic_vm_destroy(kvm);
	return err;
}

static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
{
	return __sme_set(__pa(svm->vcpu.arch.apic->regs));
}

void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);

	vmcb->control.avic_backing_page = avic_get_backing_page_address(svm);
	vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table));
	vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table));
	vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE;

	if (kvm_vcpu_apicv_active(&svm->vcpu))
		avic_activate_vmcb(svm);
	else
		avic_deactivate_vmcb(svm);
}

static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
	u32 max_id = x2avic_enabled ? x2avic_max_physical_id : AVIC_MAX_PHYSICAL_ID;
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 id = vcpu->vcpu_id;
	u64 new_entry;

	/*
	 * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC
	 * hardware. Immediately clear apicv_active, i.e. don't wait until the
	 * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as
	 * avic_vcpu_load() expects to be called if and only if the vCPU has
	 * fully initialized AVIC.
	 */
	if (id > max_id) {
		kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
		vcpu->arch.apic->apicv_active = false;
		return 0;
	}

	BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE ||
		     (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE);

	if (WARN_ON_ONCE(!vcpu->arch.apic->regs))
		return -EINVAL;

	if (kvm_apicv_activated(vcpu->kvm)) {
		int ret;

		/*
		 * Note, AVIC hardware walks the nested page table to check
		 * permissions, but does not use the SPA address specified in
		 * the leaf SPTE since it uses address in the AVIC_BACKING_PAGE
		 * pointer field of the VMCB.
		 */
		ret = kvm_alloc_apic_access_page(vcpu->kvm);
		if (ret)
			return ret;
	}

	/* Note, fls64() returns the bit position, +1. */
	BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT >
		     fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK));
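
	/*
	 * E.g. if the backing page mask covers bits 51:12, fls64() returns 52,
	 * which must be at least the host's physical address width.
	 */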

	/* Set the AVIC backing page address in the physical APIC ID table. */
	new_entry = avic_get_backing_page_address(svm) |
		    AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
	svm->avic_physical_id_entry = new_entry;

	/*
	 * Initialize the real table, as vCPUs must have a valid entry in order
	 * for broadcast IPIs to function correctly (broadcast IPIs ignore
	 * invalid entries, i.e. aren't guaranteed to generate a VM-Exit).
	 */
	WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry);

	return 0;
}

void avic_ring_doorbell(struct kvm_vcpu *vcpu)
{
	/*
	 * Note, the vCPU could get migrated to a different pCPU at any point,
	 * which could result in signalling the wrong/previous pCPU. But if
	 * that happens the vCPU is guaranteed to do a VMRUN (after being
	 * migrated) and thus will process pending interrupts, i.e. a doorbell
	 * is not needed (and the spurious one is harmless).
	 */
	int cpu = READ_ONCE(vcpu->cpu);

	if (cpu != get_cpu()) {
		wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
		trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu));
	}
	put_cpu();
}

static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
{
	vcpu->arch.apic->irr_pending = true;
	svm_complete_interrupt_delivery(vcpu,
					icrl & APIC_MODE_MASK,
					icrl & APIC_INT_LEVELTRIG,
					icrl & APIC_VECTOR_MASK);
}

static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
					  u32 icrl)
{
	/*
	 * KVM inhibits AVIC if any vCPU ID diverges from the vCPU's APIC ID,
	 * i.e. APIC ID == vCPU ID.
	 */
	struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);

	/* Once again, nothing to do if the target vCPU doesn't exist. */
	if (unlikely(!target_vcpu))
		return;

	avic_kick_vcpu(target_vcpu, icrl);
}

static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
					 u32 logid_index, u32 icrl)
{
	u32 physical_id;

	if (avic_logical_id_table) {
		u32 logid_entry = avic_logical_id_table[logid_index];

		/* Nothing to do if the logical destination is invalid. */
		if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
			return;

		physical_id = logid_entry &
			      AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	} else {
		/*
		 * For x2APIC, the logical APIC ID is a read-only value that is
		 * derived from the x2APIC ID, thus the x2APIC ID can be found
		 * by reversing the calculation (stored in logid_index). Note,
		 * bits 31:20 of the x2APIC ID aren't propagated to the logical
		 * ID, but KVM limits the x2APIC ID to KVM_MAX_VCPU_IDS.
		 */
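		/*
		 * Architecturally, the x2APIC logical ID is
		 * ((x2apic_id >> 4) << 16) | BIT(x2apic_id & 0xf), so the
		 * cluster and bit position recombine to the x2APIC ID itself.
		 */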
		physical_id = logid_index;
	}

	avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
}

/*
 * A fast-path version of avic_kick_target_vcpus(), which attempts to match
 * destination APIC ID to vCPU without looping through all vCPUs.
 */
static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
				       u32 icrl, u32 icrh, u32 index)
{
	int dest_mode = icrl & APIC_DEST_MASK;
	int shorthand = icrl & APIC_SHORT_MASK;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	u32 dest;

	if (shorthand != APIC_DEST_NOSHORT)
		return -EINVAL;

	if (apic_x2apic_mode(source))
		dest = icrh;
	else
		dest = GET_XAPIC_DEST_FIELD(icrh);

	if (dest_mode == APIC_DEST_PHYSICAL) {
		/* Broadcast destination, use the slow path. */
		if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
			return -EINVAL;
		if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
			return -EINVAL;

		if (WARN_ON_ONCE(dest != index))
			return -EINVAL;

		avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
	} else {
		u32 *avic_logical_id_table;
		unsigned long bitmap, i;
		u32 cluster;

		if (apic_x2apic_mode(source)) {
			/* 16-bit dest mask, 16-bit cluster ID. */
			bitmap = dest & 0xFFFF;
			cluster = (dest >> 16) << 4;
		} else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
			/* 8-bit dest mask, no cluster ID. */
			bitmap = dest;
			cluster = 0;
		} else {
			/* 4-bit dest mask, 4-bit cluster ID. */
			bitmap = dest & 0xF;
			cluster = (dest >> 4) << 2;
		}

		/* Nothing to do if there are no destinations in the cluster. */
		if (unlikely(!bitmap))
			return 0;

		if (apic_x2apic_mode(source))
			avic_logical_id_table = NULL;
		else
			avic_logical_id_table = kvm_svm->avic_logical_id_table;

		/*
		 * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
		 * IDs, thus each bit in the destination is guaranteed to map
		 * to at most one vCPU.
		 */
		for_each_set_bit(i, &bitmap, 16)
			avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
						     cluster + i, icrl);
	}

	return 0;
}

static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
				   u32 icrl, u32 icrh, u32 index)
{
	u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
	unsigned long i;
	struct kvm_vcpu *vcpu;

	if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
		return;

	trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);

	/*
	 * Wake any target vCPUs that are blocking, i.e. waiting for a wake
	 * event. There's no need to signal doorbells, as hardware has handled
	 * vCPUs that were in guest at the time of the IPI, and vCPUs that have
	 * since entered the guest will have processed pending IRQs at VMRUN.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
					dest, icrl & APIC_DEST_MASK))
			avic_kick_vcpu(vcpu, icrl);
	}
}

int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
	u32 icrl = svm->vmcb->control.exit_info_1;
	u32 id = svm->vmcb->control.exit_info_2 >> 32;
	u32 index = svm->vmcb->control.exit_info_2 & AVIC_PHYSICAL_MAX_INDEX_MASK;
	struct kvm_lapic *apic = vcpu->arch.apic;

	trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);

	switch (id) {
	case AVIC_IPI_FAILURE_INVALID_TARGET:
	case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
		/*
		 * Emulate IPIs that are not handled by AVIC hardware, which
		 * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
		 * if _any_ targets are invalid, e.g. if the logical mode mask
		 * is a superset of running vCPUs.
		 *
		 * The exit is a trap, i.e. ICR holds the correct value and RIP
		 * has been advanced; KVM is responsible only for emulating the
		 * IPI. Sadly, hardware may sometimes leave the BUSY flag set,
		 * in which case KVM needs to emulate the ICR write as well in
		 * order to clear the BUSY flag.
		 */
		if (icrl & APIC_ICR_BUSY)
			kvm_apic_write_nodecode(vcpu, APIC_ICR);
		else
			kvm_apic_send_ipi(apic, icrl, icrh);
		break;
	case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
		/*
		 * At this point, we expect that the AVIC HW has already set
		 * the appropriate IRR bits on the valid target vCPUs. So, we
		 * just need to kick the appropriate vCPU.
		 */
		avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
		break;
	case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
		WARN_ONCE(1, "Invalid backing page\n");
		break;
	case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR:
		/* Invalid IPI with vector < 16. */
		break;
	default:
		vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n");
	}

	return 1;
}

unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu))
		return APICV_INHIBIT_REASON_NESTED;
	return 0;
}

static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	u32 cluster, index;

	ldr = GET_APIC_LOGICAL_ID(ldr);

	if (flat) {
		cluster = 0;
	} else {
		cluster = (ldr >> 4);
		if (cluster >= 0xf)
			return NULL;
		ldr &= 0xf;
	}
	if (!ldr || !is_power_of_2(ldr))
		return NULL;

	index = __ffs(ldr);
	if (WARN_ON_ONCE(index > 7))
		return NULL;
	index += (cluster << 2);

	return &kvm_svm->avic_logical_id_table[index];
}
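
/*
 * Worked example (cluster mode): a logical ID of 0x12 is cluster 1, bit 1,
 * which maps to table index (1 << 2) + 1 = 5; flat mode uses the bit position
 * directly, e.g. logical ID 0x20 maps to index 5 as well.
 */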

static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
{
	bool flat;
	u32 *entry, new_entry;

	flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
	entry = avic_get_logical_id_entry(vcpu, ldr, flat);
	if (!entry)
		return;

	new_entry = READ_ONCE(*entry);
	new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
	new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
	WRITE_ONCE(*entry, new_entry);
}

static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	bool flat = svm->dfr_reg == APIC_DFR_FLAT;
	u32 *entry;

	/* Note: x2AVIC does not use the logical APIC ID table. */
	if (apic_x2apic_mode(vcpu->arch.apic))
		return;

	entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
	if (entry)
		clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
}

static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
	u32 id = kvm_xapic_id(vcpu->arch.apic);

	/* AVIC does not support LDR updates for x2APIC. */
	if (apic_x2apic_mode(vcpu->arch.apic))
		return;

	if (ldr == svm->ldr_reg)
		return;

	avic_invalidate_logical_id_entry(vcpu);

	svm->ldr_reg = ldr;
	avic_ldr_write(vcpu, id, ldr);
}

static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);

	if (svm->dfr_reg == dfr)
		return;

	avic_invalidate_logical_id_entry(vcpu);
	svm->dfr_reg = dfr;
}

static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
{
	u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;

	switch (offset) {
	case APIC_LDR:
		avic_handle_ldr_update(vcpu);
		break;
	case APIC_DFR:
		avic_handle_dfr_update(vcpu);
		break;
	case APIC_RRR:
		/* Ignore writes to Read Remote Data, it's read-only. */
		return 1;
	default:
		break;
	}

	kvm_apic_write_nodecode(vcpu, offset);
	return 1;
}

static bool is_avic_unaccelerated_access_trap(u32 offset)
{
	bool ret = false;

	switch (offset) {
	case APIC_ID:
	case APIC_EOI:
	case APIC_RRR:
	case APIC_LDR:
	case APIC_DFR:
	case APIC_SPIV:
	case APIC_ESR:
	case APIC_ICR:
	case APIC_LVTT:
	case APIC_LVTTHMR:
	case APIC_LVTPC:
	case APIC_LVT0:
	case APIC_LVT1:
	case APIC_LVTERR:
	case APIC_TMICT:
	case APIC_TDCR:
		ret = true;
		break;
	default:
		break;
	}
	return ret;
}

int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int ret = 0;
	u32 offset = svm->vmcb->control.exit_info_1 &
		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
	u32 vector = svm->vmcb->control.exit_info_2 &
		     AVIC_UNACCEL_ACCESS_VECTOR_MASK;
	bool write = (svm->vmcb->control.exit_info_1 >> 32) &
		     AVIC_UNACCEL_ACCESS_WRITE_MASK;
	bool trap = is_avic_unaccelerated_access_trap(offset);

	trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
					    trap, write, vector);
	if (trap) {
		/* Handle a trap-style exit (RIP already advanced). */
		WARN_ONCE(!write, "svm: Handling trap read.\n");
		ret = avic_unaccel_trap_write(vcpu);
	} else {
		/* Handle a fault-style exit by emulating the access. */
		ret = kvm_emulate_instruction(vcpu, 0);
	}

	return ret;
}

int avic_init_vcpu(struct vcpu_svm *svm)
{
	int ret;
	struct kvm_vcpu *vcpu = &svm->vcpu;

	INIT_LIST_HEAD(&svm->ir_list);
	raw_spin_lock_init(&svm->ir_list_lock);

	if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
		return 0;

	ret = avic_init_backing_page(vcpu);
	if (ret)
		return ret;

	svm->dfr_reg = APIC_DFR_FLAT;

	return ret;
}

void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
{
	avic_handle_dfr_update(vcpu);
	avic_handle_ldr_update(vcpu);
}

static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
{
	struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu;
	unsigned long flags;

	if (!vcpu)
		return;

	raw_spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
	list_del(&irqfd->vcpu_list);
	raw_spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
}

int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
			unsigned int host_irq, uint32_t guest_irq,
			struct kvm_vcpu *vcpu, u32 vector)
{
	/*
	 * If the IRQ was affined to a different vCPU, remove the IRTE metadata
	 * from the *previous* vCPU's list.
	 */
	svm_ir_list_del(irqfd);

	if (vcpu) {
		/*
		 * Try to enable guest_mode in the IRTE, unless AVIC is
		 * inhibited, in which case configure the IRTE for legacy mode,
		 * but track the IRTE metadata so that it can be converted to
		 * guest mode if AVIC is enabled/uninhibited in the future.
		 */
		struct amd_iommu_pi_data pi_data = {
			.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
					     vcpu->vcpu_idx),
			.is_guest_mode = kvm_vcpu_apicv_active(vcpu),
			.vapic_addr = avic_get_backing_page_address(to_svm(vcpu)),
			.vector = vector,
		};
		struct vcpu_svm *svm = to_svm(vcpu);
		u64 entry;
		int ret;

		/*
		 * Prevent the vCPU from being scheduled out or migrated until
		 * the IRTE is updated and its metadata has been added to the
		 * list of IRQs being posted to the vCPU, to ensure the IRTE
		 * isn't programmed with stale pCPU/IsRunning information.
		 */
		guard(raw_spinlock_irqsave)(&svm->ir_list_lock);

		/*
		 * Update the target pCPU for IOMMU doorbells if the vCPU is
		 * running. If the vCPU is NOT running, i.e. is blocking or
		 * scheduled out, KVM will update the pCPU info when the vCPU
		 * is awakened and/or scheduled in. See also avic_vcpu_load().
		 */
		entry = svm->avic_physical_id_entry;
		if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
			pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
		} else {
			pi_data.cpu = -1;
			pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
		}

		ret = irq_set_vcpu_affinity(host_irq, &pi_data);
		if (ret)
			return ret;

		/*
		 * Revert to legacy mode if the IOMMU didn't provide metadata
		 * for the IRTE, which KVM needs to keep the IRTE up-to-date,
		 * e.g. if the vCPU is migrated or AVIC is disabled.
		 */
		if (WARN_ON_ONCE(!pi_data.ir_data)) {
			irq_set_vcpu_affinity(host_irq, NULL);
			return -EIO;
		}

		irqfd->irq_bypass_data = pi_data.ir_data;
		list_add(&irqfd->vcpu_list, &svm->ir_list);
		return 0;
	}
	return irq_set_vcpu_affinity(host_irq, NULL);
}

enum avic_vcpu_action {
	/*
	 * There is no need to differentiate between activate and deactivate,
	 * as KVM only refreshes AVIC state when the vCPU is scheduled in and
	 * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is
	 * being (de)activated.
	 */
	AVIC_TOGGLE_ON_OFF	= BIT(0),
	AVIC_ACTIVATE		= AVIC_TOGGLE_ON_OFF,
	AVIC_DEACTIVATE		= AVIC_TOGGLE_ON_OFF,

	/*
	 * No unique action is required to deal with a vCPU that stops/starts
	 * running. A vCPU that starts running by definition stops blocking as
	 * well, and a vCPU that stops running can't have been blocking, i.e.
	 * doesn't need to toggle GALogIntr.
	 */
	AVIC_START_RUNNING	= 0,
	AVIC_STOP_RUNNING	= 0,

	/*
	 * When a vCPU starts blocking, KVM needs to set the GALogIntr flag
	 * in all associated IRTEs so that KVM can wake the vCPU if an IRQ is
	 * sent to the vCPU.
	 */
	AVIC_START_BLOCKING	= BIT(1),
};

static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu,
					    enum avic_vcpu_action action)
{
	bool ga_log_intr = (action & AVIC_START_BLOCKING);
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_kernel_irqfd *irqfd;

	lockdep_assert_held(&svm->ir_list_lock);

	/*
	 * Here, we go through the per-vCPU ir_list to update all existing
	 * interrupt remapping table entries targeting this vCPU.
	 */
	if (list_empty(&svm->ir_list))
		return;

	list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
		void *data = irqfd->irq_bypass_data;

		if (!(action & AVIC_TOGGLE_ON_OFF))
			WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr));
		else if (cpu >= 0)
			WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr));
		else
			WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data));
	}
}

static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
			     enum avic_vcpu_action action)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	int h_physical_id = kvm_cpu_get_apicid(cpu);
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long flags;
	u64 entry;

	lockdep_assert_preemption_disabled();

	if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
		return;

	if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >=
			 PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm)))
		return;

	/*
	 * Grab the per-vCPU interrupt remapping lock even if the VM doesn't
	 * _currently_ have assigned devices, as that can change. Holding
	 * ir_list_lock ensures that either svm_ir_list_add() will consume
	 * up-to-date entry information, or that this task will wait until
	 * svm_ir_list_add() completes to set the new target pCPU.
	 */
	raw_spin_lock_irqsave(&svm->ir_list_lock, flags);

	entry = svm->avic_physical_id_entry;
	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);

	entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK |
		   AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
	entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
	entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

	svm->avic_physical_id_entry = entry;

	/*
	 * If IPI virtualization is disabled, clear IsRunning when updating the
	 * actual Physical ID table, so that the CPU never sees IsRunning=1.
	 * Keep the APIC ID up-to-date in the entry to minimize the chances of
	 * things going sideways if hardware peeks at the ID.
	 */
	if (!enable_ipiv)
		entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

	WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);

	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);

	raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}

void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	/*
	 * No need to update anything if the vCPU is blocking, i.e. if the vCPU
	 * is being scheduled in after being preempted. The CPU entries in the
	 * Physical APIC table and IRTE are consumed iff IsRunning is '1'. If
	 * the vCPU was migrated, its new CPU value will be stuffed when the
	 * vCPU unblocks.
	 */
	if (kvm_vcpu_is_blocking(vcpu))
		return;

	__avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING);
}

static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long flags;
	u64 entry = svm->avic_physical_id_entry;

	lockdep_assert_preemption_disabled();

	if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >=
			 PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm)))
		return;

	/*
	 * Take and hold the per-vCPU interrupt remapping lock while updating
	 * the Physical ID entry even though the lock doesn't protect against
	 * multiple writers (see above). Holding ir_list_lock ensures that
	 * either svm_ir_list_add() will consume up-to-date entry information,
	 * or that this task will wait until svm_ir_list_add() completes to
	 * mark the vCPU as not running.
	 */
	raw_spin_lock_irqsave(&svm->ir_list_lock, flags);

	avic_update_iommu_vcpu_affinity(vcpu, -1, action);

	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);

	/*
	 * Keep the previous APIC ID in the entry so that a rogue doorbell from
	 * hardware is at least restricted to a CPU associated with the vCPU.
	 */
	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

	if (enable_ipiv)
		WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);

	/*
	 * Note! Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as
	 * it's a synthetic flag that usurps an unused should-be-zero bit.
	 */
	if (action & AVIC_START_BLOCKING)
		entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;

	svm->avic_physical_id_entry = entry;

	raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}

void avic_vcpu_put(struct kvm_vcpu *vcpu)
{
	/*
	 * Note, reading the Physical ID entry outside of ir_list_lock is safe
	 * as only the pCPU that has loaded (or is loading) the vCPU is allowed
	 * to modify the entry, and preemption is disabled. I.e. the vCPU
	 * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
	 * recursively.
	 */
	u64 entry = to_svm(vcpu)->avic_physical_id_entry;

	/*
	 * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the
	 * vCPU is preempted while it's in the process of blocking. WARN if
	 * the vCPU wasn't running and isn't blocking, as KVM shouldn't attempt
	 * to put the AVIC if it wasn't previously loaded.
	 */
	if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) {
		if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu)))
			return;

		/*
		 * The vCPU was preempted while blocking, ensure its IRTEs are
		 * configured to generate GA Log Interrupts.
		 */
		if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR))))
			return;
	}

	__avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING :
							   AVIC_STOP_RUNNING);
}

void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb01.ptr;

	if (!lapic_in_kernel(vcpu) || !enable_apicv)
		return;

	if (kvm_vcpu_apicv_active(vcpu)) {
		/*
		 * During AVIC temporary deactivation, the guest could update
		 * the APIC ID, DFR and LDR registers, which would not be
		 * trapped by avic_unaccelerated_access_interception(). In
		 * this case, we need to check and update the AVIC logical
		 * APIC ID table accordingly before re-activating.
		 */
		avic_apicv_post_state_restore(vcpu);
		avic_activate_vmcb(svm);
	} else {
		avic_deactivate_vmcb(svm);
	}
	vmcb_mark_dirty(vmcb, VMCB_AVIC);
}

void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
	if (!enable_apicv)
		return;

	/* APICv should only be toggled on/off while the vCPU is running. */
	WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu));

	avic_refresh_virtual_apic_mode(vcpu);

	if (kvm_vcpu_apicv_active(vcpu))
		__avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE);
	else
		__avic_vcpu_put(vcpu, AVIC_DEACTIVATE);
}

void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	/*
	 * Unload the AVIC when the vCPU is about to block, _before_ the vCPU
	 * actually blocks.
	 *
	 * Note, any IRQs that arrive before IsRunning=0 will not cause an
	 * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles
	 * this by checking vIRR one last time before blocking. The memory
	 * barrier implicit in set_current_state orders writing IsRunning=0
	 * before reading the vIRR. The processor needs a matching memory
	 * barrier on interrupt delivery between writing IRR and reading
	 * IsRunning; the lack of this barrier might be the cause of erratum
	 * #1235.
	 *
	 * Set IsRunning=0 even if guest IRQs are disabled, i.e. even if KVM
	 * doesn't need to detect events for scheduling purposes. The doorbell
	 * used to signal running vCPUs cannot be blocked, i.e. will perturb
	 * the CPU and cause noisy neighbor problems if the VM is sending
	 * interrupts to the vCPU while it's scheduled out.
	 */
	__avic_vcpu_put(vcpu, AVIC_START_BLOCKING);
}

void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	avic_vcpu_load(vcpu, vcpu->cpu);
}

static bool __init avic_want_avic_enabled(void)
{
	/*
	 * In "auto" mode, enable AVIC by default for Zen4+ if x2AVIC is
	 * supported (to avoid enabling partial support by default, and because
	 * x2AVIC should be supported by all Zen4+ CPUs). Explicitly check for
	 * family 0x1A and later (Zen5+), as the kernel's synthetic ZenX flags
	 * aren't inclusive of previous generations, i.e. the kernel will set
	 * at most one ZenX feature flag.
	 */
	if (avic == AVIC_AUTO_MODE)
		avic = boot_cpu_has(X86_FEATURE_X2AVIC) &&
		       (cpu_feature_enabled(X86_FEATURE_ZEN4) || boot_cpu_data.x86 >= 0x1A);

	if (!avic || !npt_enabled)
		return false;

	/* AVIC is a prerequisite for x2AVIC. */
	if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
		if (boot_cpu_has(X86_FEATURE_X2AVIC))
			pr_warn(FW_BUG "Cannot enable x2AVIC, AVIC is unsupported\n");
		return false;
	}

	if (cc_platform_has(CC_ATTR_HOST_SEV_SNP) &&
	    !boot_cpu_has(X86_FEATURE_HV_INUSE_WR_ALLOWED)) {
		pr_warn("AVIC disabled: missing HvInUseWrAllowed on SNP-enabled system\n");
		return false;
	}

	/*
	 * Print a scary message if AVIC is force enabled to make it abundantly
	 * clear that ignoring CPUID could have repercussions. See the Revision
	 * Guide for the specific AMD processor for more details.
	 */
	if (!boot_cpu_has(X86_FEATURE_AVIC))
		pr_warn("AVIC unsupported in CPUID but force enabled, your system might crash and burn\n");

	return true;
}

/*
 * Note:
 * - The module param "avic" enables both xAPIC and x2APIC modes.
 * - The hypervisor can support both xAVIC and x2AVIC in the same guest.
 * - The mode can be switched at run-time.
 */
bool __init avic_hardware_setup(void)
{
	avic = avic_want_avic_enabled();
	if (!avic)
		return false;

	pr_info("AVIC enabled\n");

	/* AVIC is a prerequisite for x2AVIC. */
	x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
	if (x2avic_enabled) {
		if (cpu_feature_enabled(X86_FEATURE_X2AVIC_EXT))
			x2avic_max_physical_id = X2AVIC_4K_MAX_PHYSICAL_ID;
		else
			x2avic_max_physical_id = X2AVIC_MAX_PHYSICAL_ID;
		pr_info("x2AVIC enabled (max %u vCPUs)\n", x2avic_max_physical_id + 1);
	} else {
		svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
	}

	/*
	 * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
	 * due to erratum 1235, which results in missed VM-Exits on the sender
	 * and thus missed wake events for blocking vCPUs due to the CPU
	 * failing to see a software update to clear IsRunning.
	 */
	enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17;

	amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);

	return true;
}

void avic_hardware_unsetup(void)
{
	if (avic)
		amd_iommu_register_ga_log_notifier(NULL);
}