1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Kernel-based Virtual Machine driver for Linux
4 *
5 * AMD SVM support
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9 *
10 * Authors:
11 * Yaniv Kamay <yaniv@qumranet.com>
12 * Avi Kivity <avi@qumranet.com>
13 */
14
15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16
17 #include <linux/kvm_types.h>
18 #include <linux/hashtable.h>
19 #include <linux/amd-iommu.h>
20 #include <linux/kvm_host.h>
21 #include <linux/kvm_irqfd.h>
22
23 #include <asm/irq_remapping.h>
24 #include <asm/msr.h>
25
26 #include "trace.h"
27 #include "lapic.h"
28 #include "x86.h"
29 #include "irq.h"
30 #include "svm.h"
31
/*
 * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that
 * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't
 * be delivered, e.g. because the vCPU isn't running.  Use the vCPU's index
 * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast
 * lookup on the index, whereas vCPUs whose index doesn't match their ID need
 * to walk the entire xarray of vCPUs in the worst case scenario.
 *
 * For the vCPU index, use however many bits are currently allowed for the max
 * guest physical APIC ID (limited by the size of the physical ID table), and
 * use whatever bits remain to assign arbitrary AVIC IDs to VMs.  Note, the
 * size of the GATag is defined by hardware (32 bits), but is an opaque value
 * as far as hardware is concerned.
 */
46 #define AVIC_VCPU_IDX_MASK AVIC_PHYSICAL_MAX_INDEX_MASK
47
48 #define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
49 #define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)
50
51 #define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
52 #define AVIC_GATAG_TO_VCPUIDX(x) (x & AVIC_VCPU_IDX_MASK)
53
54 #define __AVIC_GATAG(vm_id, vcpu_idx) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
55 ((vcpu_idx) & AVIC_VCPU_IDX_MASK))
56 #define AVIC_GATAG(vm_id, vcpu_idx) \
57 ({ \
58 u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx); \
59 \
60 WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx)); \
61 WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \
62 ga_tag; \
63 })
64
65 static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u);
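
/*
 * Illustrative example (assuming a 9-bit vCPU index field, i.e.
 * AVIC_PHYSICAL_MAX_INDEX_MASK == 0x1ff and thus AVIC_VM_ID_SHIFT == 9):
 * AVIC_GATAG(0x2a, 5) encodes to (0x2a << 9) | 5 == 0x5405, and the
 * AVIC_GATAG_TO_VMID()/AVIC_GATAG_TO_VCPUIDX() helpers recover 0x2a and 5.
 */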
66
67 #define AVIC_AUTO_MODE -1
68
static int avic_param_set(const char *val, const struct kernel_param *kp)
70 {
71 if (val && sysfs_streq(val, "auto")) {
72 *(int *)kp->arg = AVIC_AUTO_MODE;
73 return 0;
74 }
75
76 return param_set_bint(val, kp);
77 }
78
79 static const struct kernel_param_ops avic_ops = {
80 .flags = KERNEL_PARAM_OPS_FL_NOARG,
81 .set = avic_param_set,
82 .get = param_get_bool,
83 };
84
/*
 * Enable / disable AVIC.  In "auto" mode (the default behavior), AVIC is
 * enabled for Zen4+ CPUs with x2AVIC, provided all other criteria for
 * enablement are met.
 */
89 static int avic = AVIC_AUTO_MODE;
90 module_param_cb(avic, &avic_ops, &avic, 0444);
91 __MODULE_PARM_TYPE(avic, "bool");
92
93 module_param(enable_ipiv, bool, 0444);
94
95 static bool force_avic;
96 module_param_unsafe(force_avic, bool, 0444);
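
/*
 * Example usage (illustrative): loading kvm_amd with "avic=1" explicitly
 * enables AVIC (subject to the remaining checks in avic_want_avic_enabled()),
 * "avic=0" disables it, and "avic=auto" (the default) lets KVM decide based
 * on CPU support.  force_avic=1 is the unsafe knob that skips the CPUID check.
 */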
97
/* Note:
 * This hash table is used to map a VM_ID to a struct kvm_svm when handling
 * an AMD IOMMU GALOG notification, in order to schedule in a particular vCPU.
 */
103 #define SVM_VM_DATA_HASH_BITS 8
104 static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
105 static u32 next_vm_id = 0;
106 static bool next_vm_id_wrapped = 0;
107 static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
108 static bool x2avic_enabled;
109
110
static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm,
					     bool intercept)
113 {
114 static const u32 x2avic_passthrough_msrs[] = {
115 X2APIC_MSR(APIC_ID),
116 X2APIC_MSR(APIC_LVR),
117 X2APIC_MSR(APIC_TASKPRI),
118 X2APIC_MSR(APIC_ARBPRI),
119 X2APIC_MSR(APIC_PROCPRI),
120 X2APIC_MSR(APIC_EOI),
121 X2APIC_MSR(APIC_RRR),
122 X2APIC_MSR(APIC_LDR),
123 X2APIC_MSR(APIC_DFR),
124 X2APIC_MSR(APIC_SPIV),
125 X2APIC_MSR(APIC_ISR),
126 X2APIC_MSR(APIC_TMR),
127 X2APIC_MSR(APIC_IRR),
128 X2APIC_MSR(APIC_ESR),
129 X2APIC_MSR(APIC_ICR),
130 X2APIC_MSR(APIC_ICR2),
131
132 /*
133 * Note! Always intercept LVTT, as TSC-deadline timer mode
134 * isn't virtualized by hardware, and the CPU will generate a
135 * #GP instead of a #VMEXIT.
136 */
137 X2APIC_MSR(APIC_LVTTHMR),
138 X2APIC_MSR(APIC_LVTPC),
139 X2APIC_MSR(APIC_LVT0),
140 X2APIC_MSR(APIC_LVT1),
141 X2APIC_MSR(APIC_LVTERR),
142 X2APIC_MSR(APIC_TMICT),
143 X2APIC_MSR(APIC_TMCCT),
144 X2APIC_MSR(APIC_TDCR),
145 };
146 int i;
147
148 if (intercept == svm->x2avic_msrs_intercepted)
149 return;
150
151 if (!x2avic_enabled)
152 return;
153
154 for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++)
155 svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i],
156 MSR_TYPE_RW, intercept);
157
158 svm->x2avic_msrs_intercepted = intercept;
159 }
160
static void avic_activate_vmcb(struct vcpu_svm *svm)
162 {
163 struct vmcb *vmcb = svm->vmcb01.ptr;
164
165 vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
166 vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
167
168 vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
169
170 /*
171 * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
172 * accesses, while interrupt injection to a running vCPU can be
173 * achieved using AVIC doorbell. KVM disables the APIC access page
174 * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
175 * AVIC in hybrid mode activates only the doorbell mechanism.
176 */
177 if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
178 vmcb->control.int_ctl |= X2APIC_MODE_MASK;
179 vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
180 /* Disabling MSR intercept for x2APIC registers */
181 avic_set_x2apic_msr_interception(svm, false);
182 } else {
183 /*
184 * Flush the TLB, the guest may have inserted a non-APIC
185 * mapping into the TLB while AVIC was disabled.
186 */
187 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
188
189 /* For xAVIC and hybrid-xAVIC modes */
190 vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
191 /* Enabling MSR intercept for x2APIC registers */
192 avic_set_x2apic_msr_interception(svm, true);
193 }
194 }
195
static void avic_deactivate_vmcb(struct vcpu_svm *svm)
197 {
198 struct vmcb *vmcb = svm->vmcb01.ptr;
199
200 vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
201 vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
202
203 /*
204 * If running nested and the guest uses its own MSR bitmap, there
205 * is no need to update L0's msr bitmap
206 */
207 if (is_guest_mode(&svm->vcpu) &&
208 vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))
209 return;
210
211 /* Enabling MSR intercept for x2APIC registers */
212 avic_set_x2apic_msr_interception(svm, true);
213 }
214
/* Note:
 * This function is called from the IOMMU driver to notify
 * SVM to schedule in a particular vCPU of a particular VM.
 */
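/*
 * The notifier is registered with the IOMMU driver via
 * amd_iommu_register_ga_log_notifier() in avic_hardware_setup().
 */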
int avic_ga_log_notifier(u32 ga_tag)
220 {
221 unsigned long flags;
222 struct kvm_svm *kvm_svm;
223 struct kvm_vcpu *vcpu = NULL;
224 u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
225 u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag);
226
227 pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx);
228 trace_kvm_avic_ga_log(vm_id, vcpu_idx);
229
230 spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
231 hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
232 if (kvm_svm->avic_vm_id != vm_id)
233 continue;
234 vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx);
235 break;
236 }
237 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
238
239 /* Note:
240 * At this point, the IOMMU should have already set the pending
241 * bit in the vAPIC backing page. So, we just need to schedule
242 * in the vcpu.
243 */
244 if (vcpu)
245 kvm_vcpu_wake_up(vcpu);
246
247 return 0;
248 }
249
void avic_vm_destroy(struct kvm *kvm)
251 {
252 unsigned long flags;
253 struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
254
255 if (!enable_apicv)
256 return;
257
258 free_page((unsigned long)kvm_svm->avic_logical_id_table);
259 free_page((unsigned long)kvm_svm->avic_physical_id_table);
260
261 spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
262 hash_del(&kvm_svm->hnode);
263 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
264 }
265
int avic_vm_init(struct kvm *kvm)
267 {
268 unsigned long flags;
269 int err = -ENOMEM;
270 struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
271 struct kvm_svm *k2;
272 u32 vm_id;
273
274 if (!enable_apicv)
275 return 0;
276
277 kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
278 if (!kvm_svm->avic_physical_id_table)
279 goto free_avic;
280
281 kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
282 if (!kvm_svm->avic_logical_id_table)
283 goto free_avic;
284
285 spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
286 again:
287 vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
288 if (vm_id == 0) { /* id is 1-based, zero is not okay */
289 next_vm_id_wrapped = 1;
290 goto again;
291 }
292 /* Is it still in use? Only possible if wrapped at least once */
293 if (next_vm_id_wrapped) {
294 hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
295 if (k2->avic_vm_id == vm_id)
296 goto again;
297 }
298 }
299 kvm_svm->avic_vm_id = vm_id;
300 hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
301 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
302
303 return 0;
304
305 free_avic:
306 avic_vm_destroy(kvm);
307 return err;
308 }
309
static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
311 {
312 return __sme_set(__pa(svm->vcpu.arch.apic->regs));
313 }
314
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
316 {
317 struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
318
319 vmcb->control.avic_backing_page = avic_get_backing_page_address(svm);
320 vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table));
321 vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table));
322 vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE;
323
324 if (kvm_apicv_activated(svm->vcpu.kvm))
325 avic_activate_vmcb(svm);
326 else
327 avic_deactivate_vmcb(svm);
328 }
329
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
331 {
332 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
333 struct vcpu_svm *svm = to_svm(vcpu);
334 u32 id = vcpu->vcpu_id;
335 u64 new_entry;
336
337 /*
338 * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC
339 * hardware. Immediately clear apicv_active, i.e. don't wait until the
340 * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as
341 * avic_vcpu_load() expects to be called if and only if the vCPU has
342 * fully initialized AVIC.
343 */
344 if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
345 (id > X2AVIC_MAX_PHYSICAL_ID)) {
346 kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
347 vcpu->arch.apic->apicv_active = false;
348 return 0;
349 }
350
351 BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE ||
352 (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE);
353
354 if (WARN_ON_ONCE(!vcpu->arch.apic->regs))
355 return -EINVAL;
356
357 if (kvm_apicv_activated(vcpu->kvm)) {
358 int ret;
359
360 /*
361 * Note, AVIC hardware walks the nested page table to check
362 * permissions, but does not use the SPA address specified in
363 * the leaf SPTE since it uses address in the AVIC_BACKING_PAGE
364 * pointer field of the VMCB.
365 */
366 ret = kvm_alloc_apic_access_page(vcpu->kvm);
367 if (ret)
368 return ret;
369 }
370
371 /* Note, fls64() returns the bit position, +1. */
372 BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT >
373 fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK));
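
	/*
	 * Layout note (informational): a physical ID table entry packs the
	 * host physical APIC ID in its low bits, the backing page address
	 * under AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK, and control flags
	 * such as IsRunning and Valid in its high bits, so ORing the backing
	 * page address with the Valid mask below yields a complete entry for
	 * a not-yet-running vCPU.
	 */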
374
	/* Set the AVIC backing page address in the physical APIC ID table. */
376 new_entry = avic_get_backing_page_address(svm) |
377 AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
378 svm->avic_physical_id_entry = new_entry;
379
380 /*
381 * Initialize the real table, as vCPUs must have a valid entry in order
382 * for broadcast IPIs to function correctly (broadcast IPIs ignore
383 * invalid entries, i.e. aren't guaranteed to generate a VM-Exit).
384 */
385 WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry);
386
387 return 0;
388 }
389
void avic_ring_doorbell(struct kvm_vcpu *vcpu)
391 {
392 /*
393 * Note, the vCPU could get migrated to a different pCPU at any point,
394 * which could result in signalling the wrong/previous pCPU. But if
395 * that happens the vCPU is guaranteed to do a VMRUN (after being
396 * migrated) and thus will process pending interrupts, i.e. a doorbell
397 * is not needed (and the spurious one is harmless).
398 */
399 int cpu = READ_ONCE(vcpu->cpu);
400
401 if (cpu != get_cpu()) {
402 wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
403 trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu));
404 }
405 put_cpu();
406 }
407
408
static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
410 {
411 vcpu->arch.apic->irr_pending = true;
412 svm_complete_interrupt_delivery(vcpu,
413 icrl & APIC_MODE_MASK,
414 icrl & APIC_INT_LEVELTRIG,
415 icrl & APIC_VECTOR_MASK);
416 }
417
static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
					  u32 icrl)
420 {
	/*
	 * KVM inhibits AVIC if any vCPU's ID diverges from its APIC ID, i.e.
	 * AVIC is only active when APIC ID == vCPU ID for every vCPU.
	 */
425 struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);
426
427 /* Once again, nothing to do if the target vCPU doesn't exist. */
428 if (unlikely(!target_vcpu))
429 return;
430
431 avic_kick_vcpu(target_vcpu, icrl);
432 }
433
static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
					 u32 logid_index, u32 icrl)
436 {
437 u32 physical_id;
438
439 if (avic_logical_id_table) {
440 u32 logid_entry = avic_logical_id_table[logid_index];
441
442 /* Nothing to do if the logical destination is invalid. */
443 if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
444 return;
445
446 physical_id = logid_entry &
447 AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
448 } else {
		/*
		 * For x2APIC, the logical APIC ID is a read-only value that is
		 * derived from the x2APIC ID, thus the x2APIC ID can be found
		 * by reversing the calculation (stored in logid_index).  Note,
		 * bits 31:20 of the x2APIC ID aren't propagated to the logical
		 * ID, but KVM limits the x2APIC ID to KVM_MAX_VCPU_IDS.
		 */
456 physical_id = logid_index;
457 }
458
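	/*
	 * Worked example (illustrative): x2APIC ID 34 (0x22) has logical ID
	 * (2 << 16) | BIT(2), i.e. cluster 2, bit 2, so the caller computes
	 * logid_index = (2 << 4) + 2 = 34, which is exactly the x2APIC ID
	 * being recovered above.
	 */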
459 avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
460 }
461
462 /*
463 * A fast-path version of avic_kick_target_vcpus(), which attempts to match
464 * destination APIC ID to vCPU without looping through all vCPUs.
465 */
static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
				       u32 icrl, u32 icrh, u32 index)
468 {
469 int dest_mode = icrl & APIC_DEST_MASK;
470 int shorthand = icrl & APIC_SHORT_MASK;
471 struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
472 u32 dest;
473
474 if (shorthand != APIC_DEST_NOSHORT)
475 return -EINVAL;
476
477 if (apic_x2apic_mode(source))
478 dest = icrh;
479 else
480 dest = GET_XAPIC_DEST_FIELD(icrh);
481
482 if (dest_mode == APIC_DEST_PHYSICAL) {
483 /* broadcast destination, use slow path */
484 if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
485 return -EINVAL;
486 if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
487 return -EINVAL;
488
489 if (WARN_ON_ONCE(dest != index))
490 return -EINVAL;
491
492 avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
493 } else {
494 u32 *avic_logical_id_table;
495 unsigned long bitmap, i;
496 u32 cluster;
497
498 if (apic_x2apic_mode(source)) {
499 /* 16 bit dest mask, 16 bit cluster id */
500 bitmap = dest & 0xFFFF;
501 cluster = (dest >> 16) << 4;
502 } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
			/* 8 bit dest mask */
504 bitmap = dest;
505 cluster = 0;
506 } else {
			/* 4 bit dest mask, 4 bit cluster id */
508 bitmap = dest & 0xF;
509 cluster = (dest >> 4) << 2;
510 }
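
		/*
		 * Worked example (illustrative): in xAPIC cluster mode, a
		 * destination of 0x23 yields bitmap 0x3 and cluster (2 << 2),
		 * i.e. logical ID table indices 8 and 9; in x2APIC mode, a
		 * destination of 0x00020005 yields bitmap 0x5 and cluster
		 * (2 << 4), i.e. x2APIC IDs 32 and 34.
		 */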
511
512 /* Nothing to do if there are no destinations in the cluster. */
513 if (unlikely(!bitmap))
514 return 0;
515
516 if (apic_x2apic_mode(source))
517 avic_logical_id_table = NULL;
518 else
519 avic_logical_id_table = kvm_svm->avic_logical_id_table;
520
521 /*
522 * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
523 * IDs, thus each bit in the destination is guaranteed to map
524 * to at most one vCPU.
525 */
526 for_each_set_bit(i, &bitmap, 16)
527 avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
528 cluster + i, icrl);
529 }
530
531 return 0;
532 }
533
static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
				   u32 icrl, u32 icrh, u32 index)
536 {
537 u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
538 unsigned long i;
539 struct kvm_vcpu *vcpu;
540
541 if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
542 return;
543
544 trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);
545
546 /*
547 * Wake any target vCPUs that are blocking, i.e. waiting for a wake
548 * event. There's no need to signal doorbells, as hardware has handled
549 * vCPUs that were in guest at the time of the IPI, and vCPUs that have
550 * since entered the guest will have processed pending IRQs at VMRUN.
551 */
552 kvm_for_each_vcpu(i, vcpu, kvm) {
553 if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
554 dest, icrl & APIC_DEST_MASK))
555 avic_kick_vcpu(vcpu, icrl);
556 }
557 }
558
int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
560 {
561 struct vcpu_svm *svm = to_svm(vcpu);
562 u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
563 u32 icrl = svm->vmcb->control.exit_info_1;
564 u32 id = svm->vmcb->control.exit_info_2 >> 32;
565 u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
566 struct kvm_lapic *apic = vcpu->arch.apic;
567
568 trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
569
570 switch (id) {
571 case AVIC_IPI_FAILURE_INVALID_TARGET:
572 case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
573 /*
574 * Emulate IPIs that are not handled by AVIC hardware, which
575 * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
576 * if _any_ targets are invalid, e.g. if the logical mode mask
577 * is a superset of running vCPUs.
578 *
579 * The exit is a trap, e.g. ICR holds the correct value and RIP
580 * has been advanced, KVM is responsible only for emulating the
581 * IPI. Sadly, hardware may sometimes leave the BUSY flag set,
582 * in which case KVM needs to emulate the ICR write as well in
583 * order to clear the BUSY flag.
584 */
585 if (icrl & APIC_ICR_BUSY)
586 kvm_apic_write_nodecode(vcpu, APIC_ICR);
587 else
588 kvm_apic_send_ipi(apic, icrl, icrh);
589 break;
590 case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
591 /*
592 * At this point, we expect that the AVIC HW has already
593 * set the appropriate IRR bits on the valid target
594 * vcpus. So, we just need to kick the appropriate vcpu.
595 */
596 avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
597 break;
598 case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
599 WARN_ONCE(1, "Invalid backing page\n");
600 break;
601 case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR:
602 /* Invalid IPI with vector < 16 */
603 break;
604 default:
605 vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n");
606 }
607
608 return 1;
609 }
610
unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
612 {
613 if (is_guest_mode(vcpu))
614 return APICV_INHIBIT_REASON_NESTED;
615 return 0;
616 }
617
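/*
 * Worked example (illustrative): in flat mode, logical ID 0x08 resolves to
 * table index __ffs(0x08) = 3; in cluster mode, logical ID 0x21 is cluster 2,
 * bit 0, i.e. table index (2 << 2) + 0 = 8.
 */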
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
619 {
620 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
621 u32 cluster, index;
622
623 ldr = GET_APIC_LOGICAL_ID(ldr);
624
625 if (flat) {
626 cluster = 0;
627 } else {
628 cluster = (ldr >> 4);
629 if (cluster >= 0xf)
630 return NULL;
631 ldr &= 0xf;
632 }
633 if (!ldr || !is_power_of_2(ldr))
634 return NULL;
635
636 index = __ffs(ldr);
637 if (WARN_ON_ONCE(index > 7))
638 return NULL;
639 index += (cluster << 2);
640
641 return &kvm_svm->avic_logical_id_table[index];
642 }
643
static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
645 {
646 bool flat;
647 u32 *entry, new_entry;
648
649 flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
650 entry = avic_get_logical_id_entry(vcpu, ldr, flat);
651 if (!entry)
652 return;
653
654 new_entry = READ_ONCE(*entry);
655 new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
656 new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
657 new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
658 WRITE_ONCE(*entry, new_entry);
659 }
660
static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
662 {
663 struct vcpu_svm *svm = to_svm(vcpu);
664 bool flat = svm->dfr_reg == APIC_DFR_FLAT;
665 u32 *entry;
666
	/* Note: x2AVIC does not use the logical APIC ID table. */
668 if (apic_x2apic_mode(vcpu->arch.apic))
669 return;
670
671 entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
672 if (entry)
673 clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
674 }
675
static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
677 {
678 struct vcpu_svm *svm = to_svm(vcpu);
679 u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
680 u32 id = kvm_xapic_id(vcpu->arch.apic);
681
682 /* AVIC does not support LDR update for x2APIC */
683 if (apic_x2apic_mode(vcpu->arch.apic))
684 return;
685
686 if (ldr == svm->ldr_reg)
687 return;
688
689 avic_invalidate_logical_id_entry(vcpu);
690
691 svm->ldr_reg = ldr;
692 avic_ldr_write(vcpu, id, ldr);
693 }
694
static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
696 {
697 struct vcpu_svm *svm = to_svm(vcpu);
698 u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
699
700 if (svm->dfr_reg == dfr)
701 return;
702
703 avic_invalidate_logical_id_entry(vcpu);
704 svm->dfr_reg = dfr;
705 }
706
static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
708 {
709 u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
710 AVIC_UNACCEL_ACCESS_OFFSET_MASK;
711
712 switch (offset) {
713 case APIC_LDR:
714 avic_handle_ldr_update(vcpu);
715 break;
716 case APIC_DFR:
717 avic_handle_dfr_update(vcpu);
718 break;
719 case APIC_RRR:
720 /* Ignore writes to Read Remote Data, it's read-only. */
721 return 1;
722 default:
723 break;
724 }
725
726 kvm_apic_write_nodecode(vcpu, offset);
727 return 1;
728 }
729
static bool is_avic_unaccelerated_access_trap(u32 offset)
731 {
732 bool ret = false;
733
734 switch (offset) {
735 case APIC_ID:
736 case APIC_EOI:
737 case APIC_RRR:
738 case APIC_LDR:
739 case APIC_DFR:
740 case APIC_SPIV:
741 case APIC_ESR:
742 case APIC_ICR:
743 case APIC_LVTT:
744 case APIC_LVTTHMR:
745 case APIC_LVTPC:
746 case APIC_LVT0:
747 case APIC_LVT1:
748 case APIC_LVTERR:
749 case APIC_TMICT:
750 case APIC_TDCR:
751 ret = true;
752 break;
753 default:
754 break;
755 }
756 return ret;
757 }
758
int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
760 {
761 struct vcpu_svm *svm = to_svm(vcpu);
762 int ret = 0;
763 u32 offset = svm->vmcb->control.exit_info_1 &
764 AVIC_UNACCEL_ACCESS_OFFSET_MASK;
765 u32 vector = svm->vmcb->control.exit_info_2 &
766 AVIC_UNACCEL_ACCESS_VECTOR_MASK;
767 bool write = (svm->vmcb->control.exit_info_1 >> 32) &
768 AVIC_UNACCEL_ACCESS_WRITE_MASK;
769 bool trap = is_avic_unaccelerated_access_trap(offset);
770
771 trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
772 trap, write, vector);
773 if (trap) {
774 /* Handling Trap */
775 WARN_ONCE(!write, "svm: Handling trap read.\n");
776 ret = avic_unaccel_trap_write(vcpu);
777 } else {
778 /* Handling Fault */
779 ret = kvm_emulate_instruction(vcpu, 0);
780 }
781
782 return ret;
783 }
784
int avic_init_vcpu(struct vcpu_svm *svm)
786 {
787 int ret;
788 struct kvm_vcpu *vcpu = &svm->vcpu;
789
790 INIT_LIST_HEAD(&svm->ir_list);
791 spin_lock_init(&svm->ir_list_lock);
792
793 if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
794 return 0;
795
796 ret = avic_init_backing_page(vcpu);
797 if (ret)
798 return ret;
799
800 svm->dfr_reg = APIC_DFR_FLAT;
801
802 return ret;
803 }
804
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
806 {
807 avic_handle_dfr_update(vcpu);
808 avic_handle_ldr_update(vcpu);
809 }
810
static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
812 {
813 struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu;
814 unsigned long flags;
815
816 if (!vcpu)
817 return;
818
819 spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
820 list_del(&irqfd->vcpu_list);
821 spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
822 }
823
int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
			unsigned int host_irq, uint32_t guest_irq,
			struct kvm_vcpu *vcpu, u32 vector)
827 {
828 /*
829 * If the IRQ was affined to a different vCPU, remove the IRTE metadata
830 * from the *previous* vCPU's list.
831 */
832 svm_ir_list_del(irqfd);
833
834 if (vcpu) {
835 /*
836 * Try to enable guest_mode in IRTE, unless AVIC is inhibited,
837 * in which case configure the IRTE for legacy mode, but track
838 * the IRTE metadata so that it can be converted to guest mode
839 * if AVIC is enabled/uninhibited in the future.
840 */
841 struct amd_iommu_pi_data pi_data = {
842 .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
843 vcpu->vcpu_idx),
844 .is_guest_mode = kvm_vcpu_apicv_active(vcpu),
845 .vapic_addr = avic_get_backing_page_address(to_svm(vcpu)),
846 .vector = vector,
847 };
848 struct vcpu_svm *svm = to_svm(vcpu);
849 u64 entry;
850 int ret;
851
852 /*
853 * Prevent the vCPU from being scheduled out or migrated until
854 * the IRTE is updated and its metadata has been added to the
855 * list of IRQs being posted to the vCPU, to ensure the IRTE
856 * isn't programmed with stale pCPU/IsRunning information.
857 */
858 guard(spinlock_irqsave)(&svm->ir_list_lock);
859
860 /*
861 * Update the target pCPU for IOMMU doorbells if the vCPU is
862 * running. If the vCPU is NOT running, i.e. is blocking or
863 * scheduled out, KVM will update the pCPU info when the vCPU
864 * is awakened and/or scheduled in. See also avic_vcpu_load().
865 */
866 entry = svm->avic_physical_id_entry;
867 if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
868 pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
869 } else {
870 pi_data.cpu = -1;
871 pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
872 }
873
874 ret = irq_set_vcpu_affinity(host_irq, &pi_data);
875 if (ret)
876 return ret;
877
878 /*
879 * Revert to legacy mode if the IOMMU didn't provide metadata
880 * for the IRTE, which KVM needs to keep the IRTE up-to-date,
881 * e.g. if the vCPU is migrated or AVIC is disabled.
882 */
883 if (WARN_ON_ONCE(!pi_data.ir_data)) {
884 irq_set_vcpu_affinity(host_irq, NULL);
885 return -EIO;
886 }
887
888 irqfd->irq_bypass_data = pi_data.ir_data;
889 list_add(&irqfd->vcpu_list, &svm->ir_list);
890 return 0;
891 }
892 return irq_set_vcpu_affinity(host_irq, NULL);
893 }
894
895 enum avic_vcpu_action {
896 /*
897 * There is no need to differentiate between activate and deactivate,
898 * as KVM only refreshes AVIC state when the vCPU is scheduled in and
899 * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is
900 * being (de)activated.
901 */
902 AVIC_TOGGLE_ON_OFF = BIT(0),
903 AVIC_ACTIVATE = AVIC_TOGGLE_ON_OFF,
904 AVIC_DEACTIVATE = AVIC_TOGGLE_ON_OFF,
905
906 /*
907 * No unique action is required to deal with a vCPU that stops/starts
908 * running. A vCPU that starts running by definition stops blocking as
909 * well, and a vCPU that stops running can't have been blocking, i.e.
910 * doesn't need to toggle GALogIntr.
911 */
912 AVIC_START_RUNNING = 0,
913 AVIC_STOP_RUNNING = 0,
914
	/*
	 * When a vCPU starts blocking, KVM needs to set the GALogIntr flag
	 * in all associated IRTEs so that KVM can wake the vCPU if an IRQ is
	 * sent to the vCPU.
	 */
920 AVIC_START_BLOCKING = BIT(1),
921 };
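
/*
 * How the actions above are consumed (see avic_update_iommu_vcpu_affinity()
 * below): start/stop-running and start-blocking transitions update existing
 * IRTEs via amd_iommu_update_ga(), while AVIC_ACTIVATE/AVIC_DEACTIVATE toggle
 * guest mode via amd_iommu_activate_guest_mode() and
 * amd_iommu_deactivate_guest_mode().
 */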
922
static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu,
					    enum avic_vcpu_action action)
925 {
926 bool ga_log_intr = (action & AVIC_START_BLOCKING);
927 struct vcpu_svm *svm = to_svm(vcpu);
928 struct kvm_kernel_irqfd *irqfd;
929
930 lockdep_assert_held(&svm->ir_list_lock);
931
	/*
	 * Go through the per-vCPU ir_list and update all existing interrupt
	 * remapping table entries that target this vCPU.
	 */
936 if (list_empty(&svm->ir_list))
937 return;
938
939 list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
940 void *data = irqfd->irq_bypass_data;
941
942 if (!(action & AVIC_TOGGLE_ON_OFF))
943 WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr));
944 else if (cpu >= 0)
945 WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr));
946 else
947 WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data));
948 }
949 }
950
static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
			     enum avic_vcpu_action action)
953 {
954 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
955 int h_physical_id = kvm_cpu_get_apicid(cpu);
956 struct vcpu_svm *svm = to_svm(vcpu);
957 unsigned long flags;
958 u64 entry;
959
960 lockdep_assert_preemption_disabled();
961
962 if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
963 return;
964
965 if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
966 return;
967
968 /*
969 * Grab the per-vCPU interrupt remapping lock even if the VM doesn't
970 * _currently_ have assigned devices, as that can change. Holding
971 * ir_list_lock ensures that either svm_ir_list_add() will consume
972 * up-to-date entry information, or that this task will wait until
973 * svm_ir_list_add() completes to set the new target pCPU.
974 */
975 spin_lock_irqsave(&svm->ir_list_lock, flags);
976
977 entry = svm->avic_physical_id_entry;
978 WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
979
980 entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK |
981 AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
982 entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
983 entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
984
985 svm->avic_physical_id_entry = entry;
986
987 /*
988 * If IPI virtualization is disabled, clear IsRunning when updating the
989 * actual Physical ID table, so that the CPU never sees IsRunning=1.
990 * Keep the APIC ID up-to-date in the entry to minimize the chances of
991 * things going sideways if hardware peeks at the ID.
992 */
993 if (!enable_ipiv)
994 entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
995
996 WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
997
998 avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);
999
1000 spin_unlock_irqrestore(&svm->ir_list_lock, flags);
1001 }
1002
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1004 {
1005 /*
1006 * No need to update anything if the vCPU is blocking, i.e. if the vCPU
1007 * is being scheduled in after being preempted. The CPU entries in the
1008 * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
1009 * If the vCPU was migrated, its new CPU value will be stuffed when the
1010 * vCPU unblocks.
1011 */
1012 if (kvm_vcpu_is_blocking(vcpu))
1013 return;
1014
1015 __avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING);
1016 }
1017
static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
1019 {
1020 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
1021 struct vcpu_svm *svm = to_svm(vcpu);
1022 unsigned long flags;
1023 u64 entry = svm->avic_physical_id_entry;
1024
1025 lockdep_assert_preemption_disabled();
1026
1027 if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
1028 return;
1029
1030 /*
1031 * Take and hold the per-vCPU interrupt remapping lock while updating
1032 * the Physical ID entry even though the lock doesn't protect against
1033 * multiple writers (see above). Holding ir_list_lock ensures that
1034 * either svm_ir_list_add() will consume up-to-date entry information,
1035 * or that this task will wait until svm_ir_list_add() completes to
1036 * mark the vCPU as not running.
1037 */
1038 spin_lock_irqsave(&svm->ir_list_lock, flags);
1039
1040 avic_update_iommu_vcpu_affinity(vcpu, -1, action);
1041
1042 WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
1043
1044 /*
1045 * Keep the previous APIC ID in the entry so that a rogue doorbell from
1046 * hardware is at least restricted to a CPU associated with the vCPU.
1047 */
1048 entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1049
1050 if (enable_ipiv)
1051 WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
1052
1053 /*
1054 * Note! Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as
1055 * it's a synthetic flag that usurps an unused should-be-zero bit.
1056 */
1057 if (action & AVIC_START_BLOCKING)
1058 entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
1059
1060 svm->avic_physical_id_entry = entry;
1061
1062 spin_unlock_irqrestore(&svm->ir_list_lock, flags);
1063 }
1064
void avic_vcpu_put(struct kvm_vcpu *vcpu)
1066 {
1067 /*
1068 * Note, reading the Physical ID entry outside of ir_list_lock is safe
1069 * as only the pCPU that has loaded (or is loading) the vCPU is allowed
1070 * to modify the entry, and preemption is disabled. I.e. the vCPU
1071 * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
1072 * recursively.
1073 */
1074 u64 entry = to_svm(vcpu)->avic_physical_id_entry;
1075
	/*
	 * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the
	 * vCPU is preempted while it's in the process of blocking.  WARN if
	 * the vCPU wasn't running and isn't blocking, as KVM shouldn't attempt
	 * to put the AVIC if it wasn't previously loaded.
	 */
1082 if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) {
1083 if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu)))
1084 return;
1085
1086 /*
1087 * The vCPU was preempted while blocking, ensure its IRTEs are
1088 * configured to generate GA Log Interrupts.
1089 */
1090 if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR))))
1091 return;
1092 }
1093
1094 __avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING :
1095 AVIC_STOP_RUNNING);
1096 }
1097
void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
1099 {
1100 struct vcpu_svm *svm = to_svm(vcpu);
1101 struct vmcb *vmcb = svm->vmcb01.ptr;
1102
1103 if (!lapic_in_kernel(vcpu) || !enable_apicv)
1104 return;
1105
1106 if (kvm_vcpu_apicv_active(vcpu)) {
		/*
		 * During AVIC temporary deactivation, the guest could update
		 * the APIC ID, DFR and LDR registers, which would not be
		 * trapped by avic_unaccelerated_access_interception().  In
		 * this case, KVM needs to check and update the AVIC logical
		 * APIC ID table accordingly before re-activating.
		 */
1114 avic_apicv_post_state_restore(vcpu);
1115 avic_activate_vmcb(svm);
1116 } else {
1117 avic_deactivate_vmcb(svm);
1118 }
1119 vmcb_mark_dirty(vmcb, VMCB_AVIC);
1120 }
1121
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
1123 {
1124 if (!enable_apicv)
1125 return;
1126
1127 /* APICv should only be toggled on/off while the vCPU is running. */
1128 WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu));
1129
1130 avic_refresh_virtual_apic_mode(vcpu);
1131
1132 if (kvm_vcpu_apicv_active(vcpu))
1133 __avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE);
1134 else
1135 __avic_vcpu_put(vcpu, AVIC_DEACTIVATE);
1136 }
1137
void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
1139 {
1140 if (!kvm_vcpu_apicv_active(vcpu))
1141 return;
1142
1143 /*
1144 * Unload the AVIC when the vCPU is about to block, _before_ the vCPU
1145 * actually blocks.
1146 *
1147 * Note, any IRQs that arrive before IsRunning=0 will not cause an
1148 * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles
1149 * this by checking vIRR one last time before blocking. The memory
1150 * barrier implicit in set_current_state orders writing IsRunning=0
1151 * before reading the vIRR. The processor needs a matching memory
1152 * barrier on interrupt delivery between writing IRR and reading
	 * IsRunning; the lack of this barrier might be the cause of errata #1235.
	 *
	 * Set IsRunning=0 even if guest IRQs are disabled, i.e. even if KVM
1156 * doesn't need to detect events for scheduling purposes. The doorbell
1157 * used to signal running vCPUs cannot be blocked, i.e. will perturb the
1158 * CPU and cause noisy neighbor problems if the VM is sending interrupts
1159 * to the vCPU while it's scheduled out.
1160 */
1161 __avic_vcpu_put(vcpu, AVIC_START_BLOCKING);
1162 }
1163
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
1165 {
1166 if (!kvm_vcpu_apicv_active(vcpu))
1167 return;
1168
1169 avic_vcpu_load(vcpu, vcpu->cpu);
1170 }
1171
static bool __init avic_want_avic_enabled(void)
1173 {
	/*
	 * In "auto" mode, enable AVIC by default for Zen4+ if x2AVIC is
	 * supported (to avoid enabling partial support by default, and because
	 * x2AVIC should be supported by all Zen4+ CPUs).  Explicitly check for
	 * family 0x1A and later (Zen5+), as the kernel's synthetic ZenX flags
	 * aren't inclusive of previous generations, i.e. the kernel will set
	 * at most one ZenX feature flag.
	 */
1182 if (avic == AVIC_AUTO_MODE)
1183 avic = boot_cpu_has(X86_FEATURE_X2AVIC) &&
1184 (boot_cpu_data.x86 > 0x19 || cpu_feature_enabled(X86_FEATURE_ZEN4));
1185
1186 if (!avic || !npt_enabled)
1187 return false;
1188
1189 /* AVIC is a prerequisite for x2AVIC. */
1190 if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
1191 if (boot_cpu_has(X86_FEATURE_X2AVIC))
1192 pr_warn(FW_BUG "Cannot enable x2AVIC, AVIC is unsupported\n");
1193 return false;
1194 }
1195
1196 if (cc_platform_has(CC_ATTR_HOST_SEV_SNP) &&
1197 !boot_cpu_has(X86_FEATURE_HV_INUSE_WR_ALLOWED)) {
1198 pr_warn("AVIC disabled: missing HvInUseWrAllowed on SNP-enabled system\n");
1199 return false;
1200 }
1201
	/*
	 * Print a scary message if AVIC is force enabled to make it abundantly
	 * clear that ignoring CPUID could have repercussions.  See the
	 * Revision Guide for the specific AMD processor for more details.
	 */
1207 if (!boot_cpu_has(X86_FEATURE_AVIC))
1208 pr_warn("AVIC unsupported in CPUID but force enabled, your system might crash and burn\n");
1209
1210 return true;
1211 }
1212
/*
 * Note:
 * - The module param "avic" enables both xAPIC and x2APIC mode.
 * - The hypervisor can support both xAVIC and x2AVIC in the same guest.
 * - The mode can be switched at run-time.
 */
bool __init avic_hardware_setup(void)
1220 {
1221 avic = avic_want_avic_enabled();
1222 if (!avic)
1223 return false;
1224
1225 pr_info("AVIC enabled\n");
1226
1227 /* AVIC is a prerequisite for x2AVIC. */
1228 x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
1229 if (x2avic_enabled)
1230 pr_info("x2AVIC enabled\n");
1231 else
1232 svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
1233
1234 /*
1235 * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
1236 * due to erratum 1235, which results in missed VM-Exits on the sender
1237 * and thus missed wake events for blocking vCPUs due to the CPU
1238 * failing to see a software update to clear IsRunning.
1239 */
1240 enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17;
1241
1242 amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
1243
1244 return true;
1245 }
1246