// SPDX-License-Identifier: GPL-2.0-only

#include <linux/cpu.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/uaccess.h>

#include <kvm/arm_vgic.h>

#include <asm/kvm_arm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_nested.h>

#include "vgic.h"

#define ICH_LRN(n)	(ICH_LR0_EL2 + (n))
#define ICH_AP0RN(n)	(ICH_AP0R0_EL2 + (n))
#define ICH_AP1RN(n)	(ICH_AP1R0_EL2 + (n))

struct mi_state {
	u16	eisr;
	u16	elrsr;
	bool	pend;
};
/*
 * The shadow registers loaded to the hardware when running an L2 guest
 * with the virtual IMO/FMO bits set.
 */
struct shadow_if {
	struct vgic_v3_cpu_if	cpuif;
	unsigned long		lr_map;
};

static DEFINE_PER_CPU(struct shadow_if, shadow_if);

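/*
 * Map an L1 LR index to its slot in the compacted shadow LR array by
 * counting how many lower-numbered L1 LRs made it into the shadow state.
 *
 * Illustrative example: with lr_map == 0b1010, L1's LR3 lands in shadow
 * slot 1, since only LR1 sits below it in the map.
 */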
static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx)
{
	return hweight16(shadow_if->lr_map & (BIT(idx) - 1));
}

/*
 * Nesting GICv3 support
 *
 * On a non-nesting VM (only running at EL0/EL1), the host hypervisor
 * completely controls the interrupts injected via the list registers.
 * Consequently, most of the state that is modified by the guest (by ACK-ing
 * and EOI-ing interrupts) is synced by KVM on each entry/exit, so that we
 * keep a semi-consistent view of the interrupts.
 *
 * This still applies for an NV guest, but only while "InHost" (either
 * running at EL2, or at EL0 with HCR_EL2.{E2H,TGE}=={1,1}).
 *
 * When running an L2 guest ("not InHost"), things are radically different,
 * as the L1 guest is in charge of provisioning the interrupts via its own
 * view of the ICH_LR*_EL2 registers, which conveniently live in the VNCR
 * page. This means that the flow described above does work (there is no
 * state to rebuild in the L0 hypervisor), and that most things happen on L2
 * load/put:
 *
 * - on L2 load: move the in-memory L1 vGIC configuration into a shadow,
 *   per-CPU data structure that is used to populate the actual LRs. This is
 *   an extra copy that we could avoid, but life is short. In the process,
 *   we remap any interrupt that has the HW bit set to the mapped interrupt
 *   on the host, should the host consider it a HW one. This allows the HW
 *   deactivation to take its course, such as for the timer.
 *
 * - on L2 put: perform the inverse transformation, so that the result of L2
 *   running becomes visible to L1 in the VNCR-accessible registers.
 *
 * - there is nothing to do on L2 entry, as everything will have happened
 *   on load. However, this is the point where we detect that an interrupt
 *   is targeting L1 and prepare the grand switcheroo.
 *
 * - on L2 exit: emulate the HW bit, and deactivate the corresponding L1
 *   interrupt. The L0 active state will be cleared by the HW if the L1
 *   interrupt was itself backed by a HW interrupt.
 *
 * Maintenance Interrupt (MI) management:
 *
 * Since the L2 guest runs the vgic in its full glory, MIs get delivered and
 * used as a handover point between L2 and L1.
 *
 * - on delivery of an MI to L0 while L2 is running: make the L1 MI pending,
 *   and let it rip. This will initiate a vcpu_put() on L2, and allow L1 to
 *   run and process the MI.
 *
 * - the L1 MI is a fully virtual interrupt, not linked to the host's MI. Its
 *   state must be computed at each entry/exit of the guest, much like we do
 *   it for the PMU interrupt.
 *
 * - because most of the ICH_*_EL2 registers live in the VNCR page, the
 *   quality of emulation is poor: L1 can set up the vgic so that an MI would
 *   immediately fire, and not observe anything until the next exit. Trying
 *   to read ICH_MISR_EL2 would do the trick, for example.
 *
 * System register emulation:
 *
 * We get two classes of registers:
 *
 * - those backed by memory (LRs, APRs, HCR, VMCR): L1 can freely access
 *   them, and L0 doesn't see a thing.
 *
 * - those that always trap (ELRSR, EISR, MISR): these are status registers
 *   that are built on the fly based on the in-memory state.
 *
 * Only L1 can access the ICH_*_EL2 registers. A non-NV L2 obviously cannot,
 * and an NV L2 would either access the VNCR page provided by L1 (memory
 * based registers), or see the access redirected to L1 (registers that
 * trap) thanks to NV being set by L1.
 */

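/*
 * True when the vcpu is about to run an L2 guest with the virtual IMO/FMO
 * bits set, i.e. when the shadow vgic state described above is in use.
 */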
bool vgic_state_is_nested(struct kvm_vcpu *vcpu)
{
	u64 xmo;

	if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
		xmo = __vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_IMO | HCR_FMO);
		WARN_ONCE(xmo && xmo != (HCR_IMO | HCR_FMO),
			  "Separate virtual IRQ/FIQ settings not supported\n");

		return !!xmo;
	}

	return false;
}

static struct shadow_if *get_shadow_if(void)
{
	return this_cpu_ptr(&shadow_if);
}

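/*
 * An LR flags an EOI maintenance interrupt when it has the EOI bit set,
 * no HW bit, and is in the invalid (inactive) state.
 */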
static bool lr_triggers_eoi(u64 lr)
{
	return !(lr & (ICH_LR_STATE | ICH_LR_HW)) && (lr & ICH_LR_EOI);
}

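/*
 * Walk the in-memory view of the L1 LRs and derive the status bits that
 * feed the maintenance interrupt logic: EISR, ELRSR, and whether any LR
 * is still pending.
 */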
static void vgic_compute_mi_state(struct kvm_vcpu *vcpu, struct mi_state *mi_state)
{
	u16 eisr = 0, elrsr = 0;
	bool pend = false;

	for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));

		if (lr_triggers_eoi(lr))
			eisr |= BIT(i);
		if (!(lr & ICH_LR_STATE))
			elrsr |= BIT(i);
		pend |= (lr & ICH_LR_PENDING_BIT);
	}

	mi_state->eisr = eisr;
	mi_state->elrsr = elrsr;
	mi_state->pend = pend;
}

u16 vgic_v3_get_eisr(struct kvm_vcpu *vcpu)
{
	struct mi_state mi_state;

	vgic_compute_mi_state(vcpu, &mi_state);
	return mi_state.eisr;
}

u16 vgic_v3_get_elrsr(struct kvm_vcpu *vcpu)
{
	struct mi_state mi_state;

	vgic_compute_mi_state(vcpu, &mi_state);
	return mi_state.elrsr;
}

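/*
 * Build ICH_MISR_EL2 on the fly from the in-memory LR/HCR/VMCR state, as
 * described in the "System register emulation" section above.
 */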
u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu)
{
	struct mi_state mi_state;
	u64 reg = 0, hcr, vmcr;

	hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
	vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);

	vgic_compute_mi_state(vcpu, &mi_state);

	if (mi_state.eisr)
		reg |= ICH_MISR_EL2_EOI;

	if (hcr & ICH_HCR_EL2_UIE) {
		int used_lrs = kvm_vgic_global_state.nr_lr;

		used_lrs -= hweight16(mi_state.elrsr);
		reg |= (used_lrs <= 1) ? ICH_MISR_EL2_U : 0;
	}

	if ((hcr & ICH_HCR_EL2_LRENPIE) && FIELD_GET(ICH_HCR_EL2_EOIcount_MASK, hcr))
		reg |= ICH_MISR_EL2_LRENP;

	if ((hcr & ICH_HCR_EL2_NPIE) && !mi_state.pend)
		reg |= ICH_MISR_EL2_NP;

	if ((hcr & ICH_HCR_EL2_VGrp0EIE) && (vmcr & ICH_VMCR_ENG0_MASK))
		reg |= ICH_MISR_EL2_VGrp0E;

	if ((hcr & ICH_HCR_EL2_VGrp0DIE) && !(vmcr & ICH_VMCR_ENG0_MASK))
		reg |= ICH_MISR_EL2_VGrp0D;

	if ((hcr & ICH_HCR_EL2_VGrp1EIE) && (vmcr & ICH_VMCR_ENG1_MASK))
		reg |= ICH_MISR_EL2_VGrp1E;

	if ((hcr & ICH_HCR_EL2_VGrp1DIE) && !(vmcr & ICH_VMCR_ENG1_MASK))
		reg |= ICH_MISR_EL2_VGrp1D;

	return reg;
}

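/*
 * For an LR with the HW bit set, the pINTID programmed by the guest
 * hypervisor is one of L1's virtual interrupts. Replace it with the host
 * interrupt backing it, or clear the HW bit if there is no such mapping.
 */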
static u64 translate_lr_pintid(struct kvm_vcpu *vcpu, u64 lr)
{
	struct vgic_irq *irq;

	if (!(lr & ICH_LR_HW))
		return lr;

	/* We have the HW bit set, check for validity of pINTID */
	irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
	/* If there was no real mapping, nuke the HW bit */
	if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI)
		lr &= ~ICH_LR_HW;

	/* Translate the virtual mapping to the real one, even if invalid */
	if (irq) {
		lr &= ~ICH_LR_PHYS_ID_MASK;
		lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);
		vgic_put_irq(vcpu->kvm, irq);
	}

	return lr;
}

/*
 * For LRs which have the HW bit set, such as timer interrupts, we modify
 * them to have the host hardware interrupt number instead of the virtual
 * one programmed by the guest hypervisor.
 */
static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu,
				     struct vgic_v3_cpu_if *s_cpu_if)
{
	struct shadow_if *shadow_if;

	shadow_if = container_of(s_cpu_if, struct shadow_if, cpuif);
	shadow_if->lr_map = 0;

	for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));

		if (!(lr & ICH_LR_STATE))
			continue;

		lr = translate_lr_pintid(vcpu, lr);

		s_cpu_if->vgic_lr[hweight16(shadow_if->lr_map)] = lr;
		shadow_if->lr_map |= BIT(i);
	}

	s_cpu_if->used_lrs = hweight16(shadow_if->lr_map);
}

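/*
 * Emulate the HW bit on L2 exit: for each HW LR provisioned by the guest
 * hypervisor, clear the active state of the corresponding L1 interrupt
 * once the shadow LR has been emptied by the L2 guest.
 */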
void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
{
	struct shadow_if *shadow_if = get_shadow_if();
	int i;

	for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
		struct vgic_irq *irq;

		if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE))
			continue;

		/*
		 * If we had a HW LR programmed by the guest hypervisor, we
		 * need to emulate the HW effect between the guest hypervisor
		 * and the nested guest.
		 */
		irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
		if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */
			continue;

		lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i));
		if (!(lr & ICH_LR_STATE))
			irq->active = false;

		vgic_put_irq(vcpu->kvm, irq);
	}
}

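/*
 * Populate the shadow CPU interface from L1's in-memory view of the
 * ICH_*_EL2 registers, adding whatever trap bits the host itself needs.
 */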
static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu,
					struct vgic_v3_cpu_if *s_cpu_if)
{
	struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3;
	u64 val = 0;
	int i;

	/*
	 * If we're on a system with a broken vgic that requires
	 * trapping, propagate the trapping requirements.
	 *
	 * Ah, the smell of rotten fruits...
	 */
	if (static_branch_unlikely(&vgic_v3_cpuif_trap))
		val = host_if->vgic_hcr & (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 |
					   ICH_HCR_EL2_TC | ICH_HCR_EL2_TDIR);

	s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) | val;
	s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);
	s_cpu_if->vgic_sre = host_if->vgic_sre;

	for (i = 0; i < 4; i++) {
		s_cpu_if->vgic_ap0r[i] = __vcpu_sys_reg(vcpu, ICH_AP0RN(i));
		s_cpu_if->vgic_ap1r[i] = __vcpu_sys_reg(vcpu, ICH_AP1RN(i));
	}

	vgic_v3_create_shadow_lr(vcpu, s_cpu_if);
}

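/*
 * L2 load: build the shadow state from the VNCR-based L1 view and load it
 * into the hardware, as described in the big comment above.
 */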
void vgic_v3_load_nested(struct kvm_vcpu *vcpu)
{
	struct shadow_if *shadow_if = get_shadow_if();
	struct vgic_v3_cpu_if *cpu_if = &shadow_if->cpuif;

	BUG_ON(!vgic_state_is_nested(vcpu));

	vgic_v3_create_shadow_state(vcpu, cpu_if);

	__vgic_v3_restore_vmcr_aprs(cpu_if);
	__vgic_v3_activate_traps(cpu_if);

	__vgic_v3_restore_state(cpu_if);

	/*
	 * Propagate the number of used LRs for the benefit of the HYP
	 * GICv3 emulation code. Yes, this is a pretty sorry hack.
	 */
	vcpu->arch.vgic_cpu.vgic_v3.used_lrs = cpu_if->used_lrs;
}

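/*
 * L2 put: save the hardware state back into the shadow copy and fold the
 * result (EOIcount, VMCR, APRs, LR state) back into L1's view.
 */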
void vgic_v3_put_nested(struct kvm_vcpu *vcpu)
{
	struct shadow_if *shadow_if = get_shadow_if();
	struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif;
	u64 val;
	int i;

	__vgic_v3_save_vmcr_aprs(s_cpu_if);
	__vgic_v3_deactivate_traps(s_cpu_if);
	__vgic_v3_save_state(s_cpu_if);

	/*
	 * Translate the shadow state HW fields back to the virtual ones
	 * before copying the shadow struct back to the nested one.
	 */
	val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
	val &= ~ICH_HCR_EL2_EOIcount_MASK;
	val |= (s_cpu_if->vgic_hcr & ICH_HCR_EL2_EOIcount_MASK);
	__vcpu_assign_sys_reg(vcpu, ICH_HCR_EL2, val);
	__vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, s_cpu_if->vgic_vmcr);

	for (i = 0; i < 4; i++) {
		__vcpu_assign_sys_reg(vcpu, ICH_AP0RN(i), s_cpu_if->vgic_ap0r[i]);
		__vcpu_assign_sys_reg(vcpu, ICH_AP1RN(i), s_cpu_if->vgic_ap1r[i]);
	}

	for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
		val = __vcpu_sys_reg(vcpu, ICH_LRN(i));

		val &= ~ICH_LR_STATE;
		val |= s_cpu_if->vgic_lr[lr_map_idx_to_shadow_idx(shadow_if, i)] & ICH_LR_STATE;

		__vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val);
	}

	vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0;
}

/*
 * If we exit an L2 VM with a pending maintenance interrupt from the GIC,
 * then we need to forward this to L1 so that it can re-sync the appropriate
 * LRs and sample level-triggered interrupts again.
 */
void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu)
{
	bool state = read_sysreg_s(SYS_ICH_MISR_EL2);

	/* This will force a switch back to L1 if the level is high */
	kvm_vgic_inject_irq(vcpu->kvm, vcpu,
			    vcpu->kvm->arch.vgic.mi_intid, state, vcpu);

	sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0);
}

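/*
 * Recompute the level of the (fully virtual) L1 maintenance interrupt:
 * it is high when the L1 vgic is enabled and ICH_MISR_EL2 would read as
 * non-zero.
 */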
void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu)
{
	bool level;

	level = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En;
	if (level)
		level &= !!vgic_v3_get_misr(vcpu);
	kvm_vgic_inject_irq(vcpu->kvm, vcpu,
			    vcpu->kvm->arch.vgic.mi_intid, level, vcpu);
}