xref: /linux/arch/arm64/kvm/vgic/vgic-v3-nested.c (revision 51d90a15fedf8366cb96ef68d0ea2d0bf15417d2)
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/cpu.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/uaccess.h>

#include <kvm/arm_vgic.h>

#include <asm/kvm_arm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_nested.h>

#include "vgic.h"

#define ICH_LRN(n)	(ICH_LR0_EL2 + (n))
#define ICH_AP0RN(n)	(ICH_AP0R0_EL2 + (n))
#define ICH_AP1RN(n)	(ICH_AP1R0_EL2 + (n))

struct mi_state {
	u16	eisr;
	u16	elrsr;
	bool	pend;
};

/*
 * The shadow registers loaded to the hardware when running a L2 guest
 * with the virtual IMO/FMO bits set.
 */
struct shadow_if {
	struct vgic_v3_cpu_if	cpuif;
	unsigned long		lr_map;
};

static DEFINE_PER_CPU(struct shadow_if, shadow_if);

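/*
 * The shadow LRs are a dense array holding only the L1 LRs that are
 * actually in use; lr_map has one bit set per copied L1 LR. The shadow
 * slot of L1 LR number 'idx' is therefore the number of in-use LRs
 * below it.
 */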
static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx)
{
	return hweight16(shadow_if->lr_map & (BIT(idx) - 1));
}

/*
 * Nesting GICv3 support
 *
 * On a non-nesting VM (only running at EL0/EL1), the host hypervisor
 * completely controls the interrupts injected via the list registers.
 * Consequently, most of the state that is modified by the guest (by ACK-ing
 * and EOI-ing interrupts) is synced by KVM on each entry/exit, so that we
 * keep a semi-consistent view of the interrupts.
 *
 * This still applies for a NV guest, but only while "InHost" (either
 * running at EL2, or at EL0 with HCR_EL2.{E2H,TGE}=={1,1}).
 *
 * When running a L2 guest ("not InHost"), things are radically different,
 * as the L1 guest is in charge of provisioning the interrupts via its own
 * view of the ICH_LR*_EL2 registers, which conveniently live in the VNCR
 * page.  This means that the flow described above does work (there is no
 * state to rebuild in the L0 hypervisor), and that most things happen on L2
 * load/put:
 *
 * - on L2 load: move the in-memory L1 vGIC configuration into a shadow,
 *   per-CPU data structure that is used to populate the actual LRs. This is
 *   an extra copy that we could avoid, but life is short. In the process,
 *   we remap any interrupt that has the HW bit set to the mapped interrupt
 *   on the host, should the host consider it a HW one. This allows the HW
 *   deactivation to take its course, such as for the timer.
 *
 * - on L2 put: perform the inverse transformation, so that the result of L2
 *   running becomes visible to L1 in the VNCR-accessible registers.
 *
 * - there is nothing to do on L2 entry apart from enabling the vgic, as
 *   everything will have happened on load. However, this is the point where
 *   we detect an interrupt targeting L1 and prepare the grand switcheroo.
 *
 * - on L2 exit: resync the LRs and VMCR, emulate the HW bit, and deactivate
 *   the corresponding L1 interrupt. The L0 active state will be cleared by
 *   the HW if the L1 interrupt was itself backed by a HW interrupt.
 *
 * Maintenance Interrupt (MI) management:
 *
 * Since the L2 guest runs the vgic in its full glory, MIs get delivered and
 * used as a handover point between L2 and L1.
 *
 * - on delivery of an MI to L0 while L2 is running: make the L1 MI pending,
 *   and let it rip. This will initiate a vcpu_put() on L2, and allow L1 to
 *   run and process the MI.
 *
 * - L1 MI is a fully virtual interrupt, not linked to the host's MI. Its
 *   state must be computed at each entry/exit of the guest, much like we do
 *   for the PMU interrupt.
 *
 * - because most of the ICH_*_EL2 registers live in the VNCR page, the
 *   quality of emulation is poor: L1 can set up the vgic so that an MI would
 *   immediately fire, and not observe anything until the next exit.
 *   Similarly, a pending MI is not immediately disabled by clearing
 *   ICH_HCR_EL2.En. Trying to read ICH_MISR_EL2 would do the trick, for
 *   example.
 *
 * System register emulation:
 *
 * We get two classes of registers:
 *
 * - those backed by memory (LRs, APRs, HCR, VMCR): L1 can freely access
 *   them, and L0 doesn't see a thing.
 *
 * - those that always trap (ELRSR, EISR, MISR): these are status registers
 *   that are built on the fly based on the in-memory state.
 *
 * Only L1 can access the ICH_*_EL2 registers. A non-NV L2 obviously cannot,
 * and a NV L2 would either access the VNCR page provided by L1 (memory
 * based registers), or see the access redirected to L1 (registers that
 * trap) thanks to NV being set by L1.
 */

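/*
 * The shadow state is only used when running an L2 guest with the
 * virtual IMO/FMO bits set. Setting only one of the two is not
 * supported and triggers a one-off warning.
 */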
bool vgic_state_is_nested(struct kvm_vcpu *vcpu)
{
	u64 xmo;

	if (is_nested_ctxt(vcpu)) {
		xmo = __vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_IMO | HCR_FMO);
		WARN_ONCE(xmo && xmo != (HCR_IMO | HCR_FMO),
			  "Separate virtual IRQ/FIQ settings not supported\n");

		return !!xmo;
	}

	return false;
}

static struct shadow_if *get_shadow_if(void)
{
	return this_cpu_ptr(&shadow_if);
}

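/*
 * An LR contributes to EISR when it is invalid (no pending/active
 * state), purely virtual (HW bit clear) and has its EOI bit set,
 * matching the architected ICH_EISR_EL2 conditions.
 */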
static bool lr_triggers_eoi(u64 lr)
{
	return !(lr & (ICH_LR_STATE | ICH_LR_HW)) && (lr & ICH_LR_EOI);
}

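/*
 * Compute the EISR/ELRSR bitmaps and the "any pending interrupt" flag
 * from the in-memory view of the L1 list registers. This backs the
 * emulation of the always-trapping status registers.
 */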
static void vgic_compute_mi_state(struct kvm_vcpu *vcpu, struct mi_state *mi_state)
{
	u16 eisr = 0, elrsr = 0;
	bool pend = false;

	for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));

		if (lr_triggers_eoi(lr))
			eisr |= BIT(i);
		if (!(lr & ICH_LR_STATE))
			elrsr |= BIT(i);
		pend |= (lr & ICH_LR_PENDING_BIT);
	}

	mi_state->eisr	= eisr;
	mi_state->elrsr	= elrsr;
	mi_state->pend	= pend;
}

u16 vgic_v3_get_eisr(struct kvm_vcpu *vcpu)
{
	struct mi_state mi_state;

	vgic_compute_mi_state(vcpu, &mi_state);
	return mi_state.eisr;
}

u16 vgic_v3_get_elrsr(struct kvm_vcpu *vcpu)
{
	struct mi_state mi_state;

	vgic_compute_mi_state(vcpu, &mi_state);
	return mi_state.elrsr;
}

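/*
 * Emulate a read of ICH_MISR_EL2: each maintenance interrupt source is
 * derived from the enable bits in the L1 view of ICH_HCR_EL2, combined
 * with the current LR and ICH_VMCR_EL2 state.
 */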
u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu)
{
	struct mi_state mi_state;
	u64 reg = 0, hcr, vmcr;

	hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
	vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);

	vgic_compute_mi_state(vcpu, &mi_state);

	if (mi_state.eisr)
		reg |= ICH_MISR_EL2_EOI;

	if (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_UIE) {
		int used_lrs = kvm_vgic_global_state.nr_lr;

		used_lrs -= hweight16(mi_state.elrsr);
		reg |= (used_lrs <= 1) ? ICH_MISR_EL2_U : 0;
	}

	if ((hcr & ICH_HCR_EL2_LRENPIE) && FIELD_GET(ICH_HCR_EL2_EOIcount_MASK, hcr))
		reg |= ICH_MISR_EL2_LRENP;

	if ((hcr & ICH_HCR_EL2_NPIE) && !mi_state.pend)
		reg |= ICH_MISR_EL2_NP;

	if ((hcr & ICH_HCR_EL2_VGrp0EIE) && (vmcr & ICH_VMCR_ENG0_MASK))
		reg |= ICH_MISR_EL2_VGrp0E;

	if ((hcr & ICH_HCR_EL2_VGrp0DIE) && !(vmcr & ICH_VMCR_ENG0_MASK))
		reg |= ICH_MISR_EL2_VGrp0D;

	if ((hcr & ICH_HCR_EL2_VGrp1EIE) && (vmcr & ICH_VMCR_ENG1_MASK))
		reg |= ICH_MISR_EL2_VGrp1E;

	if ((hcr & ICH_HCR_EL2_VGrp1DIE) && !(vmcr & ICH_VMCR_ENG1_MASK))
		reg |= ICH_MISR_EL2_VGrp1D;

	return reg;
}

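/*
 * An LR with the HW bit set must carry a pINTID that the host itself
 * considers a HW interrupt. If it doesn't, the HW bit is cleared;
 * otherwise the pINTID is rewritten to the host's INTID so that the
 * physical deactivation can take its course.
 */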
static u64 translate_lr_pintid(struct kvm_vcpu *vcpu, u64 lr)
{
	struct vgic_irq *irq;

	if (!(lr & ICH_LR_HW))
		return lr;

	/* We have the HW bit set, check for validity of pINTID */
	irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
	/* If there was no real mapping, nuke the HW bit */
	if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI)
		lr &= ~ICH_LR_HW;

	/* Translate the virtual mapping to the real one, even if invalid */
	if (irq) {
		lr &= ~ICH_LR_PHYS_ID_MASK;
		lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);
		vgic_put_irq(vcpu->kvm, irq);
	}

	return lr;
}

/*
 * For LRs which have the HW bit set, such as timer interrupts, we modify
 * them to have the host hardware interrupt number instead of the virtual
 * one programmed by the guest hypervisor.
 */
static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu,
				     struct vgic_v3_cpu_if *s_cpu_if)
{
	struct shadow_if *shadow_if;

	shadow_if = container_of(s_cpu_if, struct shadow_if, cpuif);
	shadow_if->lr_map = 0;

	for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));

		if (!(lr & ICH_LR_STATE))
			continue;

		lr = translate_lr_pintid(vcpu, lr);

		s_cpu_if->vgic_lr[hweight16(shadow_if->lr_map)] = lr;
		shadow_if->lr_map |= BIT(i);
	}

	s_cpu_if->used_lrs = hweight16(shadow_if->lr_map);
}

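/*
 * On entry to L2, program the L1 view of ICH_HCR_EL2 into the hardware,
 * OR-ed with the extra trap bits that L0 requires while L2 runs.
 */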
void vgic_v3_flush_nested(struct kvm_vcpu *vcpu)
{
	u64 val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);

	write_sysreg_s(val | vgic_ich_hcr_trap_bits(), SYS_ICH_HCR_EL2);
}

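/*
 * On exit from L2, fold the hardware LR state back into the L1 copies,
 * emulate the HW bit by deactivating the L1 interrupt behind any HW LR
 * that has become invalid, and recompute the L1 maintenance interrupt.
 */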
void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
{
	struct shadow_if *shadow_if = get_shadow_if();
	int i;

	for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
		u64 val, host_lr, lr;

		host_lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i));

		/* Propagate the new LR state */
		lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
		val = lr & ~ICH_LR_STATE;
		val |= host_lr & ICH_LR_STATE;
		__vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val);

		/*
		 * Deactivation of a HW interrupt: the LR must have the HW
		 * bit set, have been in a non-invalid state before the run,
		 * and now be in an invalid state. If any of that doesn't
		 * hold, we're done with this LR.
		 */
		if (!((lr & ICH_LR_HW) && (lr & ICH_LR_STATE) &&
		      !(host_lr & ICH_LR_STATE)))
			continue;

		/*
		 * If we had a HW lr programmed by the guest hypervisor, we
		 * need to emulate the HW effect between the guest hypervisor
		 * and the nested guest.
		 */
		vgic_v3_deactivate(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
	}

	/* We need these to be synchronised to generate the MI */
	__vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, read_sysreg_s(SYS_ICH_VMCR_EL2));
	__vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, &=, ~ICH_HCR_EL2_EOIcount);
	__vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, |=, read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EL2_EOIcount);

	write_sysreg_s(0, SYS_ICH_HCR_EL2);
	isb();

	vgic_v3_nested_update_mi(vcpu);
}

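/*
 * Populate the shadow interface from the in-memory L1 state: HCR, VMCR
 * and the APRs are copied verbatim, SRE is inherited from the host
 * interface, and the LRs are compacted and translated by
 * vgic_v3_create_shadow_lr().
 */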
static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu,
					struct vgic_v3_cpu_if *s_cpu_if)
{
	struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3;
	int i;

	s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
	s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);
	s_cpu_if->vgic_sre = host_if->vgic_sre;

	for (i = 0; i < 4; i++) {
		s_cpu_if->vgic_ap0r[i] = __vcpu_sys_reg(vcpu, ICH_AP0RN(i));
		s_cpu_if->vgic_ap1r[i] = __vcpu_sys_reg(vcpu, ICH_AP1RN(i));
	}

	vgic_v3_create_shadow_lr(vcpu, s_cpu_if);
}

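/*
 * L2 load: build the shadow state from the L1 registers and program it
 * into the hardware (VMCR, APRs, traps and LRs).
 */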
void vgic_v3_load_nested(struct kvm_vcpu *vcpu)
{
	struct shadow_if *shadow_if = get_shadow_if();
	struct vgic_v3_cpu_if *cpu_if = &shadow_if->cpuif;

	BUG_ON(!vgic_state_is_nested(vcpu));

	vgic_v3_create_shadow_state(vcpu, cpu_if);

	__vgic_v3_restore_vmcr_aprs(cpu_if);
	__vgic_v3_activate_traps(cpu_if);

	for (int i = 0; i < cpu_if->used_lrs; i++)
		__gic_v3_set_lr(cpu_if->vgic_lr[i], i);

	/*
	 * Propagate the number of used LRs for the benefit of the HYP
	 * GICv3 emulation code. Yes, this is a pretty sorry hack.
	 */
	vcpu->arch.vgic_cpu.vgic_v3.used_lrs = cpu_if->used_lrs;
}

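/*
 * L2 put: the counterpart of vgic_v3_load_nested(). Save the APRs back
 * into the L1 registers, clear the hardware LRs that were in use, and
 * drop the shadow trap configuration.
 */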
void vgic_v3_put_nested(struct kvm_vcpu *vcpu)
{
	struct shadow_if *shadow_if = get_shadow_if();
	struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif;
	int i;

	__vgic_v3_save_aprs(s_cpu_if);

	for (i = 0; i < 4; i++) {
		__vcpu_assign_sys_reg(vcpu, ICH_AP0RN(i), s_cpu_if->vgic_ap0r[i]);
		__vcpu_assign_sys_reg(vcpu, ICH_AP1RN(i), s_cpu_if->vgic_ap1r[i]);
	}

	for (i = 0; i < s_cpu_if->used_lrs; i++)
		__gic_v3_set_lr(0, i);

	__vgic_v3_deactivate_traps(s_cpu_if);

	vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0;
}

/*
 * If we exit a L2 VM with a pending maintenance interrupt from the GIC,
 * then we need to forward this to L1 so that it can re-sync the appropriate
 * LRs and sample level triggered interrupts again.
 */
void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu)
{
	bool state = read_sysreg_s(SYS_ICH_MISR_EL2);

	/* This will force a switch back to L1 if the level is high */
	kvm_vgic_inject_irq(vcpu->kvm, vcpu,
			    vcpu->kvm->arch.vgic.mi_intid, state, vcpu);

	sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0);
}

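/*
 * Recompute the level of the (fully virtual) L1 maintenance interrupt:
 * it is high when the L1 vgic is enabled (ICH_HCR_EL2.En) and the
 * emulated ICH_MISR_EL2 value is non-zero.
 */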
void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu)
{
	bool level;

	level = (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En) && vgic_v3_get_misr(vcpu);
	kvm_vgic_inject_irq(vcpu->kvm, vcpu,
			    vcpu->kvm->arch.vgic.mi_intid, level, vcpu);
}
408