xref: /linux/arch/arm64/kvm/nested.c (revision 751d041a13bdc9d72bf7efdc86224da1174ff31d)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2017 - Columbia University and Linaro Ltd.
4  * Author: Jintack Lim <jintack.lim@linaro.org>
5  */
6 
7 #include <linux/bitfield.h>
8 #include <linux/kvm.h>
9 #include <linux/kvm_host.h>
10 
11 #include <asm/fixmap.h>
12 #include <asm/kvm_arm.h>
13 #include <asm/kvm_emulate.h>
14 #include <asm/kvm_mmu.h>
15 #include <asm/kvm_nested.h>
16 #include <asm/sysreg.h>
17 
18 #include "sys_regs.h"
19 
/*
 * Pseudo-TLB entry hanging off vcpu->arch.vncr_tlb. The invalidation
 * helpers in this file match entries by VA (gva) or by IPA (wr.pa).
 */
struct vncr_tlb {
	/* The guest's VNCR_EL2 */
	u64			gva;
	/*
	 * Stage-1 walk info/result; the invalidation helpers consult
	 * wi.pgshift, wr.level, wr.pa, wr.nG and wr.asid.
	 */
	struct s1_walk_info	wi;
	struct s1_walk_result	wr;

	/* NOTE(review): presumably the host PA backing the page - confirm */
	u64			hpa;

	/* -1 when not mapped on a CPU */
	int			cpu;

	/*
	 * true if the TLB is valid. Can only be changed with the
	 * mmu_lock held.
	 */
	bool			valid;
};
37 
/*
 * Ratio of live shadow S2 MMU per vcpu. This is a trade-off between
 * memory usage and potential number of different sets of S2 PTs in
 * the guests. Running out of S2 MMUs only affects performance (we
 * will invalidate them more often).
 *
 * kvm_vcpu_init_nested() sizes the nested_mmus array as the number of
 * online vcpus multiplied by this value.
 */
#define S2_MMU_PER_VCPU		2
45 
46 void kvm_init_nested(struct kvm *kvm)
47 {
48 	kvm->arch.nested_mmus = NULL;
49 	kvm->arch.nested_mmus_size = 0;
50 	atomic_set(&kvm->arch.vncr_map_count, 0);
51 }
52 
/*
 * Initialise one shadow stage-2 MMU.
 *
 * Returns whatever kvm_init_stage2_mmu() returns.
 */
static int init_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
{
	/*
	 * Only the canonical MMU gets the IPA range that forms the
	 * contract between KVM and userspace about where the "hardware"
	 * lives in the IPA space (this matters, for example, for the
	 * validity of MMIO exits forwarded to userspace).
	 *
	 * A nested S2 instead uses the PARange exposed to the guest: the
	 * guest hypervisor may lay out memory for its own guests however
	 * it likes, exactly as it could on real HW.
	 */
	unsigned long guest_pa_bits = kvm_get_pa_bits(kvm);

	return kvm_init_stage2_mmu(kvm, mmu, guest_pa_bits);
}
67 
/*
 * Set up the nested virtualisation state needed by @vcpu: its VNCR
 * backing page and enough shadow stage-2 MMUs for the VM
 * (S2_MMU_PER_VCPU per online vcpu).
 *
 * Returns 0 on success, -EINVAL if E2H0 was requested without the
 * ARM64_HAS_HCR_NV1 capability, -ENOMEM on allocation failure, or the
 * error reported by init_nested_s2_mmu().
 */
int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvm_s2_mmu *tmp;
	int num_mmus, ret = 0;

	if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features) &&
	    !cpus_have_final_cap(ARM64_HAS_HCR_NV1))
		return -EINVAL;

	/* Allocate the zeroed VNCR page only once; re-init reuses it */
	if (!vcpu->arch.ctxt.vncr_array)
		vcpu->arch.ctxt.vncr_array = (u64 *)__get_free_page(GFP_KERNEL_ACCOUNT |
								    __GFP_ZERO);

	if (!vcpu->arch.ctxt.vncr_array)
		return -ENOMEM;

	/*
	 * Let's treat memory allocation failures as benign: If we fail to
	 * allocate anything, return an error and keep the allocated array
	 * alive. Userspace may try to recover by initializing the vcpu
	 * again, and there is no reason to affect the whole VM for this.
	 */
	num_mmus = atomic_read(&kvm->online_vcpus) * S2_MMU_PER_VCPU;

	if (num_mmus > kvm->arch.nested_mmus_size) {
		/* Grow into a fresh zeroed array, then swap it in under the lock */
		tmp = kvcalloc(num_mmus, sizeof(*tmp), GFP_KERNEL_ACCOUNT);
		if (!tmp)
			return -ENOMEM;

		write_lock(&kvm->mmu_lock);

		if (kvm->arch.nested_mmus_size) {
			memcpy(tmp, kvm->arch.nested_mmus,
			       size_mul(sizeof(*tmp), kvm->arch.nested_mmus_size));

			/* The copied page tables must point back at their new home */
			for (int i = 0; i < kvm->arch.nested_mmus_size; i++)
				tmp[i].pgt->mmu = &tmp[i];
		}

		/* After the swap, tmp holds the old array (or NULL) */
		swap(kvm->arch.nested_mmus, tmp);

		write_unlock(&kvm->mmu_lock);

		kvfree(tmp);
	}

	/* Initialise only the newly added entries, stopping at the first error */
	for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++)
		ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]);

	if (ret) {
		/* Undo the new entries; nested_mmus_size still has its old value */
		for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++)
			kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]);

		free_page((unsigned long)vcpu->arch.ctxt.vncr_array);
		vcpu->arch.ctxt.vncr_array = NULL;

		return ret;
	}

	kvm->arch.nested_mmus_size = num_mmus;

	return 0;
}
132 
/*
 * Parameters of a guest stage-2 walk, as filled in by setup_s2_walk()
 * from the guest hypervisor's registers.
 */
struct s2_walk_info {
	/* Value of VTTBR_EL2; the walker masks it to get the table base */
	u64		baddr;
	/* min() of the KVM IPA limit and the size encoded by VTCR_EL2.PS */
	unsigned int	max_oa_bits;
	/* log2 of the granule size, derived from VTCR_EL2.TG0 */
	unsigned int	pgshift;
	/* VTCR_EL2.SL0 */
	unsigned int	sl;
	/* VTCR_EL2.T0SZ; the input address size is 64 - t0sz */
	unsigned int	t0sz;
	/* SCTLR_EL2.EE: descriptors are big-endian */
	bool		be;
	/* VTCR_EL2.HA: the walker sets the access flag itself */
	bool		ha;
};
142 
143 static u32 compute_fsc(int level, u32 fsc)
144 {
145 	return fsc | (level & 0x3);
146 }
147 
148 static int esr_s2_fault(struct kvm_vcpu *vcpu, int level, u32 fsc)
149 {
150 	u32 esr;
151 
152 	esr = kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC;
153 	esr |= compute_fsc(level, fsc);
154 	return esr;
155 }
156 
157 static int get_ia_size(struct s2_walk_info *wi)
158 {
159 	return 64 - wi->t0sz;
160 }
161 
162 static int check_base_s2_limits(struct kvm_vcpu *vcpu, struct s2_walk_info *wi,
163 				int level, int input_size, int stride)
164 {
165 	int start_size, pa_max;
166 
167 	pa_max = kvm_get_pa_bits(vcpu->kvm);
168 
169 	/* Check translation limits */
170 	switch (BIT(wi->pgshift)) {
171 	case SZ_64K:
172 		if (level == 0 || (level == 1 && pa_max <= 42))
173 			return -EFAULT;
174 		break;
175 	case SZ_16K:
176 		if (level == 0 || (level == 1 && pa_max <= 40))
177 			return -EFAULT;
178 		break;
179 	case SZ_4K:
180 		if (level < 0 || (level == 0 && pa_max <= 42))
181 			return -EFAULT;
182 		break;
183 	}
184 
185 	/* Check input size limits */
186 	if (input_size > pa_max)
187 		return -EFAULT;
188 
189 	/* Check number of entries in starting level table */
190 	start_size = input_size - ((3 - level) * stride + wi->pgshift);
191 	if (start_size < 1 || start_size > stride + 4)
192 		return -EFAULT;
193 
194 	return 0;
195 }
196 
197 /* Check if output is within boundaries */
198 static int check_output_size(struct s2_walk_info *wi, phys_addr_t output)
199 {
200 	unsigned int output_size = wi->max_oa_bits;
201 
202 	if (output_size != 48 && (output & GENMASK_ULL(47, output_size)))
203 		return -1;
204 
205 	return 0;
206 }
207 
208 static int read_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 *desc,
209 			      struct s2_walk_info *wi)
210 {
211 	u64 val;
212 	int r;
213 
214 	r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val));
215 	if (r)
216 		return r;
217 
218 	/*
219 	 * Handle reversedescriptors if endianness differs between the
220 	 * host and the guest hypervisor.
221 	 */
222 	if (wi->be)
223 		*desc = be64_to_cpu((__force __be64)val);
224 	else
225 		*desc = le64_to_cpu((__force __le64)val);
226 
227 	return 0;
228 }
229 
230 static int swap_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 old, u64 new,
231 			      struct s2_walk_info *wi)
232 {
233 	if (wi->be) {
234 		old = (__force u64)cpu_to_be64(old);
235 		new = (__force u64)cpu_to_be64(new);
236 	} else {
237 		old = (__force u64)cpu_to_le64(old);
238 		new = (__force u64)cpu_to_le64(new);
239 	}
240 
241 	return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
242 }
243 
/*
 * This is essentially a C-version of the pseudo code from the ARM ARM
 * AArch64.TranslationTableWalk  function.  I strongly recommend looking at
 * that pseudocode in trying to understand this.
 *
 * Walks the guest hypervisor's stage-2 tables described by @wi to
 * translate @ipa, filling @out.
 *
 * Returns 0 on success (output, block_size, readable, writable, level
 * and desc are set), 1 when the walk hit a fault that is described by
 * out->esr, or a negative error code (-EFAULT for an unusable
 * configuration, the read error for a failed descriptor fetch, -EAGAIN
 * when the descriptor update asks for a retry).
 *
 * Must be called with the kvm->srcu read lock held
 */
static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa,
			      struct s2_walk_info *wi, struct kvm_s2_trans *out)
{
	int first_block_level, level, stride, input_size, base_lower_bound;
	phys_addr_t base_addr;
	unsigned int addr_top, addr_bottom;
	u64 desc, new_desc;  /* page table entry */
	int ret;
	phys_addr_t paddr;

	/* Starting level (from SL0) and first level allowed to hold a block */
	switch (BIT(wi->pgshift)) {
	default:
	case SZ_64K:
	case SZ_16K:
		level = 3 - wi->sl;
		first_block_level = 2;
		break;
	case SZ_4K:
		level = 2 - wi->sl;
		first_block_level = 1;
		break;
	}

	/* Address bits resolved per level: a granule of 8-byte descriptors */
	stride = wi->pgshift - 3;
	input_size = get_ia_size(wi);
	if (input_size > 48 || input_size < 25)
		return -EFAULT;

	ret = check_base_s2_limits(vcpu, wi, level, input_size, stride);
	if (WARN_ON(ret)) {
		out->esr = compute_fsc(0, ESR_ELx_FSC_FAULT);
		return ret;
	}

	/* Alignment of the starting table depends on how many entries it has */
	base_lower_bound = 3 + input_size - ((3 - level) * stride +
			   wi->pgshift);
	base_addr = wi->baddr & GENMASK_ULL(47, base_lower_bound);

	if (check_output_size(wi, base_addr)) {
		/* R_BFHQH */
		out->esr = compute_fsc(0, ESR_ELx_FSC_ADDRSZ);
		return 1;
	}

	addr_top = input_size - 1;

	while (1) {
		phys_addr_t index;

		/* Byte offset of the descriptor selected by IPA[addr_top:addr_bottom] */
		addr_bottom = (3 - level) * stride + wi->pgshift;
		index = (ipa & GENMASK_ULL(addr_top, addr_bottom))
			>> (addr_bottom - 3);

		paddr = base_addr | index;
		ret = read_guest_s2_desc(vcpu, paddr, &desc, wi);
		if (ret < 0) {
			out->esr = ESR_ELx_FSC_SEA_TTW(level);
			return ret;
		}

		new_desc = desc;

		/* Check for valid descriptor at this point */
		if (!(desc & KVM_PTE_VALID)) {
			out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
			out->desc = desc;
			return 1;
		}

		if (FIELD_GET(KVM_PTE_TYPE, desc) == KVM_PTE_TYPE_BLOCK) {
			/* A block terminates the walk; at level 3 this encoding faults */
			if (level < 3)
				break;

			out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
			out->desc = desc;
			return 1;
		}

		/* We're at the final level */
		if (level == 3)
			break;

		/* Table descriptor: validate the next-level address and descend */
		if (check_output_size(wi, desc)) {
			out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
			out->desc = desc;
			return 1;
		}

		base_addr = desc & GENMASK_ULL(47, wi->pgshift);

		level += 1;
		addr_top = addr_bottom - 1;
	}

	/* A block found above the first level that may hold one is a fault */
	if (level < first_block_level) {
		out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
		out->desc = desc;
		return 1;
	}

	if (check_output_size(wi, desc)) {
		out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
		out->desc = desc;
		return 1;
	}

	/* With HA set, the access flag is set on behalf of the guest */
	if (wi->ha)
		new_desc |= KVM_PTE_LEAF_ATTR_LO_S2_AF;

	if (new_desc != desc) {
		ret = swap_guest_s2_desc(vcpu, paddr, desc, new_desc, wi);
		if (ret == -EAGAIN)
			return ret;
		if (ret) {
			out->esr = ESR_ELx_FSC_SEA_TTW(level);
			out->desc = desc;
			return 1;
		}

		desc = new_desc;
	}

	/* Access flag still clear: report an access flag fault */
	if (!(desc & KVM_PTE_LEAF_ATTR_LO_S2_AF)) {
		out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS);
		out->desc = desc;
		return 1;
	}

	/* NOTE(review): contiguous_bit_shift() is defined outside this view */
	addr_bottom += contiguous_bit_shift(desc, wi, level);

	/* Calculate and return the result */
	paddr = (desc & GENMASK_ULL(47, addr_bottom)) |
		(ipa & GENMASK_ULL(addr_bottom - 1, 0));
	out->output = paddr;
	out->block_size = 1UL << ((3 - level) * stride + wi->pgshift);
	out->readable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
	out->writable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
	out->level = level;
	out->desc = desc;
	return 0;
}
392 
/*
 * Evaluate to true if the ID_AA64MMFR0_EL1 value @__r advertises the
 * @__sz (4, 16 or 64) kB granule at stage-2: either the TGRAN<sz>_2
 * field holds a value other than "NI" and "as TGRAN<sz>", or it holds
 * "as TGRAN<sz>" and the stage-1 TGRAN<sz> field is not "NI".
 */
#define _has_tgran_2(__r, __sz)						\
	({								\
		u64 _s1, _s2, _mmfr0 = __r;				\
									\
		_s2 = SYS_FIELD_GET(ID_AA64MMFR0_EL1,			\
				    TGRAN##__sz##_2, _mmfr0);		\
									\
		_s1 = SYS_FIELD_GET(ID_AA64MMFR0_EL1,			\
				    TGRAN##__sz, _mmfr0);		\
									\
		((_s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_NI &&		\
		  _s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz) || \
		 (_s2 == ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz && \
		  _s1 != ID_AA64MMFR0_EL1_TGRAN##__sz##_NI));		\
	})
408 
409 static bool has_tgran_2(u64 mmfr0, unsigned int shift)
410 {
411 	switch (shift) {
412 	case 12:
413 		return _has_tgran_2(mmfr0, 4);
414 	case 14:
415 		return _has_tgran_2(mmfr0, 16);
416 	case 16:
417 		return _has_tgran_2(mmfr0, 64);
418 	default:
419 		BUG();
420 	}
421 }
422 
423 static unsigned int fallback_tgran2_shift(u64 mmfr0)
424 {
425 	if (has_tgran_2(mmfr0, PAGE_SHIFT))
426 		return PAGE_SHIFT;
427 	else if (has_tgran_2(mmfr0, 12))
428 		return 12;
429 	else if (has_tgran_2(mmfr0, 14))
430 		return 14;
431 	else if (has_tgran_2(mmfr0, 16))
432 		return 16;
433 	else
434 		return PAGE_SHIFT;
435 }
436 
437 static unsigned int vtcr_to_tg0_pgshift(struct kvm *kvm, u64 vtcr)
438 {
439 	u64 tg0 = FIELD_GET(VTCR_EL2_TG0_MASK, vtcr);
440 	u64 mmfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64MMFR0_EL1);
441 	unsigned int shift;
442 
443 	switch (tg0) {
444 	case VTCR_EL2_TG0_4K:
445 		shift = 12;
446 		break;
447 	case VTCR_EL2_TG0_16K:
448 		shift = 14;
449 		break;
450 	case VTCR_EL2_TG0_64K:
451 	/* IMPDEF: treat any other value as 64k, subject to fallback */
452 	default:
453 		shift = 16;
454 	}
455 
456 	/*
457 	 * If TGx is programmed to an unimplemented value (not advertised in
458 	 * ID_AA64MMFR0_EL1), we should treat it as if an implemented value is
459 	 * written, as per the architecture. Choose an available one while
460 	 * prioritizing PAGE_SIZE.
461 	 */
462 	if (!has_tgran_2(mmfr0, shift))
463 		return fallback_tgran2_shift(mmfr0);
464 
465 	return shift;
466 }
467 
468 static size_t vtcr_to_tg0_pgsize(struct kvm *kvm, u64 vtcr)
469 {
470 	return BIT(vtcr_to_tg0_pgshift(kvm, vtcr));
471 }
472 
473 static void setup_s2_walk(struct kvm_vcpu *vcpu, struct s2_walk_info *wi)
474 {
475 	u64 vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
476 
477 	wi->baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
478 	wi->t0sz = vtcr & VTCR_EL2_T0SZ_MASK;
479 	wi->pgshift = vtcr_to_tg0_pgshift(vcpu->kvm, vtcr);
480 	wi->sl = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
481 	/* Global limit for now, should eventually be per-VM */
482 	wi->max_oa_bits = min(get_kvm_ipa_limit(),
483 			      ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr), false));
484 	wi->ha = vtcr & VTCR_EL2_HA;
485 	wi->be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE;
486 }
487 
488 int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
489 		       struct kvm_s2_trans *result)
490 {
491 	struct s2_walk_info wi;
492 	int ret;
493 
494 	result->esr = 0;
495 
496 	if (!vcpu_has_nv(vcpu))
497 		return 0;
498 
499 	setup_s2_walk(vcpu, &wi);
500 
501 	ret = walk_nested_s2_pgd(vcpu, gipa, &wi, result);
502 	if (ret)
503 		result->esr |= (kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC);
504 
505 	return ret;
506 }
507 
508 static unsigned int ttl_to_size(u8 ttl)
509 {
510 	int level = ttl & 3;
511 	int gran = (ttl >> 2) & 3;
512 	unsigned int max_size = 0;
513 
514 	switch (gran) {
515 	case TLBI_TTL_TG_4K:
516 		switch (level) {
517 		case 0:
518 			break;
519 		case 1:
520 			max_size = SZ_1G;
521 			break;
522 		case 2:
523 			max_size = SZ_2M;
524 			break;
525 		case 3:
526 			max_size = SZ_4K;
527 			break;
528 		}
529 		break;
530 	case TLBI_TTL_TG_16K:
531 		switch (level) {
532 		case 0:
533 		case 1:
534 			break;
535 		case 2:
536 			max_size = SZ_32M;
537 			break;
538 		case 3:
539 			max_size = SZ_16K;
540 			break;
541 		}
542 		break;
543 	case TLBI_TTL_TG_64K:
544 		switch (level) {
545 		case 0:
546 		case 1:
547 			/* No 52bit IPA support */
548 			break;
549 		case 2:
550 			max_size = SZ_512M;
551 			break;
552 		case 3:
553 			max_size = SZ_64K;
554 			break;
555 		}
556 		break;
557 	default:			/* No size information */
558 		break;
559 	}
560 
561 	return max_size;
562 }
563 
564 static u8 pgshift_level_to_ttl(u16 shift, u8 level)
565 {
566 	u8 ttl;
567 
568 	switch(shift) {
569 	case 12:
570 		ttl = TLBI_TTL_TG_4K;
571 		break;
572 	case 14:
573 		ttl = TLBI_TTL_TG_16K;
574 		break;
575 	case 16:
576 		ttl = TLBI_TTL_TG_64K;
577 		break;
578 	default:
579 		BUG();
580 	}
581 
582 	ttl <<= 2;
583 	ttl |= level & 3;
584 
585 	return ttl;
586 }
587 
/*
 * Compute the equivalent of the TTL field by parsing the shadow PT.  The
 * granule size is extracted from the cached VTCR_EL2.TG0 while the level is
 * retrieved from first entry carrying the level as a tag.
 *
 * Returns the TTL, or 0 if no usable level information was found for
 * @addr. Must be called with the mmu_lock held for write.
 */
static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr)
{
	size_t tg0_size = vtcr_to_tg0_pgsize(kvm_s2_mmu_to_kvm(mmu), mmu->tlb_vtcr);
	u64 tmp, sz = 0;
	kvm_pte_t pte;
	u8 ttl, level;

	lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(mmu)->mmu_lock);

	/* Start with the granule part of the TTL; the level is OR-ed in later */
	switch (tg0_size) {
	case SZ_4K:
		ttl = (TLBI_TTL_TG_4K << 2);
		break;
	case SZ_16K:
		ttl = (TLBI_TTL_TG_16K << 2);
		break;
	case SZ_64K:
	default:	    /* IMPDEF: treat any other value as 64k */
		ttl = (TLBI_TTL_TG_64K << 2);
		break;
	}

	tmp = addr;

again:
	/* Iteratively compute the block sizes for a particular granule size */
	switch (tg0_size) {
	case SZ_4K:
		if	(sz < SZ_4K)	sz = SZ_4K;
		else if (sz < SZ_2M)	sz = SZ_2M;
		else if (sz < SZ_1G)	sz = SZ_1G;
		else			sz = 0;
		break;
	case SZ_16K:
		if	(sz < SZ_16K)	sz = SZ_16K;
		else if (sz < SZ_32M)	sz = SZ_32M;
		else			sz = 0;
		break;
	case SZ_64K:
	default:	    /* IMPDEF: treat any other value as 64k */
		if	(sz < SZ_64K)	sz = SZ_64K;
		else if (sz < SZ_512M)	sz = SZ_512M;
		else			sz = 0;
		break;
	}

	/* Every block size has been tried without finding a tagged entry */
	if (sz == 0)
		return 0;

	/* Align down to the candidate size; retry with the next size on a miss */
	tmp &= ~(sz - 1);
	if (kvm_pgtable_get_leaf(mmu->pgt, tmp, &pte, NULL))
		goto again;
	if (!(pte & PTE_VALID))
		goto again;
	level = FIELD_GET(KVM_NV_GUEST_MAP_SZ, pte);
	if (!level)
		goto again;

	ttl |= level;

	/*
	 * We now have found some level information in the shadow S2. Check
	 * that the resulting range is actually including the original IPA.
	 */
	sz = ttl_to_size(ttl);
	if (addr < (tmp + sz))
		return ttl;

	return 0;
}
663 
664 unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val)
665 {
666 	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
667 	unsigned long max_size;
668 	u8 ttl;
669 
670 	ttl = FIELD_GET(TLBI_TTL_MASK, val);
671 
672 	if (!ttl || !kvm_has_feat(kvm, ID_AA64MMFR2_EL1, TTL, IMP)) {
673 		/* No TTL, check the shadow S2 for a hint */
674 		u64 addr = (val & GENMASK_ULL(35, 0)) << 12;
675 		ttl = get_guest_mapping_ttl(mmu, addr);
676 	}
677 
678 	max_size = ttl_to_size(ttl);
679 
680 	if (!max_size) {
681 		/* Compute the maximum extent of the invalidation */
682 		switch (vtcr_to_tg0_pgsize(kvm, mmu->tlb_vtcr)) {
683 		case SZ_4K:
684 			max_size = SZ_1G;
685 			break;
686 		case SZ_16K:
687 			max_size = SZ_32M;
688 			break;
689 		case SZ_64K:
690 		default:    /* IMPDEF: treat any other value as 64k */
691 			/*
692 			 * No, we do not support 52bit IPA in nested yet. Once
693 			 * we do, this should be 4TB.
694 			 */
695 			max_size = SZ_512M;
696 			break;
697 		}
698 	}
699 
700 	WARN_ON(!max_size);
701 	return max_size;
702 }
703 
704 /*
705  * We can have multiple *different* MMU contexts with the same VMID:
706  *
707  * - S2 being enabled or not, hence differing by the HCR_EL2.VM bit
708  *
 * - Multiple vcpus using private S2s (huh huh...), hence differing by the
 *   VTTBR_EL2.BADDR address
711  *
712  * - A combination of the above...
713  *
714  * We can always identify which MMU context to pick at run-time.  However,
715  * TLB invalidation involving a VMID must take action on all the TLBs using
716  * this particular VMID. This translates into applying the same invalidation
717  * operation to all the contexts that are using this VMID. Moar phun!
718  */
719 void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid,
720 				const union tlbi_info *info,
721 				void (*tlbi_callback)(struct kvm_s2_mmu *,
722 						      const union tlbi_info *))
723 {
724 	write_lock(&kvm->mmu_lock);
725 
726 	for (int i = 0; i < kvm->arch.nested_mmus_size; i++) {
727 		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
728 
729 		if (!kvm_s2_mmu_valid(mmu))
730 			continue;
731 
732 		if (vmid == get_vmid(mmu->tlb_vttbr))
733 			tlbi_callback(mmu, info);
734 	}
735 
736 	write_unlock(&kvm->mmu_lock);
737 }
738 
739 struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu)
740 {
741 	struct kvm *kvm = vcpu->kvm;
742 	bool nested_stage2_enabled;
743 	u64 vttbr, vtcr, hcr;
744 
745 	lockdep_assert_held_write(&kvm->mmu_lock);
746 
747 	vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
748 	vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
749 	hcr = vcpu_read_sys_reg(vcpu, HCR_EL2);
750 
751 	nested_stage2_enabled = hcr & HCR_VM;
752 
753 	/* Don't consider the CnP bit for the vttbr match */
754 	vttbr &= ~VTTBR_CNP_BIT;
755 
756 	/*
757 	 * Two possibilities when looking up a S2 MMU context:
758 	 *
759 	 * - either S2 is enabled in the guest, and we need a context that is
760 	 *   S2-enabled and matches the full VTTBR (VMID+BADDR) and VTCR,
761 	 *   which makes it safe from a TLB conflict perspective (a broken
762 	 *   guest won't be able to generate them),
763 	 *
764 	 * - or S2 is disabled, and we need a context that is S2-disabled
765 	 *   and matches the VMID only, as all TLBs are tagged by VMID even
766 	 *   if S2 translation is disabled.
767 	 */
768 	for (int i = 0; i < kvm->arch.nested_mmus_size; i++) {
769 		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
770 
771 		if (!kvm_s2_mmu_valid(mmu))
772 			continue;
773 
774 		if (nested_stage2_enabled &&
775 		    mmu->nested_stage2_enabled &&
776 		    vttbr == mmu->tlb_vttbr &&
777 		    vtcr == mmu->tlb_vtcr)
778 			return mmu;
779 
780 		if (!nested_stage2_enabled &&
781 		    !mmu->nested_stage2_enabled &&
782 		    get_vmid(vttbr) == get_vmid(mmu->tlb_vttbr))
783 			return mmu;
784 	}
785 	return NULL;
786 }
787 
/*
 * Return a shadow stage-2 MMU for the vcpu's current guest S2 context,
 * taking a reference on it. An existing matching context is reused;
 * otherwise an unreferenced one is recycled and re-keyed.
 *
 * Must be called with the mmu_lock held for write.
 */
static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvm_s2_mmu *s2_mmu;
	int i;

	lockdep_assert_held_write(&vcpu->kvm->mmu_lock);

	s2_mmu = lookup_s2_mmu(vcpu);
	if (s2_mmu)
		goto out;

	/*
	 * Make sure we don't always search from the same point, or we
	 * will always reuse a potentially active context, leaving
	 * free contexts unused.
	 */
	for (i = kvm->arch.nested_mmus_next;
	     i < (kvm->arch.nested_mmus_size + kvm->arch.nested_mmus_next);
	     i++) {
		/* Wrap around so every slot is visited exactly once */
		s2_mmu = &kvm->arch.nested_mmus[i % kvm->arch.nested_mmus_size];

		if (atomic_read(&s2_mmu->refcnt) == 0)
			break;
	}
	BUG_ON(atomic_read(&s2_mmu->refcnt)); /* We have struct MMUs to spare */

	/* Set the scene for the next search */
	kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size;

	/* Make sure we don't forget to do the laundry */
	if (kvm_s2_mmu_valid(s2_mmu)) {
		/* The recycled context held mappings: flag them for unmapping */
		kvm_nested_s2_ptdump_remove_debugfs(s2_mmu);
		s2_mmu->pending_unmap = true;
	}

	/*
	 * The virtual VMID (modulo CnP) will be used as a key when matching
	 * an existing kvm_s2_mmu.
	 *
	 * We cache VTCR at allocation time, once and for all. It'd be great
	 * if the guest didn't screw that one up, as this is not very
	 * forgiving...
	 */
	s2_mmu->tlb_vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2) & ~VTTBR_CNP_BIT;
	s2_mmu->tlb_vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
	s2_mmu->nested_stage2_enabled = vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_VM;

	kvm_nested_s2_ptdump_create_debugfs(s2_mmu);

out:
	/* Both the reuse and the recycle paths take a reference here */
	atomic_inc(&s2_mmu->refcnt);

	/*
	 * Set the vCPU request to perform an unmap, even if the pending unmap
	 * originates from another vCPU. This guarantees that the MMU has been
	 * completely unmapped before any vCPU actually uses it, and allows
	 * multiple vCPUs to lend a hand with completing the unmap.
	 */
	if (s2_mmu->pending_unmap)
		kvm_make_request(KVM_REQ_NESTED_S2_UNMAP, vcpu);

	return s2_mmu;
}
852 
853 void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
854 {
855 	/* CnP being set denotes an invalid entry */
856 	mmu->tlb_vttbr = VTTBR_CNP_BIT;
857 	mmu->nested_stage2_enabled = false;
858 	atomic_set(&mmu->refcnt, 0);
859 }
860 
861 void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
862 {
863 	/*
864 	 * If the vCPU kept its reference on the MMU after the last put,
865 	 * keep rolling with it.
866 	 */
867 	if (is_hyp_ctxt(vcpu)) {
868 		if (!vcpu->arch.hw_mmu)
869 			vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
870 	} else {
871 		if (!vcpu->arch.hw_mmu) {
872 			scoped_guard(write_lock, &vcpu->kvm->mmu_lock)
873 				vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu);
874 		}
875 
876 		if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV)
877 			kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu);
878 	}
879 }
880 
881 static void this_cpu_reset_vncr_fixmap(struct kvm_vcpu *vcpu)
882 {
883 	if (!host_data_test_flag(L1_VNCR_MAPPED))
884 		return;
885 
886 	BUG_ON(vcpu->arch.vncr_tlb->cpu != smp_processor_id());
887 	BUG_ON(is_hyp_ctxt(vcpu));
888 
889 	clear_fixmap(vncr_fixmap(vcpu->arch.vncr_tlb->cpu));
890 	vcpu->arch.vncr_tlb->cpu = -1;
891 	host_data_clear_flag(L1_VNCR_MAPPED);
892 	atomic_dec(&vcpu->kvm->arch.vncr_map_count);
893 }
894 
895 void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu)
896 {
897 	/* Unconditionally drop the VNCR mapping if we have one */
898 	this_cpu_reset_vncr_fixmap(vcpu);
899 
900 	/*
901 	 * Keep a reference on the associated stage-2 MMU if the vCPU is
902 	 * scheduling out and not in WFI emulation, suggesting it is likely to
903 	 * reuse the MMU sometime soon.
904 	 */
905 	if (vcpu->scheduled_out && !vcpu_get_flag(vcpu, IN_WFI))
906 		return;
907 
908 	if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu))
909 		atomic_dec(&vcpu->arch.hw_mmu->refcnt);
910 
911 	vcpu->arch.hw_mmu = NULL;
912 }
913 
914 /*
915  * Returns non-zero if permission fault is handled by injecting it to the next
916  * level hypervisor.
917  */
918 int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans)
919 {
920 	bool forward_fault = false;
921 
922 	trans->esr = 0;
923 
924 	if (!kvm_vcpu_trap_is_permission_fault(vcpu))
925 		return 0;
926 
927 	if (kvm_vcpu_trap_is_iabt(vcpu)) {
928 		if (vcpu_mode_priv(vcpu))
929 			forward_fault = !kvm_s2_trans_exec_el1(vcpu->kvm, trans);
930 		else
931 			forward_fault = !kvm_s2_trans_exec_el0(vcpu->kvm, trans);
932 	} else {
933 		bool write_fault = kvm_is_write_fault(vcpu);
934 
935 		forward_fault = ((write_fault && !trans->writable) ||
936 				 (!write_fault && !trans->readable));
937 	}
938 
939 	if (forward_fault)
940 		trans->esr = esr_s2_fault(vcpu, trans->level, ESR_ELx_FSC_PERM);
941 
942 	return forward_fault;
943 }
944 
945 int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2)
946 {
947 	vcpu_write_sys_reg(vcpu, vcpu->arch.fault.far_el2, FAR_EL2);
948 	vcpu_write_sys_reg(vcpu, vcpu->arch.fault.hpfar_el2, HPFAR_EL2);
949 
950 	return kvm_inject_nested_sync(vcpu, esr_el2);
951 }
952 
953 u16 get_asid_by_regime(struct kvm_vcpu *vcpu, enum trans_regime regime)
954 {
955 	enum vcpu_sysreg ttbr_elx;
956 	u64 tcr;
957 	u16 asid;
958 
959 	switch (regime) {
960 	case TR_EL10:
961 		tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
962 		ttbr_elx = (tcr & TCR_A1) ? TTBR1_EL1 : TTBR0_EL1;
963 		break;
964 	case TR_EL20:
965 		tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
966 		ttbr_elx = (tcr & TCR_A1) ? TTBR1_EL2 : TTBR0_EL2;
967 		break;
968 	default:
969 		BUG();
970 	}
971 
972 	asid = FIELD_GET(TTBRx_EL1_ASID, vcpu_read_sys_reg(vcpu, ttbr_elx));
973 	if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) ||
974 	    !(tcr & TCR_ASID16))
975 		asid &= GENMASK(7, 0);
976 
977 	return asid;
978 }
979 
980 static void invalidate_vncr(struct vncr_tlb *vt)
981 {
982 	vt->valid = false;
983 	if (vt->cpu != -1)
984 		clear_fixmap(vncr_fixmap(vt->cpu));
985 }
986 
/*
 * VNCR TLB invalidation occurs from MMU notifiers or TLBI instructions, and
 * either can race against a vcpu not being onlined yet (no pseudo-TLB
 * allocated). Similarly, the TLB might be invalid.  Skip those, as they
 * obviously don't participate in the invalidation at this stage.
 *
 * Iterate over the vcpus of @kvm, running the loop body only for those
 * with an allocated and valid VNCR pseudo-TLB, which is made available
 * in @tlbp. The expansion ends in an if (), so the body must not be
 * followed by an else.
 */
#define kvm_for_each_vncr_tlb(idx, vcpup, tlbp, kvm)	\
	kvm_for_each_vcpu(idx, vcpup, kvm)		\
		if (((tlbp) = vcpup->arch.vncr_tlb) &&	\
		    (tlbp)->valid)
997 
998 static void kvm_invalidate_vncr_ipa(struct kvm *kvm, u64 start, u64 end)
999 {
1000 	struct kvm_vcpu *vcpu;
1001 	struct vncr_tlb *vt;
1002 	unsigned long i;
1003 
1004 	lockdep_assert_held_write(&kvm->mmu_lock);
1005 
1006 	if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY))
1007 		return;
1008 
1009 	kvm_for_each_vncr_tlb(i, vcpu, vt, kvm) {
1010 		u64 ipa_start, ipa_end, ipa_size;
1011 
1012 		ipa_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift,
1013 							    vt->wr.level));
1014 		ipa_start = vt->wr.pa & ~(ipa_size - 1);
1015 		ipa_end = ipa_start + ipa_size;
1016 
1017 		if (ipa_end <= start || ipa_start >= end)
1018 			continue;
1019 
1020 		invalidate_vncr(vt);
1021 	}
1022 }
1023 
/*
 * Decoded scope of a stage-1 TLBI, as produced by
 * compute_s1_tlbi_range() and consumed by invalidate_vncr_va().
 */
struct s1e2_tlbi_scope {
	enum {
		/* Matches every entry */
		TLBI_ALL,
		/* Matches on the VA range, and on the ASID for nG entries */
		TLBI_VA,
		/* Matches on the VA range only */
		TLBI_VAA,
		/* Matches nG entries with the given ASID */
		TLBI_ASID,
	} type;

	/* Only meaningful for TLBI_VA and TLBI_ASID */
	u16 asid;
	/* Start and length of the range, for TLBI_VA and TLBI_VAA */
	u64 va;
	u64 size;
};
1036 
1037 static void invalidate_vncr_va(struct kvm *kvm,
1038 			       struct s1e2_tlbi_scope *scope)
1039 {
1040 	struct kvm_vcpu *vcpu;
1041 	struct vncr_tlb *vt;
1042 	unsigned long i;
1043 
1044 	lockdep_assert_held_write(&kvm->mmu_lock);
1045 
1046 	kvm_for_each_vncr_tlb(i, vcpu, vt, kvm) {
1047 		u64 va_start, va_end, va_size;
1048 
1049 		va_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift,
1050 							   vt->wr.level));
1051 		va_start = vt->gva & ~(va_size - 1);
1052 		va_end = va_start + va_size;
1053 
1054 		switch (scope->type) {
1055 		case TLBI_ALL:
1056 			break;
1057 
1058 		case TLBI_VA:
1059 			if (va_end <= scope->va ||
1060 			    va_start >= (scope->va + scope->size))
1061 				continue;
1062 			if (vt->wr.nG && vt->wr.asid != scope->asid)
1063 				continue;
1064 			break;
1065 
1066 		case TLBI_VAA:
1067 			if (va_end <= scope->va ||
1068 			    va_start >= (scope->va + scope->size))
1069 				continue;
1070 			break;
1071 
1072 		case TLBI_ASID:
1073 			if (!vt->wr.nG || vt->wr.asid != scope->asid)
1074 				continue;
1075 			break;
1076 		}
1077 
1078 		invalidate_vncr(vt);
1079 	}
1080 }
1081 
/*
 * Convert the address field of a stage-1 TLBI operand into a virtual
 * address: shift it left by 12 and sign-extend from bit 48. The whole
 * expansion is parenthesized so it behaves as a single operand wherever
 * it is used.
 */
#define tlbi_va_s1_to_va(v)	((u64)sign_extend64((v) << 12, 48))
1083 
/*
 * Decode the stage-1 TLBI instruction @inst with operand @val into
 * @scope. An instruction that is not listed leaves @scope untouched.
 * @vcpu is not used by the decoding itself.
 */
static void compute_s1_tlbi_range(struct kvm_vcpu *vcpu, u32 inst, u64 val,
				  struct s1e2_tlbi_scope *scope)
{
	switch (inst) {
	/* Invalidate-all flavours: no address or ASID to decode */
	case OP_TLBI_ALLE2:
	case OP_TLBI_ALLE2IS:
	case OP_TLBI_ALLE2OS:
	case OP_TLBI_VMALLE1:
	case OP_TLBI_VMALLE1IS:
	case OP_TLBI_VMALLE1OS:
	case OP_TLBI_ALLE2NXS:
	case OP_TLBI_ALLE2ISNXS:
	case OP_TLBI_ALLE2OSNXS:
	case OP_TLBI_VMALLE1NXS:
	case OP_TLBI_VMALLE1ISNXS:
	case OP_TLBI_VMALLE1OSNXS:
		scope->type = TLBI_ALL;
		break;
	/* By VA and ASID */
	case OP_TLBI_VAE2:
	case OP_TLBI_VAE2IS:
	case OP_TLBI_VAE2OS:
	case OP_TLBI_VAE1:
	case OP_TLBI_VAE1IS:
	case OP_TLBI_VAE1OS:
	case OP_TLBI_VAE2NXS:
	case OP_TLBI_VAE2ISNXS:
	case OP_TLBI_VAE2OSNXS:
	case OP_TLBI_VAE1NXS:
	case OP_TLBI_VAE1ISNXS:
	case OP_TLBI_VAE1OSNXS:
	case OP_TLBI_VALE2:
	case OP_TLBI_VALE2IS:
	case OP_TLBI_VALE2OS:
	case OP_TLBI_VALE1:
	case OP_TLBI_VALE1IS:
	case OP_TLBI_VALE1OS:
	case OP_TLBI_VALE2NXS:
	case OP_TLBI_VALE2ISNXS:
	case OP_TLBI_VALE2OSNXS:
	case OP_TLBI_VALE1NXS:
	case OP_TLBI_VALE1ISNXS:
	case OP_TLBI_VALE1OSNXS:
		scope->type = TLBI_VA;
		scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val));
		/* No usable TTL hint: fall back to a 1GB range */
		if (!scope->size)
			scope->size = SZ_1G;
		scope->va = tlbi_va_s1_to_va(val) & ~(scope->size - 1);
		scope->asid = FIELD_GET(TLBIR_ASID_MASK, val);
		break;
	/* By ASID only */
	case OP_TLBI_ASIDE1:
	case OP_TLBI_ASIDE1IS:
	case OP_TLBI_ASIDE1OS:
	case OP_TLBI_ASIDE1NXS:
	case OP_TLBI_ASIDE1ISNXS:
	case OP_TLBI_ASIDE1OSNXS:
		scope->type = TLBI_ASID;
		scope->asid = FIELD_GET(TLBIR_ASID_MASK, val);
		break;
	/* By VA, all ASIDs */
	case OP_TLBI_VAAE1:
	case OP_TLBI_VAAE1IS:
	case OP_TLBI_VAAE1OS:
	case OP_TLBI_VAAE1NXS:
	case OP_TLBI_VAAE1ISNXS:
	case OP_TLBI_VAAE1OSNXS:
	case OP_TLBI_VAALE1:
	case OP_TLBI_VAALE1IS:
	case OP_TLBI_VAALE1OS:
	case OP_TLBI_VAALE1NXS:
	case OP_TLBI_VAALE1ISNXS:
	case OP_TLBI_VAALE1OSNXS:
		scope->type = TLBI_VAA;
		scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val));
		/* No usable TTL hint: fall back to a 1GB range */
		if (!scope->size)
			scope->size = SZ_1G;
		scope->va = tlbi_va_s1_to_va(val) & ~(scope->size - 1);
		break;
	/* Range by VA and ASID: base, size and ASID come from the operand */
	case OP_TLBI_RVAE2:
	case OP_TLBI_RVAE2IS:
	case OP_TLBI_RVAE2OS:
	case OP_TLBI_RVAE1:
	case OP_TLBI_RVAE1IS:
	case OP_TLBI_RVAE1OS:
	case OP_TLBI_RVAE2NXS:
	case OP_TLBI_RVAE2ISNXS:
	case OP_TLBI_RVAE2OSNXS:
	case OP_TLBI_RVAE1NXS:
	case OP_TLBI_RVAE1ISNXS:
	case OP_TLBI_RVAE1OSNXS:
	case OP_TLBI_RVALE2:
	case OP_TLBI_RVALE2IS:
	case OP_TLBI_RVALE2OS:
	case OP_TLBI_RVALE1:
	case OP_TLBI_RVALE1IS:
	case OP_TLBI_RVALE1OS:
	case OP_TLBI_RVALE2NXS:
	case OP_TLBI_RVALE2ISNXS:
	case OP_TLBI_RVALE2OSNXS:
	case OP_TLBI_RVALE1NXS:
	case OP_TLBI_RVALE1ISNXS:
	case OP_TLBI_RVALE1OSNXS:
		scope->type = TLBI_VA;
		scope->va = decode_range_tlbi(val, &scope->size, &scope->asid);
		break;
	/* Range by VA, all ASIDs */
	case OP_TLBI_RVAAE1:
	case OP_TLBI_RVAAE1IS:
	case OP_TLBI_RVAAE1OS:
	case OP_TLBI_RVAAE1NXS:
	case OP_TLBI_RVAAE1ISNXS:
	case OP_TLBI_RVAAE1OSNXS:
	case OP_TLBI_RVAALE1:
	case OP_TLBI_RVAALE1IS:
	case OP_TLBI_RVAALE1OS:
	case OP_TLBI_RVAALE1NXS:
	case OP_TLBI_RVAALE1ISNXS:
	case OP_TLBI_RVAALE1OSNXS:
		scope->type = TLBI_VAA;
		scope->va = decode_range_tlbi(val, &scope->size, NULL);
		break;
	}
}
1204 
1205 void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val)
1206 {
1207 	struct s1e2_tlbi_scope scope = {};
1208 
1209 	compute_s1_tlbi_range(vcpu, inst, val, &scope);
1210 
1211 	guard(write_lock)(&vcpu->kvm->mmu_lock);
1212 	invalidate_vncr_va(vcpu->kvm, &scope);
1213 }
1214 
1215 void kvm_nested_s2_wp(struct kvm *kvm)
1216 {
1217 	int i;
1218 
1219 	lockdep_assert_held_write(&kvm->mmu_lock);
1220 
1221 	if (!kvm->arch.nested_mmus_size)
1222 		return;
1223 
1224 	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
1225 		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
1226 
1227 		if (kvm_s2_mmu_valid(mmu))
1228 			kvm_stage2_wp_range(mmu, 0, kvm_phys_size(mmu));
1229 	}
1230 
1231 	kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
1232 }
1233 
1234 void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block)
1235 {
1236 	int i;
1237 
1238 	lockdep_assert_held_write(&kvm->mmu_lock);
1239 
1240 	if (!kvm->arch.nested_mmus_size)
1241 		return;
1242 
1243 	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
1244 		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
1245 
1246 		if (kvm_s2_mmu_valid(mmu))
1247 			kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block);
1248 	}
1249 
1250 	kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
1251 }
1252 
1253 void kvm_nested_s2_flush(struct kvm *kvm)
1254 {
1255 	int i;
1256 
1257 	lockdep_assert_held_write(&kvm->mmu_lock);
1258 
1259 	if (!kvm->arch.nested_mmus_size)
1260 		return;
1261 
1262 	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
1263 		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
1264 
1265 		if (kvm_s2_mmu_valid(mmu))
1266 			kvm_stage2_flush_range(mmu, 0, kvm_phys_size(mmu));
1267 	}
1268 }
1269 
1270 void kvm_arch_flush_shadow_all(struct kvm *kvm)
1271 {
1272 	int i;
1273 
1274 	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
1275 		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
1276 
1277 		if (!WARN_ON(atomic_read(&mmu->refcnt)))
1278 			kvm_free_stage2_pgd(mmu);
1279 	}
1280 	kvfree(kvm->arch.nested_mmus);
1281 	kvm->arch.nested_mmus = NULL;
1282 	kvm->arch.nested_mmus_size = 0;
1283 	kvm_uninit_stage2_mmu(kvm);
1284 }
1285 
1286 /*
1287  * Dealing with VNCR_EL2 exposed by the *guest* is a complicated matter:
1288  *
1289  * - We introduce an internal representation of a vcpu-private TLB,
1290  *   representing the mapping between the guest VA contained in VNCR_EL2,
1291  *   the IPA the guest's EL2 PTs point to, and the actual PA this lives at.
1292  *
1293  * - On translation fault from a nested VNCR access, we create such a TLB.
1294  *   If there is no mapping to describe, the guest inherits the fault.
1295  *   Crucially, no actual mapping is done at this stage.
1296  *
1297  * - On vcpu_load() in a non-HYP context with HCR_EL2.NV==1, if the above
1298  *   TLB exists, we map it in the fixmap for this CPU, and run with it. We
1299  *   have to respect the permissions dictated by the guest, but not the
1300  *   memory type (FWB is a must).
1301  *
1302  * - Note that we usually don't do a vcpu_load() on the back of a fault
1303  *   (unless we are preempted), so the resolution of a translation fault
1304  *   must go via a request that will map the VNCR page in the fixmap.
1305  *   vcpu_load() might as well use the same mechanism.
1306  *
1307  * - On vcpu_put() in a non-HYP context with HCR_EL2.NV==1, if the TLB was
1308  *   mapped, we unmap it. Yes it is that simple. The TLB still exists
1309  *   though, and may be reused at a later load.
1310  *
1311  * - On permission fault, we simply forward the fault to the guest's EL2.
1312  *   Get out of my way.
1313  *
1314  * - On any TLBI for the EL2&0 translation regime, we must find any TLB that
1315  *   intersects with the TLBI request, invalidate it, and unmap the page
1316  *   from the fixmap. Because we need to look at all the vcpu-private TLBs,
1317  *   this requires some wide-ranging locking to ensure that nothing races
1318  *   against it. This may require some refcounting to avoid the search when
1319  *   no such TLB is present.
1320  *
1321  * - On MMU notifiers, we must invalidate our TLB in a similar way, but
1322  *   looking at the IPA instead. The funny part is that there may not be a
1323  *   stage-2 mapping for this page if L1 hasn't accessed it using LD/ST
1324  *   instructions.
1325  */
1326 
1327 int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu)
1328 {
1329 	if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY))
1330 		return 0;
1331 
1332 	if (!vcpu->arch.vncr_tlb) {
1333 		struct vncr_tlb *vt = kzalloc_obj(*vcpu->arch.vncr_tlb,
1334 						  GFP_KERNEL_ACCOUNT);
1335 
1336 		/*
1337 		 * Taking the lock on assignment ensures that the TLB is
1338 		 * seen as initialised when following the pointer (release
1339 		 * semantics of the unlock), and avoids having acquires on
1340 		 * each user which already take the lock.
1341 		 */
1342 		scoped_guard(write_lock, &vcpu->kvm->mmu_lock)
1343 			vcpu->arch.vncr_tlb = vt;
1344 	}
1345 
1346 	if (!vcpu->arch.vncr_tlb)
1347 		return -ENOMEM;
1348 
1349 	return 0;
1350 }
1351 
1352 static u64 read_vncr_el2(struct kvm_vcpu *vcpu)
1353 {
1354 	return (u64)sign_extend64(__vcpu_sys_reg(vcpu, VNCR_EL2), 48);
1355 }
1356 
1357 static int kvm_translate_vncr(struct kvm_vcpu *vcpu, bool *is_gmem)
1358 {
1359 	struct kvm_memory_slot *memslot;
1360 	bool write_fault, writable;
1361 	unsigned long mmu_seq;
1362 	struct vncr_tlb *vt;
1363 	struct page *page;
1364 	u64 va, pfn, gfn;
1365 	int ret;
1366 
1367 	vt = vcpu->arch.vncr_tlb;
1368 
1369 	/*
1370 	 * If we're about to walk the EL2 S1 PTs, we must invalidate the
1371 	 * current TLB, as it could be sampled from another vcpu doing a
1372 	 * TLBI *IS. A real CPU wouldn't do that, but we only keep a single
1373 	 * translation, so not much of a choice.
1374 	 *
1375 	 * We also prepare the next walk wilst we're at it.
1376 	 */
1377 	scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
1378 		this_cpu_reset_vncr_fixmap(vcpu);
1379 		vt->valid = false;
1380 
1381 		vt->wi = (struct s1_walk_info) {
1382 			.regime	= TR_EL20,
1383 			.as_el0	= false,
1384 			.pan	= false,
1385 		};
1386 		vt->wr = (struct s1_walk_result){};
1387 	}
1388 
1389 	guard(srcu)(&vcpu->kvm->srcu);
1390 
1391 	va =  read_vncr_el2(vcpu);
1392 
1393 	ret = __kvm_translate_va(vcpu, &vt->wi, &vt->wr, va);
1394 	if (ret)
1395 		return ret;
1396 
1397 	write_fault = kvm_is_write_fault(vcpu);
1398 
1399 	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
1400 	smp_rmb();
1401 
1402 	gfn = vt->wr.pa >> PAGE_SHIFT;
1403 	memslot = gfn_to_memslot(vcpu->kvm, gfn);
1404 	if (!memslot)
1405 		return -EFAULT;
1406 
1407 	*is_gmem = kvm_slot_has_gmem(memslot);
1408 	if (!*is_gmem) {
1409 		pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
1410 					&writable, &page);
1411 		if (is_error_noslot_pfn(pfn) || (write_fault && !writable))
1412 			return -EFAULT;
1413 	} else {
1414 		ret = kvm_gmem_get_pfn(vcpu->kvm, memslot, gfn, &pfn, &page, NULL);
1415 		if (ret) {
1416 			kvm_prepare_memory_fault_exit(vcpu, vt->wr.pa, PAGE_SIZE,
1417 					      write_fault, false, false);
1418 			return ret;
1419 		}
1420 	}
1421 
1422 	scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
1423 		if (mmu_invalidate_retry(vcpu->kvm, mmu_seq)) {
1424 			kvm_release_faultin_page(vcpu->kvm, page, true, false);
1425 			return -EAGAIN;
1426 		}
1427 
1428 		vt->gva = va;
1429 		vt->hpa = pfn << PAGE_SHIFT;
1430 		vt->valid = true;
1431 		vt->cpu = -1;
1432 
1433 		kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu);
1434 		kvm_release_faultin_page(vcpu->kvm, page, false, vt->wr.pw);
1435 	}
1436 
1437 	if (vt->wr.pw)
1438 		mark_page_dirty(vcpu->kvm, gfn);
1439 
1440 	return 0;
1441 }
1442 
1443 static void inject_vncr_perm(struct kvm_vcpu *vcpu)
1444 {
1445 	struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
1446 	u64 esr = kvm_vcpu_get_esr(vcpu);
1447 
1448 	/* Adjust the fault level to reflect that of the guest's */
1449 	esr &= ~ESR_ELx_FSC;
1450 	esr |= FIELD_PREP(ESR_ELx_FSC,
1451 			  ESR_ELx_FSC_PERM_L(vt->wr.level));
1452 
1453 	kvm_inject_nested_sync(vcpu, esr);
1454 }
1455 
1456 static bool kvm_vncr_tlb_lookup(struct kvm_vcpu *vcpu)
1457 {
1458 	struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
1459 
1460 	lockdep_assert_held_read(&vcpu->kvm->mmu_lock);
1461 
1462 	if (!vt->valid)
1463 		return false;
1464 
1465 	if (read_vncr_el2(vcpu) != vt->gva)
1466 		return false;
1467 
1468 	if (vt->wr.nG)
1469 		return get_asid_by_regime(vcpu, TR_EL20) == vt->wr.asid;
1470 
1471 	return true;
1472 }
1473 
1474 int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu)
1475 {
1476 	struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
1477 	u64 esr = kvm_vcpu_get_esr(vcpu);
1478 
1479 	WARN_ON_ONCE(!(esr & ESR_ELx_VNCR));
1480 
1481 	if (kvm_vcpu_abt_issea(vcpu))
1482 		return kvm_handle_guest_sea(vcpu);
1483 
1484 	if (esr_fsc_is_permission_fault(esr)) {
1485 		inject_vncr_perm(vcpu);
1486 	} else if (esr_fsc_is_translation_fault(esr)) {
1487 		bool valid, is_gmem = false;
1488 		int ret;
1489 
1490 		scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
1491 			valid = kvm_vncr_tlb_lookup(vcpu);
1492 
1493 		if (!valid)
1494 			ret = kvm_translate_vncr(vcpu, &is_gmem);
1495 		else
1496 			ret = -EPERM;
1497 
1498 		switch (ret) {
1499 		case -EAGAIN:
1500 			/* Let's try again... */
1501 			break;
1502 		case -ENOMEM:
1503 			/*
1504 			 * For guest_memfd, this indicates that it failed to
1505 			 * create a folio to back the memory. Inform userspace.
1506 			 */
1507 			if (is_gmem)
1508 				return 0;
1509 			/* Otherwise, let's try again... */
1510 			break;
1511 		case -EFAULT:
1512 		case -EIO:
1513 		case -EHWPOISON:
1514 			if (is_gmem)
1515 				return 0;
1516 			fallthrough;
1517 		case -EINVAL:
1518 		case -ENOENT:
1519 		case -EACCES:
1520 			/*
1521 			 * Translation failed, inject the corresponding
1522 			 * exception back to EL2.
1523 			 */
1524 			BUG_ON(!vt->wr.failed);
1525 
1526 			esr &= ~ESR_ELx_FSC;
1527 			esr |= FIELD_PREP(ESR_ELx_FSC, vt->wr.fst);
1528 
1529 			kvm_inject_nested_sync(vcpu, esr);
1530 			break;
1531 		case -EPERM:
1532 			/* Hack to deal with POE until we get kernel support */
1533 			inject_vncr_perm(vcpu);
1534 			break;
1535 		case 0:
1536 			break;
1537 		}
1538 	} else {
1539 		WARN_ONCE(1, "Unhandled VNCR abort, ESR=%llx\n", esr);
1540 	}
1541 
1542 	return 1;
1543 }
1544 
1545 static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu)
1546 {
1547 	struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
1548 	pgprot_t prot;
1549 
1550 	guard(preempt)();
1551 	guard(read_lock)(&vcpu->kvm->mmu_lock);
1552 
1553 	/*
1554 	 * The request to map VNCR may have raced against some other
1555 	 * event, such as an interrupt, and may not be valid anymore.
1556 	 */
1557 	if (is_hyp_ctxt(vcpu))
1558 		return;
1559 
1560 	/*
1561 	 * Check that the pseudo-TLB is valid and that VNCR_EL2 still
1562 	 * contains the expected value. If it doesn't, we simply bail out
1563 	 * without a mapping -- a transformed MSR/MRS will generate the
1564 	 * fault and allows us to populate the pseudo-TLB.
1565 	 */
1566 	if (!vt->valid)
1567 		return;
1568 
1569 	if (read_vncr_el2(vcpu) != vt->gva)
1570 		return;
1571 
1572 	if (vt->wr.nG && get_asid_by_regime(vcpu, TR_EL20) != vt->wr.asid)
1573 		return;
1574 
1575 	vt->cpu = smp_processor_id();
1576 
1577 	if (vt->wr.pw && vt->wr.pr)
1578 		prot = PAGE_KERNEL;
1579 	else if (vt->wr.pr)
1580 		prot = PAGE_KERNEL_RO;
1581 	else
1582 		prot = PAGE_NONE;
1583 
1584 	/*
1585 	 * We can't map write-only (or no permission at all) in the kernel,
1586 	 * but the guest can do it if using POE, so we'll have to turn a
1587 	 * translation fault into a permission fault at runtime.
1588 	 * FIXME: WO doesn't work at all, need POE support in the kernel.
1589 	 */
1590 	if (pgprot_val(prot) != pgprot_val(PAGE_NONE)) {
1591 		__set_fixmap(vncr_fixmap(vt->cpu), vt->hpa, prot);
1592 		host_data_set_flag(L1_VNCR_MAPPED);
1593 		atomic_inc(&vcpu->kvm->arch.vncr_map_count);
1594 	}
1595 }
1596 
1597 /*
1598  * Our emulated CPU doesn't support all the possible features. For the
1599  * sake of simplicity (and probably mental sanity), wipe out a number
1600  * of feature bits we don't intend to support for the time being.
1601  * This list should get updated as new features get added to the NV
1602  * support, and new extension to the architecture.
1603  */
1604 u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
1605 {
1606 	u64 orig_val = val;
1607 
1608 	switch (reg) {
1609 	case SYS_ID_AA64ISAR1_EL1:
1610 		/* Support everything but LS64 and Spec Invalidation */
1611 		val &= ~(ID_AA64ISAR1_EL1_LS64	|
1612 			 ID_AA64ISAR1_EL1_SPECRES);
1613 		break;
1614 
1615 	case SYS_ID_AA64PFR0_EL1:
1616 		/* No RME, AMU, MPAM, or S-EL2 */
1617 		val &= ~(ID_AA64PFR0_EL1_RME	|
1618 			 ID_AA64PFR0_EL1_AMU	|
1619 			 ID_AA64PFR0_EL1_MPAM	|
1620 			 ID_AA64PFR0_EL1_SEL2	|
1621 			 ID_AA64PFR0_EL1_EL3	|
1622 			 ID_AA64PFR0_EL1_EL2	|
1623 			 ID_AA64PFR0_EL1_EL1	|
1624 			 ID_AA64PFR0_EL1_EL0);
1625 		/* 64bit only at any EL */
1626 		val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL0, IMP);
1627 		val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL1, IMP);
1628 		val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL2, IMP);
1629 		val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL3, IMP);
1630 		break;
1631 
1632 	case SYS_ID_AA64PFR1_EL1:
1633 		/* Only support BTI, SSBS, CSV2_frac */
1634 		val &= ~(ID_AA64PFR1_EL1_PFAR		|
1635 			 ID_AA64PFR1_EL1_MTEX		|
1636 			 ID_AA64PFR1_EL1_THE		|
1637 			 ID_AA64PFR1_EL1_GCS		|
1638 			 ID_AA64PFR1_EL1_MTE_frac	|
1639 			 ID_AA64PFR1_EL1_NMI		|
1640 			 ID_AA64PFR1_EL1_SME		|
1641 			 ID_AA64PFR1_EL1_RES0		|
1642 			 ID_AA64PFR1_EL1_MPAM_frac	|
1643 			 ID_AA64PFR1_EL1_MTE);
1644 		break;
1645 
1646 	case SYS_ID_AA64PFR2_EL1:
1647 		/* GICv5 is not yet supported for NV */
1648 		val &= ~ID_AA64PFR2_EL1_GCIE;
1649 		break;
1650 
1651 	case SYS_ID_AA64MMFR0_EL1:
1652 		/* Hide ExS, Secure Memory */
1653 		val &= ~(ID_AA64MMFR0_EL1_EXS		|
1654 			 ID_AA64MMFR0_EL1_TGRAN4_2	|
1655 			 ID_AA64MMFR0_EL1_TGRAN16_2	|
1656 			 ID_AA64MMFR0_EL1_TGRAN64_2	|
1657 			 ID_AA64MMFR0_EL1_SNSMEM);
1658 
1659 		/* Hide CNTPOFF if present */
1660 		val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, ECV, IMP);
1661 
1662 		/* Disallow unsupported S2 page sizes */
1663 		switch (PAGE_SIZE) {
1664 		case SZ_64K:
1665 			val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, NI);
1666 			fallthrough;
1667 		case SZ_16K:
1668 			val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, NI);
1669 			fallthrough;
1670 		case SZ_4K:
1671 			/* Support everything */
1672 			break;
1673 		}
1674 
1675 		/*
1676 		 * Since we can't support a guest S2 page size smaller
1677 		 * than the host's own page size (due to KVM only
1678 		 * populating its own S2 using the kernel's page
1679 		 * size), advertise the limitation using FEAT_GTG.
1680 		 */
1681 		switch (PAGE_SIZE) {
1682 		case SZ_4K:
1683 			if (_has_tgran_2(orig_val, 4))
1684 				val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, IMP);
1685 			fallthrough;
1686 		case SZ_16K:
1687 			if (_has_tgran_2(orig_val, 16))
1688 				val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, IMP);
1689 			fallthrough;
1690 		case SZ_64K:
1691 			if (_has_tgran_2(orig_val, 64))
1692 				val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN64_2, IMP);
1693 			break;
1694 		}
1695 
1696 		/* Cap PARange to 48bits */
1697 		val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, PARANGE, 48);
1698 		break;
1699 
1700 	case SYS_ID_AA64MMFR1_EL1:
1701 		val &= ~(ID_AA64MMFR1_EL1_CMOW		|
1702 			 ID_AA64MMFR1_EL1_nTLBPA	|
1703 			 ID_AA64MMFR1_EL1_ETS);
1704 
1705 		/* FEAT_E2H0 implies no VHE */
1706 		if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features))
1707 			val &= ~ID_AA64MMFR1_EL1_VH;
1708 
1709 		val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR1_EL1, HAFDBS, AF);
1710 		break;
1711 
1712 	case SYS_ID_AA64MMFR2_EL1:
1713 		val &= ~(ID_AA64MMFR2_EL1_BBM	|
1714 			 ID_AA64MMFR2_EL1_TTL	|
1715 			 GENMASK_ULL(47, 44)	|
1716 			 ID_AA64MMFR2_EL1_ST	|
1717 			 ID_AA64MMFR2_EL1_CCIDX	|
1718 			 ID_AA64MMFR2_EL1_VARange);
1719 
1720 		/* Force TTL support */
1721 		val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR2_EL1, TTL, IMP);
1722 		break;
1723 
1724 	case SYS_ID_AA64MMFR4_EL1:
1725 		/*
1726 		 * You get EITHER
1727 		 *
1728 		 * - FEAT_VHE without FEAT_E2H0
1729 		 * - FEAT_NV limited to FEAT_NV2
1730 		 * - HCR_EL2.NV1 being RES0
1731 		 *
1732 		 * OR
1733 		 *
1734 		 * - FEAT_E2H0 without FEAT_VHE nor FEAT_NV
1735 		 *
1736 		 * Life is too short for anything else.
1737 		 */
1738 		if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) {
1739 			val = 0;
1740 		} else {
1741 			val = SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY);
1742 			val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI_NV1);
1743 		}
1744 		break;
1745 
1746 	case SYS_ID_AA64DFR0_EL1:
1747 		/* Only limited support for PMU, Debug, BPs, WPs, and HPMN0 */
1748 		val &= ~(ID_AA64DFR0_EL1_ExtTrcBuff	|
1749 			 ID_AA64DFR0_EL1_BRBE		|
1750 			 ID_AA64DFR0_EL1_MTPMU		|
1751 			 ID_AA64DFR0_EL1_TraceBuffer	|
1752 			 ID_AA64DFR0_EL1_TraceFilt	|
1753 			 ID_AA64DFR0_EL1_PMSVer		|
1754 			 ID_AA64DFR0_EL1_CTX_CMPs	|
1755 			 ID_AA64DFR0_EL1_SEBEP		|
1756 			 ID_AA64DFR0_EL1_PMSS		|
1757 			 ID_AA64DFR0_EL1_TraceVer);
1758 
1759 		/*
1760 		 * FEAT_Debugv8p9 requires support for extended breakpoints /
1761 		 * watchpoints.
1762 		 */
1763 		val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8);
1764 		break;
1765 	}
1766 
1767 	return val;
1768 }
1769 
1770 u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *vcpu,
1771 			     enum vcpu_sysreg sr, u64 v)
1772 {
1773 	struct resx resx;
1774 
1775 	resx = kvm_get_sysreg_resx(vcpu->kvm, sr);
1776 	v &= ~resx.res0;
1777 	v |= resx.res1;
1778 
1779 	return v;
1780 }
1781 
1782 static __always_inline void set_sysreg_masks(struct kvm *kvm, int sr, struct resx resx)
1783 {
1784 	BUILD_BUG_ON(!__builtin_constant_p(sr));
1785 	BUILD_BUG_ON(sr < __SANITISED_REG_START__);
1786 	BUILD_BUG_ON(sr >= NR_SYS_REGS);
1787 
1788 	kvm_set_sysreg_resx(kvm, sr, resx);
1789 }
1790 
1791 int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu)
1792 {
1793 	struct kvm *kvm = vcpu->kvm;
1794 	struct resx resx;
1795 
1796 	lockdep_assert_held(&kvm->arch.config_lock);
1797 
1798 	if (kvm->arch.sysreg_masks)
1799 		goto out;
1800 
1801 	kvm->arch.sysreg_masks = kzalloc_obj(*(kvm->arch.sysreg_masks),
1802 					     GFP_KERNEL_ACCOUNT);
1803 	if (!kvm->arch.sysreg_masks)
1804 		return -ENOMEM;
1805 
1806 	/* VTTBR_EL2 */
1807 	resx = (typeof(resx)){};
1808 	if (!kvm_has_feat_enum(kvm, ID_AA64MMFR1_EL1, VMIDBits, 16))
1809 		resx.res0 |= GENMASK(63, 56);
1810 	if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, CnP, IMP))
1811 		resx.res0 |= VTTBR_CNP_BIT;
1812 	set_sysreg_masks(kvm, VTTBR_EL2, resx);
1813 
1814 	/* VTCR_EL2 */
1815 	resx = get_reg_fixed_bits(kvm, VTCR_EL2);
1816 	set_sysreg_masks(kvm, VTCR_EL2, resx);
1817 
1818 	/* VMPIDR_EL2 */
1819 	resx.res0 = GENMASK(63, 40) | GENMASK(30, 24);
1820 	resx.res1 = BIT(31);
1821 	set_sysreg_masks(kvm, VMPIDR_EL2, resx);
1822 
1823 	/* HCR_EL2 */
1824 	resx = get_reg_fixed_bits(kvm, HCR_EL2);
1825 	set_sysreg_masks(kvm, HCR_EL2, resx);
1826 
1827 	/* HCRX_EL2 */
1828 	resx = get_reg_fixed_bits(kvm, HCRX_EL2);
1829 	set_sysreg_masks(kvm, HCRX_EL2, resx);
1830 
1831 	/* HFG[RW]TR_EL2 */
1832 	resx = get_reg_fixed_bits(kvm, HFGRTR_EL2);
1833 	set_sysreg_masks(kvm, HFGRTR_EL2, resx);
1834 	resx = get_reg_fixed_bits(kvm, HFGWTR_EL2);
1835 	set_sysreg_masks(kvm, HFGWTR_EL2, resx);
1836 
1837 	/* HDFG[RW]TR_EL2 */
1838 	resx = get_reg_fixed_bits(kvm, HDFGRTR_EL2);
1839 	set_sysreg_masks(kvm, HDFGRTR_EL2, resx);
1840 	resx = get_reg_fixed_bits(kvm, HDFGWTR_EL2);
1841 	set_sysreg_masks(kvm, HDFGWTR_EL2, resx);
1842 
1843 	/* HFGITR_EL2 */
1844 	resx = get_reg_fixed_bits(kvm, HFGITR_EL2);
1845 	set_sysreg_masks(kvm, HFGITR_EL2, resx);
1846 
1847 	/* HAFGRTR_EL2 - not a lot to see here */
1848 	resx = get_reg_fixed_bits(kvm, HAFGRTR_EL2);
1849 	set_sysreg_masks(kvm, HAFGRTR_EL2, resx);
1850 
1851 	/* HFG[RW]TR2_EL2 */
1852 	resx = get_reg_fixed_bits(kvm, HFGRTR2_EL2);
1853 	set_sysreg_masks(kvm, HFGRTR2_EL2, resx);
1854 	resx = get_reg_fixed_bits(kvm, HFGWTR2_EL2);
1855 	set_sysreg_masks(kvm, HFGWTR2_EL2, resx);
1856 
1857 	/* HDFG[RW]TR2_EL2 */
1858 	resx = get_reg_fixed_bits(kvm, HDFGRTR2_EL2);
1859 	set_sysreg_masks(kvm, HDFGRTR2_EL2, resx);
1860 	resx = get_reg_fixed_bits(kvm, HDFGWTR2_EL2);
1861 	set_sysreg_masks(kvm, HDFGWTR2_EL2, resx);
1862 
1863 	/* HFGITR2_EL2 */
1864 	resx = get_reg_fixed_bits(kvm, HFGITR2_EL2);
1865 	set_sysreg_masks(kvm, HFGITR2_EL2, resx);
1866 
1867 	/* TCR2_EL2 */
1868 	resx = get_reg_fixed_bits(kvm, TCR2_EL2);
1869 	set_sysreg_masks(kvm, TCR2_EL2, resx);
1870 
1871 	/* SCTLR_EL1 */
1872 	resx = get_reg_fixed_bits(kvm, SCTLR_EL1);
1873 	set_sysreg_masks(kvm, SCTLR_EL1, resx);
1874 
1875 	/* SCTLR_EL2 */
1876 	resx = get_reg_fixed_bits(kvm, SCTLR_EL2);
1877 	set_sysreg_masks(kvm, SCTLR_EL2, resx);
1878 
1879 	/* SCTLR2_ELx */
1880 	resx = get_reg_fixed_bits(kvm, SCTLR2_EL1);
1881 	set_sysreg_masks(kvm, SCTLR2_EL1, resx);
1882 	resx = get_reg_fixed_bits(kvm, SCTLR2_EL2);
1883 	set_sysreg_masks(kvm, SCTLR2_EL2, resx);
1884 
1885 	/* MDCR_EL2 */
1886 	resx = get_reg_fixed_bits(kvm, MDCR_EL2);
1887 	set_sysreg_masks(kvm, MDCR_EL2, resx);
1888 
1889 	/* CNTHCTL_EL2 */
1890 	resx.res0 = GENMASK(63, 20);
1891 	resx.res1 = 0;
1892 	if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RME, IMP))
1893 		resx.res0 |= CNTHCTL_CNTPMASK | CNTHCTL_CNTVMASK;
1894 	if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, CNTPOFF)) {
1895 		resx.res0 |= CNTHCTL_ECV;
1896 		if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, IMP))
1897 			resx.res0 |= (CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT |
1898 				      CNTHCTL_EL1NVPCT | CNTHCTL_EL1NVVCT);
1899 	}
1900 	if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP))
1901 		resx.res0 |= GENMASK(11, 8);
1902 	set_sysreg_masks(kvm, CNTHCTL_EL2, resx);
1903 
1904 	/* ICH_HCR_EL2 */
1905 	resx.res0 = ICH_HCR_EL2_RES0;
1906 	resx.res1 = ICH_HCR_EL2_RES1;
1907 	if (!(kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_TDS))
1908 		resx.res0 |= ICH_HCR_EL2_TDIR;
1909 	/* No GICv4 is presented to the guest */
1910 	resx.res0 |= ICH_HCR_EL2_DVIM | ICH_HCR_EL2_vSGIEOICount;
1911 	set_sysreg_masks(kvm, ICH_HCR_EL2, resx);
1912 
1913 	/* VNCR_EL2 */
1914 	resx.res0 = VNCR_EL2_RES0;
1915 	resx.res1 = VNCR_EL2_RES1;
1916 	set_sysreg_masks(kvm, VNCR_EL2, resx);
1917 
1918 	/* ZCR_EL2 - bits 8:4 are RAZ/WI so treat them as RES0 */
1919 	resx.res0 = ZCR_ELx_RES0 | GENMASK_ULL(8, 4);
1920 	resx.res1 = ZCR_ELx_RES1;
1921 	set_sysreg_masks(kvm, ZCR_EL2, resx);
1922 
1923 out:
1924 	for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++)
1925 		__vcpu_rmw_sys_reg(vcpu, sr, |=, 0);
1926 
1927 	return 0;
1928 }
1929 
1930 void check_nested_vcpu_requests(struct kvm_vcpu *vcpu)
1931 {
1932 	if (kvm_check_request(KVM_REQ_NESTED_S2_UNMAP, vcpu)) {
1933 		struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
1934 
1935 		write_lock(&vcpu->kvm->mmu_lock);
1936 		if (mmu->pending_unmap) {
1937 			kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), true);
1938 			mmu->pending_unmap = false;
1939 		}
1940 		write_unlock(&vcpu->kvm->mmu_lock);
1941 	}
1942 
1943 	if (kvm_check_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu))
1944 		kvm_map_l1_vncr(vcpu);
1945 
1946 	/* Must be last, as may switch context! */
1947 	if (kvm_check_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu))
1948 		kvm_inject_nested_irq(vcpu);
1949 }
1950 
1951 /*
1952  * One of the many architectural bugs in FEAT_NV2 is that the guest hypervisor
1953  * can write to HCR_EL2 behind our back, potentially changing the exception
1954  * routing / masking for even the host context.
1955  *
1956  * What follows is some slop to (1) react to exception routing / masking and (2)
1957  * preserve the pending SError state across translation regimes.
1958  */
1959 void kvm_nested_flush_hwstate(struct kvm_vcpu *vcpu)
1960 {
1961 	if (!vcpu_has_nv(vcpu))
1962 		return;
1963 
1964 	if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING)))
1965 		kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu));
1966 }
1967 
1968 void kvm_nested_sync_hwstate(struct kvm_vcpu *vcpu)
1969 {
1970 	unsigned long *hcr = vcpu_hcr(vcpu);
1971 
1972 	if (!vcpu_has_nv(vcpu))
1973 		return;
1974 
1975 	/*
1976 	 * We previously decided that an SError was deliverable to the guest.
1977 	 * Reap the pending state from HCR_EL2 and...
1978 	 */
1979 	if (unlikely(__test_and_clear_bit(__ffs(HCR_VSE), hcr)))
1980 		vcpu_set_flag(vcpu, NESTED_SERROR_PENDING);
1981 
1982 	/*
1983 	 * Re-attempt SError injection in case the deliverability has changed,
1984 	 * which is necessary to faithfully emulate WFI the case of a pending
1985 	 * SError being a wakeup condition.
1986 	 */
1987 	if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING)))
1988 		kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu));
1989 }
1990 
1991 /*
1992  * KVM unconditionally sets most of these traps anyway but use an allowlist
1993  * to document the guest hypervisor traps that may take precedence and guard
1994  * against future changes to the non-nested trap configuration.
1995  */
1996 #define NV_MDCR_GUEST_INCLUDE	(MDCR_EL2_TDE	|	\
1997 				 MDCR_EL2_TDA	|	\
1998 				 MDCR_EL2_TDRA	|	\
1999 				 MDCR_EL2_TTRF	|	\
2000 				 MDCR_EL2_TPMS	|	\
2001 				 MDCR_EL2_TPM	|	\
2002 				 MDCR_EL2_TPMCR	|	\
2003 				 MDCR_EL2_TDCC	|	\
2004 				 MDCR_EL2_TDOSA)
2005 
2006 void kvm_nested_setup_mdcr_el2(struct kvm_vcpu *vcpu)
2007 {
2008 	u64 guest_mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2);
2009 
2010 	if (is_nested_ctxt(vcpu))
2011 		vcpu->arch.mdcr_el2 |= (guest_mdcr & NV_MDCR_GUEST_INCLUDE);
2012 	/*
2013 	 * In yet another example where FEAT_NV2 is fscking broken, accesses
2014 	 * to MDSCR_EL1 are redirected to the VNCR despite having an effect
2015 	 * at EL2. Use a big hammer to apply sanity.
2016 	 *
2017 	 * Unless of course we have FEAT_FGT, in which case we can precisely
2018 	 * trap MDSCR_EL1.
2019 	 */
2020 	else if (!cpus_have_final_cap(ARM64_HAS_FGT))
2021 		vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
2022 }
2023