1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2017 - Linaro Ltd
4  * Author: Jintack Lim <jintack.lim@linaro.org>
5  */
6 
7 #include <linux/kvm_host.h>
8 
9 #include <asm/esr.h>
10 #include <asm/kvm_hyp.h>
11 #include <asm/kvm_mmu.h>
12 
13 static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw)
14 {
15 	wr->fst		= fst;
16 	wr->ptw		= s1ptw;
17 	wr->s2		= s1ptw;
18 	wr->failed	= true;
19 }
20 
21 #define S1_MMU_DISABLED		(-127)
22 
23 static int get_ia_size(struct s1_walk_info *wi)
24 {
25 	return 64 - wi->txsz;
26 }
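
/*
 * Editor's note (not in the original source): the input address size is
 * simply 64 - TxSZ, e.g. TxSZ = 25 yields a 39-bit (512GB) input range
 * for the corresponding TTBR.
 */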
27 
28 /* Return true if the IPA is out of the OA range */
29 static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
30 {
31 	if (wi->pa52bit)
32 		return wi->max_oa_bits < 52 && (ipa & GENMASK_ULL(51, wi->max_oa_bits));
33 	return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
34 }
35 
36 static bool has_52bit_pa(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, u64 tcr)
37 {
38 	switch (BIT(wi->pgshift)) {
39 	case SZ_64K:
40 	default:		/* IMPDEF: treat any other value as 64k */
41 		if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52))
42 			return false;
43 		return ((wi->regime == TR_EL2 ?
44 			 FIELD_GET(TCR_EL2_PS_MASK, tcr) :
45 			 FIELD_GET(TCR_IPS_MASK, tcr)) == 0b0110);
46 	case SZ_16K:
47 		if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT))
48 			return false;
49 		break;
50 	case SZ_4K:
51 		if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT))
52 			return false;
53 		break;
54 	}
55 
56 	return (tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS));
57 }
58 
59 static u64 desc_to_oa(struct s1_walk_info *wi, u64 desc)
60 {
61 	u64 addr;
62 
63 	if (!wi->pa52bit)
64 		return desc & GENMASK_ULL(47, wi->pgshift);
65 
66 	switch (BIT(wi->pgshift)) {
67 	case SZ_4K:
68 	case SZ_16K:
69 		addr = desc & GENMASK_ULL(49, wi->pgshift);
70 		addr |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, desc) << 50;
71 		break;
72 	case SZ_64K:
73 	default:	    /* IMPDEF: treat any other value as 64k */
74 		addr = desc & GENMASK_ULL(47, wi->pgshift);
75 		addr |= FIELD_GET(KVM_PTE_ADDR_51_48, desc) << 48;
76 		break;
77 	}
78 
79 	return addr;
80 }
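
/*
 * Editor's note: with FEAT_LPA (64k granule) the descriptor carries
 * OA[51:48] in bits [15:12], while FEAT_LPA2 (4k/16k granules) carries
 * OA[51:50] in bits [9:8]; desc_to_oa() folds those back into the
 * output address alongside the conventional address bits.
 */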
81 
82 /* Return the translation regime that applies to an AT instruction */
83 static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
84 {
85 	/*
86 	 * We only get here from guest EL2, so the translation
87 	 * regime AT applies to is solely defined by {E2H,TGE}.
88 	 */
89 	switch (op) {
90 	case OP_AT_S1E2R:
91 	case OP_AT_S1E2W:
92 	case OP_AT_S1E2A:
93 		return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
94 		break;
95 	default:
96 		return (vcpu_el2_e2h_is_set(vcpu) &&
97 			vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
98 	}
99 }
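
/*
 * Editor's summary: AT S1E2* therefore uses the EL2 regime when
 * HCR_EL2.E2H is clear and the EL2&0 regime when it is set, while the
 * EL1/EL0 variants only use EL2&0 under {E2H,TGE} == {1,1} and EL1&0
 * otherwise.
 */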
100 
101 static u64 effective_tcr2(struct kvm_vcpu *vcpu, enum trans_regime regime)
102 {
103 	if (regime == TR_EL10) {
104 		if (vcpu_has_nv(vcpu) &&
105 		    !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En))
106 			return 0;
107 
108 		return vcpu_read_sys_reg(vcpu, TCR2_EL1);
109 	}
110 
111 	return vcpu_read_sys_reg(vcpu, TCR2_EL2);
112 }
113 
114 static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
115 {
116 	if (!kvm_has_s1pie(vcpu->kvm))
117 		return false;
118 
119 	/* Abuse TCR2_EL1_PIE and use it for EL2 as well */
120 	return effective_tcr2(vcpu, regime) & TCR2_EL1_PIE;
121 }
122 
123 static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi)
124 {
125 	u64 val;
126 
127 	if (!kvm_has_s1poe(vcpu->kvm)) {
128 		wi->poe = wi->e0poe = false;
129 		return;
130 	}
131 
132 	val = effective_tcr2(vcpu, wi->regime);
133 
134 	/* Abuse TCR2_EL1_* for EL2 */
135 	wi->poe = val & TCR2_EL1_POE;
136 	wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE);
137 }
138 
139 static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
140 			 struct s1_walk_result *wr, u64 va)
141 {
142 	u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr;
143 	unsigned int stride, x;
144 	bool va55, tbi, lva;
145 
146 	va55 = va & BIT(55);
147 
148 	if (vcpu_has_nv(vcpu)) {
149 		hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
150 		wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));
151 	} else {
152 		WARN_ON_ONCE(wi->regime != TR_EL10);
153 		wi->s2 = false;
154 		hcr = 0;
155 	}
156 
157 	switch (wi->regime) {
158 	case TR_EL10:
159 		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL1);
160 		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL1);
161 		ttbr	= (va55 ?
162 			   vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
163 			   vcpu_read_sys_reg(vcpu, TTBR0_EL1));
164 		break;
165 	case TR_EL2:
166 	case TR_EL20:
167 		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL2);
168 		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL2);
169 		ttbr	= (va55 ?
170 			   vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
171 			   vcpu_read_sys_reg(vcpu, TTBR0_EL2));
172 		break;
173 	default:
174 		BUG();
175 	}
176 
177 	/* Someone was silly enough to encode TG0/TG1 differently */
178 	if (va55 && wi->regime != TR_EL2) {
179 		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
180 		tg = FIELD_GET(TCR_TG1_MASK, tcr);
181 
182 		switch (tg << TCR_TG1_SHIFT) {
183 		case TCR_TG1_4K:
184 			wi->pgshift = 12;	 break;
185 		case TCR_TG1_16K:
186 			wi->pgshift = 14;	 break;
187 		case TCR_TG1_64K:
188 		default:	    /* IMPDEF: treat any other value as 64k */
189 			wi->pgshift = 16;	 break;
190 		}
191 	} else {
192 		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
193 		tg = FIELD_GET(TCR_TG0_MASK, tcr);
194 
195 		switch (tg << TCR_TG0_SHIFT) {
196 		case TCR_TG0_4K:
197 			wi->pgshift = 12;	 break;
198 		case TCR_TG0_16K:
199 			wi->pgshift = 14;	 break;
200 		case TCR_TG0_64K:
201 		default:	    /* IMPDEF: treat any other value as 64k */
202 			wi->pgshift = 16;	 break;
203 		}
204 	}
205 
206 	wi->pa52bit = has_52bit_pa(vcpu, wi, tcr);
207 
208 	ia_bits = get_ia_size(wi);
209 
210 	/* AArch64.S1StartLevel() */
211 	stride = wi->pgshift - 3;
212 	wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
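	/*
	 * Worked example (editor's addition): with 4k pages (pgshift = 12,
	 * stride = 9) and ia_bits = 39, sl = 3 - ((38 - 12) / 9) = 1, i.e.
	 * the walk starts at level 1 and resolves bits [38:30] first.
	 */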
213 
214 	if (wi->regime == TR_EL2 && va55)
215 		goto addrsz;
216 
217 	tbi = (wi->regime == TR_EL2 ?
218 	       FIELD_GET(TCR_EL2_TBI, tcr) :
219 	       (va55 ?
220 		FIELD_GET(TCR_TBI1, tcr) :
221 		FIELD_GET(TCR_TBI0, tcr)));
222 
223 	if (!tbi && (u64)sign_extend64(va, 55) != va)
224 		goto addrsz;
225 
226 	wi->sh = (wi->regime == TR_EL2 ?
227 		  FIELD_GET(TCR_EL2_SH0_MASK, tcr) :
228 		  (va55 ?
229 		   FIELD_GET(TCR_SH1_MASK, tcr) :
230 		   FIELD_GET(TCR_SH0_MASK, tcr)));
231 
232 	va = (u64)sign_extend64(va, 55);
233 
234 	/* Let's put the MMU disabled case aside immediately */
235 	switch (wi->regime) {
236 	case TR_EL10:
237 		/*
238 		 * If dealing with the EL1&0 translation regime, 3 things
239 		 * can disable the S1 translation:
240 		 *
241 		 * - HCR_EL2.DC = 1
242 		 * - HCR_EL2.{E2H,TGE} = {0,1}
243 		 * - SCTLR_EL1.M = 0
244 		 *
245 		 * The TGE part is interesting. If we have decided that this
246 		 * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
247 		 * {0,x}, and we only need to test for TGE == 1.
248 		 */
249 		if (hcr & (HCR_DC | HCR_TGE)) {
250 			wr->level = S1_MMU_DISABLED;
251 			break;
252 		}
253 		fallthrough;
254 	case TR_EL2:
255 	case TR_EL20:
256 		if (!(sctlr & SCTLR_ELx_M))
257 			wr->level = S1_MMU_DISABLED;
258 		break;
259 	}
260 
261 	if (wr->level == S1_MMU_DISABLED) {
262 		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
263 			goto addrsz;
264 
265 		wr->pa = va;
266 		return 0;
267 	}
268 
269 	wi->be = sctlr & SCTLR_ELx_EE;
270 
271 	wi->hpd  = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
272 	wi->hpd &= (wi->regime == TR_EL2 ?
273 		    FIELD_GET(TCR_EL2_HPD, tcr) :
274 		    (va55 ?
275 		     FIELD_GET(TCR_HPD1, tcr) :
276 		     FIELD_GET(TCR_HPD0, tcr)));
277 	/* R_JHSVW */
278 	wi->hpd |= s1pie_enabled(vcpu, wi->regime);
279 
280 	/* Do we have POE? */
281 	compute_s1poe(vcpu, wi);
282 
283 	/* R_BVXDG */
284 	wi->hpd |= (wi->poe || wi->e0poe);
285 
286 	/* R_PLCGL, R_YXNYW */
287 	if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
288 		if (wi->txsz > 39)
289 			goto transfault;
290 	} else {
291 		if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47))
292 			goto transfault;
293 	}
294 
295 	/* R_GTJBY, R_SXWGM */
296 	switch (BIT(wi->pgshift)) {
297 	case SZ_4K:
298 	case SZ_16K:
299 		lva = wi->pa52bit;
300 		break;
301 	case SZ_64K:
302 		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
303 		break;
304 	}
305 
306 	if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
307 		goto transfault;
308 
309 	/* R_YYVYV, I_THCZK */
310 	if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
311 	    (va55 && va < GENMASK(63, ia_bits)))
312 		goto transfault;
313 
314 	/* I_ZFSYQ */
315 	if (wi->regime != TR_EL2 &&
316 	    (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
317 		goto transfault;
318 
319 	/* R_BNDVG and following statements */
320 	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
321 	    wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0)))
322 		goto transfault;
323 
324 	ps = (wi->regime == TR_EL2 ?
325 	      FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
326 
327 	wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps, wi->pa52bit));
328 
329 	/* Compute minimal alignment */
330 	x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
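	/*
	 * Worked example (editor's addition): with 4k pages, ia_bits = 39
	 * and a start level of 1, x = 3 + 39 - (2 * 9 + 12) = 12, i.e. the
	 * 512-entry level-1 table is 4kB in size and must be 4kB aligned.
	 */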
331 
332 	wi->baddr = ttbr & TTBRx_EL1_BADDR;
333 	if (wi->pa52bit) {
334 		/*
335 		 * Force the alignment on 64 bytes for top-level tables
336 		 * smaller than 8 entries, since TTBR.BADDR[5:2] are used to
337 		 * store bits [51:48] of the first level of lookup.
338 		 */
339 		x = max(x, 6);
340 
341 		wi->baddr |= FIELD_GET(GENMASK_ULL(5, 2), ttbr) << 48;
342 	}
343 
344 	/* R_VPBBF */
345 	if (check_output_size(wi->baddr, wi))
346 		goto addrsz;
347 
348 	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
349 
350 	return 0;
351 
352 addrsz:
353 	/*
354 	 * Address Size Fault level 0 to indicate it comes from TTBR.
355 	 * yes, this is an oddity.
356 	 */
357 	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false);
358 	return -EFAULT;
359 
360 transfault:
361 	/* Translation Fault on start level */
362 	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(wi->sl), false);
363 	return -EFAULT;
364 }
365 
366 static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
367 		   struct s1_walk_result *wr, u64 va)
368 {
369 	u64 va_top, va_bottom, baddr, desc;
370 	int level, stride, ret;
371 
372 	level = wi->sl;
373 	stride = wi->pgshift - 3;
374 	baddr = wi->baddr;
375 
376 	va_top = get_ia_size(wi) - 1;
377 
378 	while (1) {
379 		u64 index, ipa;
380 
381 		va_bottom = (3 - level) * stride + wi->pgshift;
382 		index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
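		/*
		 * Editor's note: the shift by (va_bottom - 3) turns the VA
		 * field directly into a byte offset, e.g. at level 1 with
		 * 4k pages va[38:30] selects one of 512 8-byte descriptors
		 * in the table at baddr.
		 */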
383 
384 		ipa = baddr | index;
385 
386 		if (wi->s2) {
387 			struct kvm_s2_trans s2_trans = {};
388 
389 			ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
390 			if (ret) {
391 				fail_s1_walk(wr,
392 					     (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
393 					     true);
394 				return ret;
395 			}
396 
397 			if (!kvm_s2_trans_readable(&s2_trans)) {
398 				fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
399 					     true);
400 
401 				return -EPERM;
402 			}
403 
404 			ipa = kvm_s2_trans_output(&s2_trans);
405 		}
406 
407 		if (wi->filter) {
408 			ret = wi->filter->fn(&(struct s1_walk_context)
409 					     {
410 						     .wi	= wi,
411 						     .table_ipa	= baddr,
412 						     .level	= level,
413 					     }, wi->filter->priv);
414 			if (ret)
415 				return ret;
416 		}
417 
418 		ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
419 		if (ret) {
420 			fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false);
421 			return ret;
422 		}
423 
424 		if (wi->be)
425 			desc = be64_to_cpu((__force __be64)desc);
426 		else
427 			desc = le64_to_cpu((__force __le64)desc);
428 
429 		/* Invalid descriptor */
430 		if (!(desc & BIT(0)))
431 			goto transfault;
432 
433 		/* Block mapping, check validity down the line */
434 		if (!(desc & BIT(1)))
435 			break;
436 
437 		/* Page mapping */
438 		if (level == 3)
439 			break;
440 
441 		/* Table handling */
442 		if (!wi->hpd) {
443 			wr->APTable  |= FIELD_GET(S1_TABLE_AP, desc);
444 			wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
445 			wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
446 		}
447 
448 		baddr = desc_to_oa(wi, desc);
449 
450 		/* Check for out-of-range OA */
451 		if (check_output_size(baddr, wi))
452 			goto addrsz;
453 
454 		/* Prepare for next round */
455 		va_top = va_bottom - 1;
456 		level++;
457 	}
458 
459 	/* Block mapping, check the validity of the level */
460 	if (!(desc & BIT(1))) {
461 		bool valid_block = false;
462 
463 		switch (BIT(wi->pgshift)) {
464 		case SZ_4K:
465 			valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0);
466 			break;
467 		case SZ_16K:
468 		case SZ_64K:
469 			valid_block = level == 2 || (wi->pa52bit && level == 1);
470 			break;
471 		}
472 
473 		if (!valid_block)
474 			goto transfault;
475 	}
476 
477 	baddr = desc_to_oa(wi, desc);
478 	if (check_output_size(baddr & GENMASK(52, va_bottom), wi))
479 		goto addrsz;
480 
481 	if (!(desc & PTE_AF)) {
482 		fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false);
483 		return -EACCES;
484 	}
485 
486 	va_bottom += contiguous_bit_shift(desc, wi, level);
487 
488 	wr->failed = false;
489 	wr->level = level;
490 	wr->desc = desc;
491 	wr->pa = baddr & GENMASK(52, va_bottom);
492 	wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);
493 
494 	wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG);
495 	if (wr->nG) {
496 		u64 asid_ttbr, tcr;
497 
498 		switch (wi->regime) {
499 		case TR_EL10:
500 			tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
501 			asid_ttbr = ((tcr & TCR_A1) ?
502 				     vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
503 				     vcpu_read_sys_reg(vcpu, TTBR0_EL1));
504 			break;
505 		case TR_EL20:
506 			tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
507 			asid_ttbr = ((tcr & TCR_A1) ?
508 				     vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
509 				     vcpu_read_sys_reg(vcpu, TTBR0_EL2));
510 			break;
511 		default:
512 			BUG();
513 		}
514 
515 		wr->asid = FIELD_GET(TTBR_ASID_MASK, asid_ttbr);
516 		if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) ||
517 		    !(tcr & TCR_ASID16))
518 			wr->asid &= GENMASK(7, 0);
519 	}
520 
521 	return 0;
522 
523 addrsz:
524 	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), false);
525 	return -EINVAL;
526 transfault:
527 	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), false);
528 	return -ENOENT;
529 }
530 
531 struct mmu_config {
532 	u64	ttbr0;
533 	u64	ttbr1;
534 	u64	tcr;
535 	u64	mair;
536 	u64	tcr2;
537 	u64	pir;
538 	u64	pire0;
539 	u64	por_el0;
540 	u64	por_el1;
541 	u64	sctlr;
542 	u64	vttbr;
543 	u64	vtcr;
544 };
545 
546 static void __mmu_config_save(struct mmu_config *config)
547 {
548 	config->ttbr0	= read_sysreg_el1(SYS_TTBR0);
549 	config->ttbr1	= read_sysreg_el1(SYS_TTBR1);
550 	config->tcr	= read_sysreg_el1(SYS_TCR);
551 	config->mair	= read_sysreg_el1(SYS_MAIR);
552 	if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
553 		config->tcr2	= read_sysreg_el1(SYS_TCR2);
554 		if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
555 			config->pir	= read_sysreg_el1(SYS_PIR);
556 			config->pire0	= read_sysreg_el1(SYS_PIRE0);
557 		}
558 		if (system_supports_poe()) {
559 			config->por_el1	= read_sysreg_el1(SYS_POR);
560 			config->por_el0	= read_sysreg_s(SYS_POR_EL0);
561 		}
562 	}
563 	config->sctlr	= read_sysreg_el1(SYS_SCTLR);
564 	config->vttbr	= read_sysreg(vttbr_el2);
565 	config->vtcr	= read_sysreg(vtcr_el2);
566 }
567 
568 static void __mmu_config_restore(struct mmu_config *config)
569 {
570 	/*
571 	 * ARM errata 1165522 and 1530923 require TGE to be 1 before
572 	 * we update the guest state.
573 	 */
574 	asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
575 
576 	write_sysreg_el1(config->ttbr0,	SYS_TTBR0);
577 	write_sysreg_el1(config->ttbr1,	SYS_TTBR1);
578 	write_sysreg_el1(config->tcr,	SYS_TCR);
579 	write_sysreg_el1(config->mair,	SYS_MAIR);
580 	if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
581 		write_sysreg_el1(config->tcr2, SYS_TCR2);
582 		if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
583 			write_sysreg_el1(config->pir, SYS_PIR);
584 			write_sysreg_el1(config->pire0, SYS_PIRE0);
585 		}
586 		if (system_supports_poe()) {
587 			write_sysreg_el1(config->por_el1, SYS_POR);
588 			write_sysreg_s(config->por_el0, SYS_POR_EL0);
589 		}
590 	}
591 	write_sysreg_el1(config->sctlr,	SYS_SCTLR);
592 	write_sysreg(config->vttbr,	vttbr_el2);
593 	write_sysreg(config->vtcr,	vtcr_el2);
594 }
595 
596 static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
597 {
598 	u64 host_pan;
599 	bool fail;
600 
601 	host_pan = read_sysreg_s(SYS_PSTATE_PAN);
602 	write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN);
603 
604 	switch (op) {
605 	case OP_AT_S1E1RP:
606 		fail = __kvm_at(OP_AT_S1E1RP, vaddr);
607 		break;
608 	case OP_AT_S1E1WP:
609 		fail = __kvm_at(OP_AT_S1E1WP, vaddr);
610 		break;
611 	}
612 
613 	write_sysreg_s(host_pan, SYS_PSTATE_PAN);
614 
615 	return fail;
616 }
617 
618 #define MEMATTR(ic, oc)		(MEMATTR_##oc << 4 | MEMATTR_##ic)
619 #define MEMATTR_NC		0b0100
620 #define MEMATTR_Wt		0b1000
621 #define MEMATTR_Wb		0b1100
622 #define MEMATTR_WbRaWa		0b1111
623 
624 #define MEMATTR_IS_DEVICE(m)	(((m) & GENMASK(7, 4)) == 0)
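
/*
 * Example encodings (editor's addition): MEMATTR(NC, NC) is 0x44,
 * MEMATTR(Wb, Wb) is 0xcc and MEMATTR(WbRaWa, WbRaWa) is 0xff, matching
 * the usual MAIR_ELx attribute bytes; any value with Attr[7:4] == 0b0000
 * denotes a Device memory type.
 */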
625 
626 static u8 s2_memattr_to_attr(u8 memattr)
627 {
628 	memattr &= 0b1111;
629 
630 	switch (memattr) {
631 	case 0b0000:
632 	case 0b0001:
633 	case 0b0010:
634 	case 0b0011:
635 		return memattr << 2;
636 	case 0b0100:
637 		return MEMATTR(Wb, Wb);
638 	case 0b0101:
639 		return MEMATTR(NC, NC);
640 	case 0b0110:
641 		return MEMATTR(Wt, NC);
642 	case 0b0111:
643 		return MEMATTR(Wb, NC);
644 	case 0b1000:
645 		/* Reserved, assume NC */
646 		return MEMATTR(NC, NC);
647 	case 0b1001:
648 		return MEMATTR(NC, Wt);
649 	case 0b1010:
650 		return MEMATTR(Wt, Wt);
651 	case 0b1011:
652 		return MEMATTR(Wb, Wt);
653 	case 0b1100:
654 		/* Reserved, assume NC */
655 		return MEMATTR(NC, NC);
656 	case 0b1101:
657 		return MEMATTR(NC, Wb);
658 	case 0b1110:
659 		return MEMATTR(Wt, Wb);
660 	case 0b1111:
661 		return MEMATTR(Wb, Wb);
662 	default:
663 		unreachable();
664 	}
665 }
666 
667 static u8 combine_s1_s2_attr(u8 s1, u8 s2)
668 {
669 	bool transient;
670 	u8 final = 0;
671 
672 	/* Upgrade transient s1 to non-transient to simplify things */
673 	switch (s1) {
674 	case 0b0001 ... 0b0011:	/* Normal, Write-Through Transient */
675 		transient = true;
676 		s1 = MEMATTR_Wt | (s1 & GENMASK(1,0));
677 		break;
678 	case 0b0101 ... 0b0111:	/* Normal, Write-Back Transient */
679 		transient = true;
680 		s1 = MEMATTR_Wb | (s1 & GENMASK(1,0));
681 		break;
682 	default:
683 		transient = false;
684 	}
685 
686 	/* S2CombineS1AttrHints() */
687 	if ((s1 & GENMASK(3, 2)) == MEMATTR_NC ||
688 	    (s2 & GENMASK(3, 2)) == MEMATTR_NC)
689 		final = MEMATTR_NC;
690 	else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt ||
691 		 (s2 & GENMASK(3, 2)) == MEMATTR_Wt)
692 		final = MEMATTR_Wt;
693 	else
694 		final = MEMATTR_Wb;
695 
696 	if (final != MEMATTR_NC) {
697 		/* Inherit RaWa hints from S1 */
698 		if (transient) {
699 			switch (s1 & GENMASK(3, 2)) {
700 			case MEMATTR_Wt:
701 				final = 0;
702 				break;
703 			case MEMATTR_Wb:
704 				final = MEMATTR_NC;
705 				break;
706 			}
707 		}
708 
709 		final |= s1 & GENMASK(1, 0);
710 	}
711 
712 	return final;
713 }
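
/*
 * Example (editor's addition): S1 Normal Write-Back RaWa (0b1111)
 * combined with an S2 Write-Through attribute degrades to Write-Through
 * while keeping the S1 allocation hints, i.e. 0b1011.
 */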
714 
715 #define ATTR_NSH	0b00
716 #define ATTR_RSV	0b01
717 #define ATTR_OSH	0b10
718 #define ATTR_ISH	0b11
719 
720 static u8 compute_final_sh(u8 attr, u8 sh)
721 {
722 	/* Any form of device, as well as NC has SH[1:0]=0b10 */
723 	if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC))
724 		return ATTR_OSH;
725 
726 	if (sh == ATTR_RSV)		/* Reserved, mapped to NSH */
727 		sh = ATTR_NSH;
728 
729 	return sh;
730 }
731 
732 static u8 compute_s1_sh(struct s1_walk_info *wi, struct s1_walk_result *wr,
733 			u8 attr)
734 {
735 	u8 sh;
736 
737 	/*
738 	 * non-52bit and LPA have their basic shareability described in the
739 	 * descriptor. LPA2 gets it from the corresponding field in TCR,
740 	 * conveniently recorded in the walk info.
741 	 */
742 	if (!wi->pa52bit || BIT(wi->pgshift) == SZ_64K)
743 		sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_SH, wr->desc);
744 	else
745 		sh = wi->sh;
746 
747 	return compute_final_sh(attr, sh);
748 }
749 
750 static u8 combine_sh(u8 s1_sh, u8 s2_sh)
751 {
752 	if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
753 		return ATTR_OSH;
754 	if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH)
755 		return ATTR_ISH;
756 
757 	return ATTR_NSH;
758 }
759 
760 static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
761 			   struct kvm_s2_trans *tr)
762 {
763 	u8 s1_parattr, s2_memattr, final_attr, s2_sh;
764 	u64 par;
765 
766 	/* If S2 has failed to translate, report the damage */
767 	if (tr->esr) {
768 		par = SYS_PAR_EL1_RES1;
769 		par |= SYS_PAR_EL1_F;
770 		par |= SYS_PAR_EL1_S;
771 		par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr);
772 		return par;
773 	}
774 
775 	s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par);
776 	s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc);
777 
778 	if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) {
779 		if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP))
780 			s2_memattr &= ~BIT(3);
781 
782 		/* Combination of R_VRJSW and R_RHWZM */
783 		switch (s2_memattr) {
784 		case 0b0101:
785 			if (MEMATTR_IS_DEVICE(s1_parattr))
786 				final_attr = s1_parattr;
787 			else
788 				final_attr = MEMATTR(NC, NC);
789 			break;
790 		case 0b0110:
791 		case 0b1110:
792 			final_attr = MEMATTR(WbRaWa, WbRaWa);
793 			break;
794 		case 0b0111:
795 		case 0b1111:
796 			/* Preserve S1 attribute */
797 			final_attr = s1_parattr;
798 			break;
799 		case 0b0100:
800 		case 0b1100:
801 		case 0b1101:
802 			/* Reserved, do something non-silly */
803 			final_attr = s1_parattr;
804 			break;
805 		default:
806 			/*
807 			 * MemAttr[2]=0, Device from S2.
808 			 *
809 			 * FWB does not influence the way that stage 1
810 			 * memory types and attributes are combined
811 			 * with stage 2 Device type and attributes.
812 			 */
813 			final_attr = min(s2_memattr_to_attr(s2_memattr),
814 					 s1_parattr);
815 		}
816 	} else {
817 		/* Combination of R_HMNDG, R_TNHFM and R_GQFSF */
818 		u8 s2_parattr = s2_memattr_to_attr(s2_memattr);
819 
820 		if (MEMATTR_IS_DEVICE(s1_parattr) ||
821 		    MEMATTR_IS_DEVICE(s2_parattr)) {
822 			final_attr = min(s1_parattr, s2_parattr);
823 		} else {
824 			/* At this stage, this is memory vs memory */
825 			final_attr  = combine_s1_s2_attr(s1_parattr & 0xf,
826 							 s2_parattr & 0xf);
827 			final_attr |= combine_s1_s2_attr(s1_parattr >> 4,
828 							 s2_parattr >> 4) << 4;
829 		}
830 	}
831 
832 	if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) &&
833 	    !MEMATTR_IS_DEVICE(final_attr))
834 		final_attr = MEMATTR(NC, NC);
835 
836 	s2_sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_SH, tr->desc);
837 
838 	par  = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
839 	par |= tr->output & GENMASK(47, 12);
840 	par |= FIELD_PREP(SYS_PAR_EL1_SH,
841 			  combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
842 				     compute_final_sh(final_attr, s2_sh)));
843 
844 	return par;
845 }
846 
847 static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
848 			  struct s1_walk_result *wr)
849 {
850 	u64 par;
851 
852 	if (wr->failed) {
853 		par = SYS_PAR_EL1_RES1;
854 		par |= SYS_PAR_EL1_F;
855 		par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
856 		par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
857 		par |= wr->s2 ? SYS_PAR_EL1_S : 0;
858 	} else if (wr->level == S1_MMU_DISABLED) {
859 		/* MMU off or HCR_EL2.DC == 1 */
860 		par  = SYS_PAR_EL1_NSE;
861 		par |= wr->pa & SYS_PAR_EL1_PA;
862 
863 		if (wi->regime == TR_EL10 && vcpu_has_nv(vcpu) &&
864 		    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
865 			par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
866 					  MEMATTR(WbRaWa, WbRaWa));
867 			par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH);
868 		} else {
869 			par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
870 			par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH);
871 		}
872 	} else {
873 		u64 mair, sctlr;
874 		u8 sh;
875 
876 		par  = SYS_PAR_EL1_NSE;
877 
878 		mair = (wi->regime == TR_EL10 ?
879 			vcpu_read_sys_reg(vcpu, MAIR_EL1) :
880 			vcpu_read_sys_reg(vcpu, MAIR_EL2));
881 
882 		mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
883 		mair &= 0xff;
884 
885 		sctlr = (wi->regime == TR_EL10 ?
886 			 vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
887 			 vcpu_read_sys_reg(vcpu, SCTLR_EL2));
888 
889 		/* Force NC for memory if SCTLR_ELx.C is clear */
890 		if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
891 			mair = MEMATTR(NC, NC);
892 
893 		par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
894 		par |= wr->pa & SYS_PAR_EL1_PA;
895 
896 		sh = compute_s1_sh(wi, wr, mair);
897 		par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
898 	}
899 
900 	return par;
901 }
902 
903 static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
904 {
905 	u64 sctlr;
906 
907 	if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
908 		return false;
909 
910 	if (s1pie_enabled(vcpu, regime))
911 		return true;
912 
913 	if (regime == TR_EL10)
914 		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
915 	else
916 		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
917 
918 	return sctlr & SCTLR_EL1_EPAN;
919 }
920 
921 static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu,
922 					  struct s1_walk_info *wi,
923 					  struct s1_walk_result *wr)
924 {
925 	bool wxn;
926 
927 	/* Non-hierarchical part of AArch64.S1DirectBasePermissions() */
928 	if (wi->regime != TR_EL2) {
929 		switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) {
930 		case 0b00:
931 			wr->pr = wr->pw = true;
932 			wr->ur = wr->uw = false;
933 			break;
934 		case 0b01:
935 			wr->pr = wr->pw = wr->ur = wr->uw = true;
936 			break;
937 		case 0b10:
938 			wr->pr = true;
939 			wr->pw = wr->ur = wr->uw = false;
940 			break;
941 		case 0b11:
942 			wr->pr = wr->ur = true;
943 			wr->pw = wr->uw = false;
944 			break;
945 		}
946 
947 		/* We don't use px for anything yet, but hey... */
948 		wr->px = !((wr->desc & PTE_PXN) || wr->uw);
949 		wr->ux = !(wr->desc & PTE_UXN);
950 	} else {
951 		wr->ur = wr->uw = wr->ux = false;
952 
953 		if (!(wr->desc & PTE_RDONLY)) {
954 			wr->pr = wr->pw = true;
955 		} else {
956 			wr->pr = true;
957 			wr->pw = false;
958 		}
959 
960 		/* XN maps to UXN */
961 		wr->px = !(wr->desc & PTE_UXN);
962 	}
963 
964 	switch (wi->regime) {
965 	case TR_EL2:
966 	case TR_EL20:
967 		wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN);
968 		break;
969 	case TR_EL10:
970 		wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN);
971 		break;
972 	}
973 
974 	wr->pwxn = wr->uwxn = wxn;
975 	wr->pov = wi->poe;
976 	wr->uov = wi->e0poe;
977 }
978 
979 static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu,
980 						struct s1_walk_info *wi,
981 						struct s1_walk_result *wr)
982 {
983 	/* Hierarchical part of AArch64.S1DirectBasePermissions() */
984 	if (wi->regime != TR_EL2) {
985 		switch (wr->APTable) {
986 		case 0b00:
987 			break;
988 		case 0b01:
989 			wr->ur = wr->uw = false;
990 			break;
991 		case 0b10:
992 			wr->pw = wr->uw = false;
993 			break;
994 		case 0b11:
995 			wr->pw = wr->ur = wr->uw = false;
996 			break;
997 		}
998 
999 		wr->px &= !wr->PXNTable;
1000 		wr->ux &= !wr->UXNTable;
1001 	} else {
1002 		if (wr->APTable & BIT(1))
1003 			wr->pw = false;
1004 
1005 		/* XN maps to UXN */
1006 		wr->px &= !wr->UXNTable;
1007 	}
1008 }
1009 
1010 #define perm_idx(v, r, i)	((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf)
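
/*
 * Editor's note: perm_idx() extracts the 4-bit permission field for a
 * given permission index, e.g. index 5 of PIR_EL1 lives in bits [23:20]
 * of that register.
 */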
1011 
1012 #define set_priv_perms(wr, r, w, x)	\
1013 	do {				\
1014 		(wr)->pr = (r);		\
1015 		(wr)->pw = (w);		\
1016 		(wr)->px = (x);		\
1017 	} while (0)
1018 
1019 #define set_unpriv_perms(wr, r, w, x)	\
1020 	do {				\
1021 		(wr)->ur = (r);		\
1022 		(wr)->uw = (w);		\
1023 		(wr)->ux = (x);		\
1024 	} while (0)
1025 
1026 #define set_priv_wxn(wr, v)		\
1027 	do {				\
1028 		(wr)->pwxn = (v);	\
1029 	} while (0)
1030 
1031 #define set_unpriv_wxn(wr, v)		\
1032 	do {				\
1033 		(wr)->uwxn = (v);	\
1034 	} while (0)
1035 
1036 /* Similar to AArch64.S1IndirectBasePermissions(), without GCS  */
1037 #define set_perms(w, wr, ip)						\
1038 	do {								\
1039 		/* R_LLZDZ */						\
1040 		switch ((ip)) {						\
1041 		case 0b0000:						\
1042 			set_ ## w ## _perms((wr), false, false, false);	\
1043 			break;						\
1044 		case 0b0001:						\
1045 			set_ ## w ## _perms((wr), true , false, false);	\
1046 			break;						\
1047 		case 0b0010:						\
1048 			set_ ## w ## _perms((wr), false, false, true );	\
1049 			break;						\
1050 		case 0b0011:						\
1051 			set_ ## w ## _perms((wr), true , false, true );	\
1052 			break;						\
1053 		case 0b0100:						\
1054 			set_ ## w ## _perms((wr), false, false, false);	\
1055 			break;						\
1056 		case 0b0101:						\
1057 			set_ ## w ## _perms((wr), true , true , false);	\
1058 			break;						\
1059 		case 0b0110:						\
1060 			set_ ## w ## _perms((wr), true , true , true );	\
1061 			break;						\
1062 		case 0b0111:						\
1063 			set_ ## w ## _perms((wr), true , true , true );	\
1064 			break;						\
1065 		case 0b1000:						\
1066 			set_ ## w ## _perms((wr), true , false, false);	\
1067 			break;						\
1068 		case 0b1001:						\
1069 			set_ ## w ## _perms((wr), true , false, false);	\
1070 			break;						\
1071 		case 0b1010:						\
1072 			set_ ## w ## _perms((wr), true , false, true );	\
1073 			break;						\
1074 		case 0b1011:						\
1075 			set_ ## w ## _perms((wr), false, false, false);	\
1076 			break;						\
1077 		case 0b1100:						\
1078 			set_ ## w ## _perms((wr), true , true , false);	\
1079 			break;						\
1080 		case 0b1101:						\
1081 			set_ ## w ## _perms((wr), false, false, false);	\
1082 			break;						\
1083 		case 0b1110:						\
1084 			set_ ## w ## _perms((wr), true , true , true );	\
1085 			break;						\
1086 		case 0b1111:						\
1087 			set_ ## w ## _perms((wr), false, false, false);	\
1088 			break;						\
1089 		}							\
1090 									\
1091 		/* R_HJYGR */						\
1092 		set_ ## w ## _wxn((wr), ((ip) == 0b0110));		\
1093 									\
1094 	} while (0)
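
/*
 * Example (editor's addition): an indirect permission value of 0b0101
 * decodes to read+write, 0b0011 to read+execute, and 0b0110 to
 * read+write+execute with the extra WXN behaviour required by R_HJYGR.
 */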
1095 
1096 static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu,
1097 					    struct s1_walk_info *wi,
1098 					    struct s1_walk_result *wr)
1099 {
1100 	u8 up, pp, idx;
1101 
1102 	idx = pte_pi_index(wr->desc);
1103 
1104 	switch (wi->regime) {
1105 	case TR_EL10:
1106 		pp = perm_idx(vcpu, PIR_EL1, idx);
1107 		up = perm_idx(vcpu, PIRE0_EL1, idx);
1108 		break;
1109 	case TR_EL20:
1110 		pp = perm_idx(vcpu, PIR_EL2, idx);
1111 		up = perm_idx(vcpu, PIRE0_EL2, idx);
1112 		break;
1113 	case TR_EL2:
1114 		pp = perm_idx(vcpu, PIR_EL2, idx);
1115 		up = 0;
1116 		break;
1117 	}
1118 
1119 	set_perms(priv, wr, pp);
1120 
1121 	if (wi->regime != TR_EL2)
1122 		set_perms(unpriv, wr, up);
1123 	else
1124 		set_unpriv_perms(wr, false, false, false);
1125 
1126 	wr->pov = wi->poe && !(pp & BIT(3));
1127 	wr->uov = wi->e0poe && !(up & BIT(3));
1128 
1129 	/* R_VFPJF */
1130 	if (wr->px && wr->uw) {
1131 		set_priv_perms(wr, false, false, false);
1132 		set_unpriv_perms(wr, false, false, false);
1133 	}
1134 }
1135 
1136 static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu,
1137 					   struct s1_walk_info *wi,
1138 					   struct s1_walk_result *wr)
1139 {
1140 	u8 idx, pov_perms, uov_perms;
1141 
1142 	idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc);
1143 
1144 	if (wr->pov) {
1145 		switch (wi->regime) {
1146 		case TR_EL10:
1147 			pov_perms = perm_idx(vcpu, POR_EL1, idx);
1148 			break;
1149 		case TR_EL20:
1150 			pov_perms = perm_idx(vcpu, POR_EL2, idx);
1151 			break;
1152 		case TR_EL2:
1153 			pov_perms = perm_idx(vcpu, POR_EL2, idx);
1154 			break;
1155 		}
1156 
1157 		if (pov_perms & ~POE_RWX)
1158 			pov_perms = POE_NONE;
1159 
1160 		/* R_QXXPC, S1PrivOverlay enabled */
1161 		if (wr->pwxn && (pov_perms & POE_X))
1162 			pov_perms &= ~POE_W;
1163 
1164 		wr->pr &= pov_perms & POE_R;
1165 		wr->pw &= pov_perms & POE_W;
1166 		wr->px &= pov_perms & POE_X;
1167 	}
1168 
1169 	if (wr->uov) {
1170 		switch (wi->regime) {
1171 		case TR_EL10:
1172 			uov_perms = perm_idx(vcpu, POR_EL0, idx);
1173 			break;
1174 		case TR_EL20:
1175 			uov_perms = perm_idx(vcpu, POR_EL0, idx);
1176 			break;
1177 		case TR_EL2:
1178 			uov_perms = 0;
1179 			break;
1180 		}
1181 
1182 		if (uov_perms & ~POE_RWX)
1183 			uov_perms = POE_NONE;
1184 
1185 		/* R_NPBXC, S1UnprivOverlay enabled */
1186 		if (wr->uwxn && (uov_perms & POE_X))
1187 			uov_perms &= ~POE_W;
1188 
1189 		wr->ur &= uov_perms & POE_R;
1190 		wr->uw &= uov_perms & POE_W;
1191 		wr->ux &= uov_perms & POE_X;
1192 	}
1193 }
1194 
1195 static void compute_s1_permissions(struct kvm_vcpu *vcpu,
1196 				   struct s1_walk_info *wi,
1197 				   struct s1_walk_result *wr)
1198 {
1199 	bool pan;
1200 
1201 	if (!s1pie_enabled(vcpu, wi->regime))
1202 		compute_s1_direct_permissions(vcpu, wi, wr);
1203 	else
1204 		compute_s1_indirect_permissions(vcpu, wi, wr);
1205 
1206 	if (!wi->hpd)
1207 		compute_s1_hierarchical_permissions(vcpu, wi, wr);
1208 
1209 	compute_s1_overlay_permissions(vcpu, wi, wr);
1210 
1211 	/* R_QXXPC, S1PrivOverlay disabled */
1212 	if (!wr->pov)
1213 		wr->px &= !(wr->pwxn && wr->pw);
1214 
1215 	/* R_NPBXC, S1UnprivOverlay disabled */
1216 	if (!wr->uov)
1217 		wr->ux &= !(wr->uwxn && wr->uw);
1218 
1219 	pan = wi->pan && (wr->ur || wr->uw ||
1220 			  (pan3_enabled(vcpu, wi->regime) && wr->ux));
1221 	wr->pw &= !pan;
1222 	wr->pr &= !pan;
1223 }
1224 
1225 static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
1226 {
1227 	struct s1_walk_result wr = {};
1228 	struct s1_walk_info wi = {};
1229 	bool perm_fail = false;
1230 	int ret, idx;
1231 
1232 	wi.regime = compute_translation_regime(vcpu, op);
1233 	wi.as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
1234 	wi.pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) &&
1235 		 (*vcpu_cpsr(vcpu) & PSR_PAN_BIT);
1236 
1237 	ret = setup_s1_walk(vcpu, &wi, &wr, vaddr);
1238 	if (ret)
1239 		goto compute_par;
1240 
1241 	if (wr.level == S1_MMU_DISABLED)
1242 		goto compute_par;
1243 
1244 	idx = srcu_read_lock(&vcpu->kvm->srcu);
1245 
1246 	ret = walk_s1(vcpu, &wi, &wr, vaddr);
1247 
1248 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
1249 
1250 	if (ret)
1251 		goto compute_par;
1252 
1253 	compute_s1_permissions(vcpu, &wi, &wr);
1254 
1255 	switch (op) {
1256 	case OP_AT_S1E1RP:
1257 	case OP_AT_S1E1R:
1258 	case OP_AT_S1E2R:
1259 		perm_fail = !wr.pr;
1260 		break;
1261 	case OP_AT_S1E1WP:
1262 	case OP_AT_S1E1W:
1263 	case OP_AT_S1E2W:
1264 		perm_fail = !wr.pw;
1265 		break;
1266 	case OP_AT_S1E0R:
1267 		perm_fail = !wr.ur;
1268 		break;
1269 	case OP_AT_S1E0W:
1270 		perm_fail = !wr.uw;
1271 		break;
1272 	case OP_AT_S1E1A:
1273 	case OP_AT_S1E2A:
1274 		break;
1275 	default:
1276 		BUG();
1277 	}
1278 
1279 	if (perm_fail)
1280 		fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false);
1281 
1282 compute_par:
1283 	return compute_par_s1(vcpu, &wi, &wr);
1284 }
1285 
1286 /*
1287  * Return the PAR_EL1 value as the result of a valid translation.
1288  *
1289  * If the translation is unsuccessful, the value may only contain
1290  * PAR_EL1.F, and cannot be taken at face value. It isn't an
1291  * indication of the translation having failed, only that the fast
1292  * path did not succeed, *unless* it indicates a S1 permission or
1293  * path did not succeed, *unless* it indicates an S1 permission or
1294  */
1295 static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
1296 {
1297 	struct mmu_config config;
1298 	struct kvm_s2_mmu *mmu;
1299 	bool fail, mmu_cs;
1300 	u64 par;
1301 
1302 	par = SYS_PAR_EL1_F;
1303 
1304 	/*
1305 	 * We've trapped, so everything is live on the CPU. As we will
1306 	 * be switching contexts behind everybody's back, disable
1307 	 * interrupts while holding the mmu lock.
1308 	 */
1309 	guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock);
1310 
1311 	/*
1312 	 * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already
1313 	 * the right one (as we trapped from vEL2). If not, save the
1314 	 * full MMU context.
1315 	 *
1316 	 * We are also guaranteed to be in the correct context if
1317 	 * we're not in a nested VM.
1318 	 */
1319 	mmu_cs = (vcpu_has_nv(vcpu) &&
1320 		  !(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)));
1321 	if (!mmu_cs)
1322 		goto skip_mmu_switch;
1323 
1324 	/*
1325 	 * Obtaining the S2 MMU for an L2 is horribly racy, and we may not
1326 	 * find it (recycled by another vcpu, for example). When this
1327 	 * happens, admit defeat immediately and use the SW (slow) path.
1328 	 */
1329 	mmu = lookup_s2_mmu(vcpu);
1330 	if (!mmu)
1331 		return par;
1332 
1333 	__mmu_config_save(&config);
1334 
1335 	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1),	SYS_TTBR0);
1336 	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1),	SYS_TTBR1);
1337 	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1),	SYS_TCR);
1338 	write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1),	SYS_MAIR);
1339 	if (kvm_has_tcr2(vcpu->kvm)) {
1340 		write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2);
1341 		if (kvm_has_s1pie(vcpu->kvm)) {
1342 			write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR);
1343 			write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0);
1344 		}
1345 		if (kvm_has_s1poe(vcpu->kvm)) {
1346 			write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR);
1347 			write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0);
1348 		}
1349 	}
1350 	write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1),	SYS_SCTLR);
1351 	__load_stage2(mmu, mmu->arch);
1352 
1353 skip_mmu_switch:
1354 	/* Temporarily switch back to guest context */
1355 	write_sysreg_hcr(vcpu->arch.hcr_el2);
1356 	isb();
1357 
1358 	switch (op) {
1359 	case OP_AT_S1E1RP:
1360 	case OP_AT_S1E1WP:
1361 		fail = at_s1e1p_fast(vcpu, op, vaddr);
1362 		break;
1363 	case OP_AT_S1E1R:
1364 		fail = __kvm_at(OP_AT_S1E1R, vaddr);
1365 		break;
1366 	case OP_AT_S1E1W:
1367 		fail = __kvm_at(OP_AT_S1E1W, vaddr);
1368 		break;
1369 	case OP_AT_S1E0R:
1370 		fail = __kvm_at(OP_AT_S1E0R, vaddr);
1371 		break;
1372 	case OP_AT_S1E0W:
1373 		fail = __kvm_at(OP_AT_S1E0W, vaddr);
1374 		break;
1375 	case OP_AT_S1E1A:
1376 		fail = __kvm_at(OP_AT_S1E1A, vaddr);
1377 		break;
1378 	default:
1379 		WARN_ON_ONCE(1);
1380 		fail = true;
1381 		break;
1382 	}
1383 
1384 	if (!fail)
1385 		par = read_sysreg_par();
1386 
1387 	write_sysreg_hcr(HCR_HOST_VHE_FLAGS);
1388 
1389 	if (mmu_cs)
1390 		__mmu_config_restore(&config);
1391 
1392 	return par;
1393 }
1394 
1395 static bool par_check_s1_perm_fault(u64 par)
1396 {
1397 	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
1398 
1399 	return  ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
1400 		 !(par & SYS_PAR_EL1_S));
1401 }
1402 
1403 static bool par_check_s1_access_fault(u64 par)
1404 {
1405 	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
1406 
1407 	return  ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS &&
1408 		 !(par & SYS_PAR_EL1_S));
1409 }
1410 
1411 void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
1412 {
1413 	u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
1414 
1415 	/*
1416 	 * If PAR_EL1 reports that AT failed on a S1 permission or access
1417 	 * fault, we know for sure that the PTW was able to walk the S1
1418 	 * tables and there's nothing else to do.
1419 	 *
1420 	 * If AT failed for any other reason, then we must walk the guest S1
1421 	 * to emulate the instruction.
1422 	 */
1423 	if ((par & SYS_PAR_EL1_F) &&
1424 	    !par_check_s1_perm_fault(par) &&
1425 	    !par_check_s1_access_fault(par))
1426 		par = handle_at_slow(vcpu, op, vaddr);
1427 
1428 	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
1429 }
1430 
1431 void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
1432 {
1433 	u64 par;
1434 
1435 	/*
1436 	 * We've trapped, so everything is live on the CPU. As we will be
1437 	 * switching context behind everybody's back, disable interrupts...
1438 	 */
1439 	scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) {
1440 		u64 val, hcr;
1441 		bool fail;
1442 
1443 		val = hcr = read_sysreg(hcr_el2);
1444 		val &= ~HCR_TGE;
1445 		val |= HCR_VM;
1446 
1447 		if (!vcpu_el2_e2h_is_set(vcpu))
1448 			val |= HCR_NV | HCR_NV1;
1449 
1450 		write_sysreg_hcr(val);
1451 		isb();
1452 
1453 		par = SYS_PAR_EL1_F;
1454 
1455 		switch (op) {
1456 		case OP_AT_S1E2R:
1457 			fail = __kvm_at(OP_AT_S1E1R, vaddr);
1458 			break;
1459 		case OP_AT_S1E2W:
1460 			fail = __kvm_at(OP_AT_S1E1W, vaddr);
1461 			break;
1462 		case OP_AT_S1E2A:
1463 			fail = __kvm_at(OP_AT_S1E1A, vaddr);
1464 			break;
1465 		default:
1466 			WARN_ON_ONCE(1);
1467 			fail = true;
1468 		}
1469 
1470 		isb();
1471 
1472 		if (!fail)
1473 			par = read_sysreg_par();
1474 
1475 		write_sysreg_hcr(hcr);
1476 		isb();
1477 	}
1478 
1479 	/* We failed the translation, let's replay it in slow motion */
1480 	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
1481 		par = handle_at_slow(vcpu, op, vaddr);
1482 
1483 	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
1484 }
1485 
1486 void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
1487 {
1488 	struct kvm_s2_trans out = {};
1489 	u64 ipa, par;
1490 	bool write;
1491 	int ret;
1492 
1493 	/* Do the stage-1 translation */
1494 	switch (op) {
1495 	case OP_AT_S12E1R:
1496 		op = OP_AT_S1E1R;
1497 		write = false;
1498 		break;
1499 	case OP_AT_S12E1W:
1500 		op = OP_AT_S1E1W;
1501 		write = true;
1502 		break;
1503 	case OP_AT_S12E0R:
1504 		op = OP_AT_S1E0R;
1505 		write = false;
1506 		break;
1507 	case OP_AT_S12E0W:
1508 		op = OP_AT_S1E0W;
1509 		write = true;
1510 		break;
1511 	default:
1512 		WARN_ON_ONCE(1);
1513 		return;
1514 	}
1515 
1516 	__kvm_at_s1e01(vcpu, op, vaddr);
1517 	par = vcpu_read_sys_reg(vcpu, PAR_EL1);
1518 	if (par & SYS_PAR_EL1_F)
1519 		return;
1520 
1521 	/*
1522 	 * If we only have a single stage of translation (EL2&0), exit
1523 	 * early. Same thing if {VM,DC}=={0,0}.
1524 	 */
1525 	if (compute_translation_regime(vcpu, op) == TR_EL20 ||
1526 	    !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
1527 		return;
1528 
1529 	/* Do the stage-2 translation */
1530 	ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
1531 	out.esr = 0;
1532 	ret = kvm_walk_nested_s2(vcpu, ipa, &out);
1533 	if (ret < 0)
1534 		return;
1535 
1536 	/* Check the access permission */
1537 	if (!out.esr &&
1538 	    ((!write && !out.readable) || (write && !out.writable)))
1539 		out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);
1540 
1541 	par = compute_par_s12(vcpu, par, &out);
1542 	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
1543 }
1544 
1545 /*
1546  * Translate a VA for a given EL in a given translation regime, with
1547  * or without PAN. This requires wi->{regime, as_el0, pan} to be
1548  * set. The rest of the wi and wr should be 0-initialised.
1549  */
1550 int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
1551 		       struct s1_walk_result *wr, u64 va)
1552 {
1553 	int ret;
1554 
1555 	ret = setup_s1_walk(vcpu, wi, wr, va);
1556 	if (ret)
1557 		return ret;
1558 
1559 	if (wr->level == S1_MMU_DISABLED) {
1560 		wr->ur = wr->uw = wr->ux = true;
1561 		wr->pr = wr->pw = wr->px = true;
1562 	} else {
1563 		ret = walk_s1(vcpu, wi, wr, va);
1564 		if (ret)
1565 			return ret;
1566 
1567 		compute_s1_permissions(vcpu, wi, wr);
1568 	}
1569 
1570 	return 0;
1571 }
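
/*
 * Editor's sketch of a hypothetical caller (illustrative only, not part
 * of the original file):
 *
 *	struct s1_walk_info wi = { .regime = TR_EL10 };
 *	struct s1_walk_result wr = {};
 *
 *	if (!__kvm_translate_va(vcpu, &wi, &wr, va) && wr.pr)
 *		use(wr.pa);	// wr.pa is the output address, wr.level the leaf level
 */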
1572 
1573 struct desc_match {
1574 	u64	ipa;
1575 	int	level;
1576 };
1577 
1578 static int match_s1_desc(struct s1_walk_context *ctxt, void *priv)
1579 {
1580 	struct desc_match *dm = priv;
1581 	u64 ipa = dm->ipa;
1582 
1583 	/* Use S1 granule alignment */
1584 	ipa &= GENMASK(51, ctxt->wi->pgshift);
1585 
1586 	/* Not the IPA we're looking for? Continue. */
1587 	if (ipa != ctxt->table_ipa)
1588 		return 0;
1589 
1590 	/* Note the level and interrupt the walk */
1591 	dm->level = ctxt->level;
1592 	return -EINTR;
1593 }
1594 
1595 int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level)
1596 {
1597 	struct desc_match dm = {
1598 		.ipa	= ipa,
1599 	};
1600 	struct s1_walk_info wi = {
1601 		.filter	= &(struct s1_walk_filter){
1602 			.fn	= match_s1_desc,
1603 			.priv	= &dm,
1604 		},
1605 		.regime	= TR_EL10,
1606 		.as_el0	= false,
1607 		.pan	= false,
1608 	};
1609 	struct s1_walk_result wr = {};
1610 	int ret;
1611 
1612 	ret = setup_s1_walk(vcpu, &wi, &wr, va);
1613 	if (ret)
1614 		return ret;
1615 
1616 	/* We really expect the S1 MMU to be on here... */
1617 	if (WARN_ON_ONCE(wr.level == S1_MMU_DISABLED)) {
1618 		*level = 0;
1619 		return 0;
1620 	}
1621 
1622 	/* Walk the guest's PT, looking for a match along the way */
1623 	ret = walk_s1(vcpu, &wi, &wr, va);
1624 	switch (ret) {
1625 	case -EINTR:
1626 		/* We interrupted the walk on a match, return the level */
1627 		*level = dm.level;
1628 		return 0;
1629 	case 0:
1630 		/* The walk completed, we failed to find the entry */
1631 		return -ENOENT;
1632 	default:
1633 		/* Any other error... */
1634 		return ret;
1635 	}
1636 }
1637