xref: /linux/arch/arm64/kvm/at.c (revision 55d0969c451159cff86949b38c39171cab962069)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2017 - Linaro Ltd
4  * Author: Jintack Lim <jintack.lim@linaro.org>
5  */
6 
7 #include <linux/kvm_host.h>
8 
9 #include <asm/esr.h>
10 #include <asm/kvm_hyp.h>
11 #include <asm/kvm_mmu.h>
12 
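/*
 * The stage-1 translation regimes an AT instruction can target from
 * here: EL1&0 (possibly subject to stage-2), EL2&0 (HCR_EL2.E2H set),
 * or plain EL2.
 */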
13 enum trans_regime {
14 	TR_EL10,
15 	TR_EL20,
16 	TR_EL2,
17 };
18 
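/*
 * Parameters gathered by setup_s1_walk() before walking the tables:
 * table base address, translation regime, output address limit,
 * granule shift, T[01]SZ, start level, hierarchical permission
 * disable, table endianness, and whether the walk itself is subject
 * to stage-2 translation.
 */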
19 struct s1_walk_info {
20 	u64			baddr;
21 	enum trans_regime	regime;
22 	unsigned int		max_oa_bits;
23 	unsigned int		pgshift;
24 	unsigned int		txsz;
25 	int			sl;
26 	bool			hpd;
27 	bool			be;
28 	bool			s2;
29 };
30 
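/*
 * Outcome of a stage-1 walk: on success, the final descriptor, output
 * address, level, and the accumulated table-level permissions; on
 * failure, the fault status code plus whether the fault hit the walk
 * itself and/or stage-2.
 */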
31 struct s1_walk_result {
32 	union {
33 		struct {
34 			u64	desc;
35 			u64	pa;
36 			s8	level;
37 			u8	APTable;
38 			bool	UXNTable;
39 			bool	PXNTable;
40 		};
41 		struct {
42 			u8	fst;
43 			bool	ptw;
44 			bool	s2;
45 		};
46 	};
47 	bool	failed;
48 };
49 
50 static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
51 {
52 	wr->fst		= fst;
53 	wr->ptw		= ptw;
54 	wr->s2		= s2;
55 	wr->failed	= true;
56 }
57 
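/*
 * Sentinel "level" in s1_walk_result for when stage-1 translation is
 * disabled for the regime, i.e. the VA maps flat to the PA.
 */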
58 #define S1_MMU_DISABLED		(-127)
59 
60 static int get_ia_size(struct s1_walk_info *wi)
61 {
62 	return 64 - wi->txsz;
63 }
64 
65 /* Return true if the IPA is out of the OA range */
66 static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
67 {
68 	return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
69 }
70 
71 /* Return the translation regime that applies to an AT instruction */
72 static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
73 {
74 	/*
75 	 * We only get here from guest EL2, so the translation
76 	 * regime AT applies to is solely defined by {E2H,TGE}.
77 	 */
78 	switch (op) {
79 	case OP_AT_S1E2R:
80 	case OP_AT_S1E2W:
81 	case OP_AT_S1E2A:
82 		return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
83 		break;
84 	default:
85 		return (vcpu_el2_e2h_is_set(vcpu) &&
86 			vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
87 	}
88 }
89 
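/*
 * Decode the SCTLR/TCR/TTBR state of the relevant regime and fill in
 * the walk parameters: validate the VA against TBI and T[01]SZ, pick
 * the granule, and compute the start level and the (size-checked)
 * table base address. Any inconsistency is reported as a level-0
 * Address Size or Translation fault in @wr.
 */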
90 static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
91 			 struct s1_walk_result *wr, u64 va)
92 {
93 	u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr;
94 	unsigned int stride, x;
95 	bool va55, tbi, lva, as_el0;
96 
97 	hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
98 
99 	wi->regime = compute_translation_regime(vcpu, op);
100 	as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
101 
102 	va55 = va & BIT(55);
103 
104 	if (wi->regime == TR_EL2 && va55)
105 		goto addrsz;
106 
107 	wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));
108 
109 	switch (wi->regime) {
110 	case TR_EL10:
111 		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL1);
112 		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL1);
113 		ttbr	= (va55 ?
114 			   vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
115 			   vcpu_read_sys_reg(vcpu, TTBR0_EL1));
116 		break;
117 	case TR_EL2:
118 	case TR_EL20:
119 		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL2);
120 		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL2);
121 		ttbr	= (va55 ?
122 			   vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
123 			   vcpu_read_sys_reg(vcpu, TTBR0_EL2));
124 		break;
125 	default:
126 		BUG();
127 	}
128 
129 	tbi = (wi->regime == TR_EL2 ?
130 	       FIELD_GET(TCR_EL2_TBI, tcr) :
131 	       (va55 ?
132 		FIELD_GET(TCR_TBI1, tcr) :
133 		FIELD_GET(TCR_TBI0, tcr)));
134 
135 	if (!tbi && (u64)sign_extend64(va, 55) != va)
136 		goto addrsz;
137 
138 	va = (u64)sign_extend64(va, 55);
139 
140 	/* Let's put the MMU disabled case aside immediately */
141 	switch (wi->regime) {
142 	case TR_EL10:
143 		/*
144 		 * If dealing with the EL1&0 translation regime, 3 things
145 		 * can disable the S1 translation:
146 		 *
147 		 * - HCR_EL2.DC = 1
148 		 * - HCR_EL2.{E2H,TGE} = {0,1}
149 		 * - SCTLR_EL1.M = 0
150 		 *
151 		 * The TGE part is interesting. If we have decided that this
152 		 * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
153 		 * {0,x}, and we only need to test for TGE == 1.
154 		 */
155 		if (hcr & (HCR_DC | HCR_TGE)) {
156 			wr->level = S1_MMU_DISABLED;
157 			break;
158 		}
159 		fallthrough;
160 	case TR_EL2:
161 	case TR_EL20:
162 		if (!(sctlr & SCTLR_ELx_M))
163 			wr->level = S1_MMU_DISABLED;
164 		break;
165 	}
166 
167 	if (wr->level == S1_MMU_DISABLED) {
168 		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
169 			goto addrsz;
170 
171 		wr->pa = va;
172 		return 0;
173 	}
174 
175 	wi->be = sctlr & SCTLR_ELx_EE;
176 
177 	wi->hpd  = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
178 	wi->hpd &= (wi->regime == TR_EL2 ?
179 		    FIELD_GET(TCR_EL2_HPD, tcr) :
180 		    (va55 ?
181 		     FIELD_GET(TCR_HPD1, tcr) :
182 		     FIELD_GET(TCR_HPD0, tcr)));
183 
184 	/* Someone was silly enough to encode TG0/TG1 differently */
185 	if (va55) {
186 		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
187 		tg = FIELD_GET(TCR_TG1_MASK, tcr);
188 
189 		switch (tg << TCR_TG1_SHIFT) {
190 		case TCR_TG1_4K:
191 			wi->pgshift = 12;	 break;
192 		case TCR_TG1_16K:
193 			wi->pgshift = 14;	 break;
194 		case TCR_TG1_64K:
195 		default:	    /* IMPDEF: treat any other value as 64k */
196 			wi->pgshift = 16;	 break;
197 		}
198 	} else {
199 		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
200 		tg = FIELD_GET(TCR_TG0_MASK, tcr);
201 
202 		switch (tg << TCR_TG0_SHIFT) {
203 		case TCR_TG0_4K:
204 			wi->pgshift = 12;	 break;
205 		case TCR_TG0_16K:
206 			wi->pgshift = 14;	 break;
207 		case TCR_TG0_64K:
208 		default:	    /* IMPDEF: treat any other value as 64k */
209 			wi->pgshift = 16;	 break;
210 		}
211 	}
212 
213 	/* R_PLCGL, R_YXNYW */
214 	if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
215 		if (wi->txsz > 39)
216 			goto transfault_l0;
217 	} else {
218 		if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47))
219 			goto transfault_l0;
220 	}
221 
222 	/* R_GTJBY, R_SXWGM */
223 	switch (BIT(wi->pgshift)) {
224 	case SZ_4K:
225 		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT);
226 		lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
227 		break;
228 	case SZ_16K:
229 		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT);
230 		lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
231 		break;
232 	case SZ_64K:
233 		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
234 		break;
235 	}
236 
237 	if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
238 		goto transfault_l0;
239 
240 	ia_bits = get_ia_size(wi);
241 
242 	/* R_YYVYV, I_THCZK */
243 	if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
244 	    (va55 && va < GENMASK(63, ia_bits)))
245 		goto transfault_l0;
246 
247 	/* I_ZFSYQ */
248 	if (wi->regime != TR_EL2 &&
249 	    (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
250 		goto transfault_l0;
251 
252 	/* R_BNDVG and following statements */
253 	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
254 	    as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0)))
255 		goto transfault_l0;
256 
257 	/* AArch64.S1StartLevel() */
258 	stride = wi->pgshift - 3;
259 	wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
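	/*
	 * e.g. a 4kB granule (pgshift = 12, stride = 9) with a 48-bit
	 * IA space results in sl = 3 - ((47 - 12) / 9) = 0.
	 */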
260 
261 	ps = (wi->regime == TR_EL2 ?
262 	      FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
263 
264 	wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps));
265 
266 	/* Compute minimal alignment */
267 	x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
268 
269 	wi->baddr = ttbr & TTBRx_EL1_BADDR;
270 
271 	/* R_VPBBF */
272 	if (check_output_size(wi->baddr, wi))
273 		goto addrsz;
274 
275 	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
276 
277 	return 0;
278 
279 addrsz:				/* Address Size Fault level 0 */
280 	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false);
281 	return -EFAULT;
282 
283 transfault_l0:			/* Translation Fault level 0 */
284 	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false);
285 	return -EFAULT;
286 }
287 
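/*
 * Software page table walk: one descriptor read per level, each table
 * address being run through the guest's stage-2 first when the EL1&0
 * regime is subject to it. Faults are recorded in @wr as well as
 * reported through the return value.
 */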
288 static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
289 		   struct s1_walk_result *wr, u64 va)
290 {
291 	u64 va_top, va_bottom, baddr, desc;
292 	int level, stride, ret;
293 
294 	level = wi->sl;
295 	stride = wi->pgshift - 3;
296 	baddr = wi->baddr;
297 
298 	va_top = get_ia_size(wi) - 1;
299 
300 	while (1) {
301 		u64 index, ipa;
302 
303 		va_bottom = (3 - level) * stride + wi->pgshift;
304 		index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
305 
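		/*
		 * "index" is a byte offset into the (naturally aligned)
		 * table, hence the OR with the table address.
		 */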
306 		ipa = baddr | index;
307 
308 		if (wi->s2) {
309 			struct kvm_s2_trans s2_trans = {};
310 
311 			ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
312 			if (ret) {
313 				fail_s1_walk(wr,
314 					     (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
315 					     true, true);
316 				return ret;
317 			}
318 
319 			if (!kvm_s2_trans_readable(&s2_trans)) {
320 				fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
321 					     true, true);
322 
323 				return -EPERM;
324 			}
325 
326 			ipa = kvm_s2_trans_output(&s2_trans);
327 		}
328 
329 		ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
330 		if (ret) {
331 			fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level),
332 				     true, false);
333 			return ret;
334 		}
335 
336 		if (wi->be)
337 			desc = be64_to_cpu((__force __be64)desc);
338 		else
339 			desc = le64_to_cpu((__force __le64)desc);
340 
341 		/* Invalid descriptor */
342 		if (!(desc & BIT(0)))
343 			goto transfault;
344 
345 		/* Block mapping, check validity down the line */
346 		if (!(desc & BIT(1)))
347 			break;
348 
349 		/* Page mapping */
350 		if (level == 3)
351 			break;
352 
353 		/* Table handling */
354 		if (!wi->hpd) {
355 			wr->APTable  |= FIELD_GET(S1_TABLE_AP, desc);
356 			wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
357 			wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
358 		}
359 
360 		baddr = desc & GENMASK_ULL(47, wi->pgshift);
361 
362 		/* Check for out-of-range OA */
363 		if (check_output_size(baddr, wi))
364 			goto addrsz;
365 
366 		/* Prepare for next round */
367 		va_top = va_bottom - 1;
368 		level++;
369 	}
370 
371 	/* Block mapping, check the validity of the level */
372 	if (!(desc & BIT(1))) {
373 		bool valid_block = false;
374 
375 		switch (BIT(wi->pgshift)) {
376 		case SZ_4K:
377 			valid_block = level == 1 || level == 2;
378 			break;
379 		case SZ_16K:
380 		case SZ_64K:
381 			valid_block = level == 2;
382 			break;
383 		}
384 
385 		if (!valid_block)
386 			goto transfault;
387 	}
388 
389 	if (check_output_size(desc & GENMASK(47, va_bottom), wi))
390 		goto addrsz;
391 
392 	va_bottom += contiguous_bit_shift(desc, wi, level);
393 
394 	wr->failed = false;
395 	wr->level = level;
396 	wr->desc = desc;
397 	wr->pa = desc & GENMASK(47, va_bottom);
398 	wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);
399 
400 	return 0;
401 
402 addrsz:
403 	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false);
404 	return -EINVAL;
405 transfault:
406 	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false);
407 	return -ENOENT;
408 }
409 
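/*
 * Host MMU state that the fast path temporarily replaces with the
 * guest's translation context, and restores afterwards.
 */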
410 struct mmu_config {
411 	u64	ttbr0;
412 	u64	ttbr1;
413 	u64	tcr;
414 	u64	mair;
415 	u64	sctlr;
416 	u64	vttbr;
417 	u64	vtcr;
418 	u64	hcr;
419 };
420 
421 static void __mmu_config_save(struct mmu_config *config)
422 {
423 	config->ttbr0	= read_sysreg_el1(SYS_TTBR0);
424 	config->ttbr1	= read_sysreg_el1(SYS_TTBR1);
425 	config->tcr	= read_sysreg_el1(SYS_TCR);
426 	config->mair	= read_sysreg_el1(SYS_MAIR);
427 	config->sctlr	= read_sysreg_el1(SYS_SCTLR);
428 	config->vttbr	= read_sysreg(vttbr_el2);
429 	config->vtcr	= read_sysreg(vtcr_el2);
430 	config->hcr	= read_sysreg(hcr_el2);
431 }
432 
433 static void __mmu_config_restore(struct mmu_config *config)
434 {
435 	write_sysreg(config->hcr,	hcr_el2);
436 
437 	/*
438 	 * ARM errata 1165522 and 1530923 require TGE to be 1 before
439 	 * we update the guest state.
440 	 */
441 	asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
442 
443 	write_sysreg_el1(config->ttbr0,	SYS_TTBR0);
444 	write_sysreg_el1(config->ttbr1,	SYS_TTBR1);
445 	write_sysreg_el1(config->tcr,	SYS_TCR);
446 	write_sysreg_el1(config->mair,	SYS_MAIR);
447 	write_sysreg_el1(config->sctlr,	SYS_SCTLR);
448 	write_sysreg(config->vttbr,	vttbr_el2);
449 	write_sysreg(config->vtcr,	vtcr_el2);
450 }
451 
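/*
 * AT S1E1RP/S1E1WP are sensitive to PSTATE.PAN, so mirror the guest's
 * PAN bit into the host PSTATE around the instruction.
 */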
452 static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
453 {
454 	u64 host_pan;
455 	bool fail;
456 
457 	host_pan = read_sysreg_s(SYS_PSTATE_PAN);
458 	write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN);
459 
460 	switch (op) {
461 	case OP_AT_S1E1RP:
462 		fail = __kvm_at(OP_AT_S1E1RP, vaddr);
463 		break;
464 	case OP_AT_S1E1WP:
465 		fail = __kvm_at(OP_AT_S1E1WP, vaddr);
466 		break;
467 	}
468 
469 	write_sysreg_s(host_pan, SYS_PSTATE_PAN);
470 
471 	return fail;
472 }
473 
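/*
 * MAIR-style attribute byte: outer attributes in bits [7:4], inner
 * attributes in bits [3:0].
 */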
474 #define MEMATTR(ic, oc)		(MEMATTR_##oc << 4 | MEMATTR_##ic)
475 #define MEMATTR_NC		0b0100
476 #define MEMATTR_Wt		0b1000
477 #define MEMATTR_Wb		0b1100
478 #define MEMATTR_WbRaWa		0b1111
479 
480 #define MEMATTR_IS_DEVICE(m)	(((m) & GENMASK(7, 4)) == 0)
481 
482 static u8 s2_memattr_to_attr(u8 memattr)
483 {
484 	memattr &= 0b1111;
485 
486 	switch (memattr) {
487 	case 0b0000:
488 	case 0b0001:
489 	case 0b0010:
490 	case 0b0011:
491 		return memattr << 2;
492 	case 0b0100:
493 		return MEMATTR(Wb, Wb);
494 	case 0b0101:
495 		return MEMATTR(NC, NC);
496 	case 0b0110:
497 		return MEMATTR(Wt, NC);
498 	case 0b0111:
499 		return MEMATTR(Wb, NC);
500 	case 0b1000:
501 		/* Reserved, assume NC */
502 		return MEMATTR(NC, NC);
503 	case 0b1001:
504 		return MEMATTR(NC, Wt);
505 	case 0b1010:
506 		return MEMATTR(Wt, Wt);
507 	case 0b1011:
508 		return MEMATTR(Wb, Wt);
509 	case 0b1100:
510 		/* Reserved, assume NC */
511 		return MEMATTR(NC, NC);
512 	case 0b1101:
513 		return MEMATTR(NC, Wb);
514 	case 0b1110:
515 		return MEMATTR(Wt, Wb);
516 	case 0b1111:
517 		return MEMATTR(Wb, Wb);
518 	default:
519 		unreachable();
520 	}
521 }
522 
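/*
 * S2CombineS1AttrHints(): the weaker cacheability wins (NC, then
 * Write-Through, then Write-Back), while the allocation hints and
 * transient-ness are inherited from stage-1.
 */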
523 static u8 combine_s1_s2_attr(u8 s1, u8 s2)
524 {
525 	bool transient;
526 	u8 final = 0;
527 
528 	/* Upgrade transient s1 to non-transient to simplify things */
529 	switch (s1) {
530 	case 0b0001 ... 0b0011:	/* Normal, Write-Through Transient */
531 		transient = true;
532 		s1 = MEMATTR_Wt | (s1 & GENMASK(1,0));
533 		break;
534 	case 0b0101 ... 0b0111:	/* Normal, Write-Back Transient */
535 		transient = true;
536 		s1 = MEMATTR_Wb | (s1 & GENMASK(1,0));
537 		break;
538 	default:
539 		transient = false;
540 	}
541 
542 	/* S2CombineS1AttrHints() */
543 	if ((s1 & GENMASK(3, 2)) == MEMATTR_NC ||
544 	    (s2 & GENMASK(3, 2)) == MEMATTR_NC)
545 		final = MEMATTR_NC;
546 	else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt ||
547 		 (s2 & GENMASK(3, 2)) == MEMATTR_Wt)
548 		final = MEMATTR_Wt;
549 	else
550 		final = MEMATTR_Wb;
551 
552 	if (final != MEMATTR_NC) {
553 		/* Inherit RaWa hints from S1 */
554 		if (transient) {
555 			switch (s1 & GENMASK(3, 2)) {
556 			case MEMATTR_Wt:
557 				final = 0;
558 				break;
559 			case MEMATTR_Wb:
560 				final = MEMATTR_NC;
561 				break;
562 			}
563 		}
564 
565 		final |= s1 & GENMASK(1, 0);
566 	}
567 
568 	return final;
569 }
570 
571 #define ATTR_NSH	0b00
572 #define ATTR_RSV	0b01
573 #define ATTR_OSH	0b10
574 #define ATTR_ISH	0b11
575 
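/*
 * Shareability as reported in PAR_EL1.SH: Device and Normal
 * Non-Cacheable memory are always treated as Outer Shareable.
 */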
576 static u8 compute_sh(u8 attr, u64 desc)
577 {
578 	u8 sh;
579 
580 	/* Any form of device, as well as NC, has SH[1:0]=0b10 */
581 	if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC))
582 		return ATTR_OSH;
583 
584 	sh = FIELD_GET(PTE_SHARED, desc);
585 	if (sh == ATTR_RSV)		/* Reserved, mapped to NSH */
586 		sh = ATTR_NSH;
587 
588 	return sh;
589 }
590 
591 static u8 combine_sh(u8 s1_sh, u8 s2_sh)
592 {
593 	if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
594 		return ATTR_OSH;
595 	if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH)
596 		return ATTR_ISH;
597 
598 	return ATTR_NSH;
599 }
600 
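/*
 * Fold the stage-2 result into the stage-1 PAR_EL1 value: report the
 * S2 fault if there is one, otherwise combine the memory attributes
 * (FWB rules or the classic S1/S2 combination), degrade Normal memory
 * to Non-Cacheable when HCR_EL2.CD is set, and merge the shareability.
 */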
601 static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
602 			   struct kvm_s2_trans *tr)
603 {
604 	u8 s1_parattr, s2_memattr, final_attr;
605 	u64 par;
606 
607 	/* If S2 has failed to translate, report the damage */
608 	if (tr->esr) {
609 		par = SYS_PAR_EL1_RES1;
610 		par |= SYS_PAR_EL1_F;
611 		par |= SYS_PAR_EL1_S;
612 		par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr);
613 		return par;
614 	}
615 
616 	s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par);
617 	s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc);
618 
619 	if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) {
620 		if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP))
621 			s2_memattr &= ~BIT(3);
622 
623 		/* Combination of R_VRJSW and R_RHWZM */
624 		switch (s2_memattr) {
625 		case 0b0101:
626 			if (MEMATTR_IS_DEVICE(s1_parattr))
627 				final_attr = s1_parattr;
628 			else
629 				final_attr = MEMATTR(NC, NC);
630 			break;
631 		case 0b0110:
632 		case 0b1110:
633 			final_attr = MEMATTR(WbRaWa, WbRaWa);
634 			break;
635 		case 0b0111:
636 		case 0b1111:
637 			/* Preserve S1 attribute */
638 			final_attr = s1_parattr;
639 			break;
640 		case 0b0100:
641 		case 0b1100:
642 		case 0b1101:
643 			/* Reserved, do something non-silly */
644 			final_attr = s1_parattr;
645 			break;
646 		default:
647 			/* MemAttr[2]=0, Device from S2 */
648 			final_attr = (s2_memattr & GENMASK(1, 0)) << 2;
649 		}
650 	} else {
651 		/* Combination of R_HMNDG, R_TNHFM and R_GQFSF */
652 		u8 s2_parattr = s2_memattr_to_attr(s2_memattr);
653 
654 		if (MEMATTR_IS_DEVICE(s1_parattr) ||
655 		    MEMATTR_IS_DEVICE(s2_parattr)) {
656 			final_attr = min(s1_parattr, s2_parattr);
657 		} else {
658 			/* At this stage, this is memory vs memory */
659 			final_attr  = combine_s1_s2_attr(s1_parattr & 0xf,
660 							 s2_parattr & 0xf);
661 			final_attr |= combine_s1_s2_attr(s1_parattr >> 4,
662 							 s2_parattr >> 4) << 4;
663 		}
664 	}
665 
666 	if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) &&
667 	    !MEMATTR_IS_DEVICE(final_attr))
668 		final_attr = MEMATTR(NC, NC);
669 
670 	par  = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
671 	par |= tr->output & GENMASK(47, 12);
672 	par |= FIELD_PREP(SYS_PAR_EL1_SH,
673 			  combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
674 				     compute_sh(final_attr, tr->desc)));
675 
676 	return par;
677 }
678 
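/*
 * Encode the stage-1 walk result as a PAR_EL1 value: a fault if the
 * walk failed, a flat mapping (Write-Back with HCR_EL2.DC set,
 * Device-nGnRnE otherwise) if the MMU is off, or the PA with
 * attributes derived from MAIR and SCTLR_ELx.C in the normal case.
 */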
679 static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr,
680 			  enum trans_regime regime)
681 {
682 	u64 par;
683 
684 	if (wr->failed) {
685 		par = SYS_PAR_EL1_RES1;
686 		par |= SYS_PAR_EL1_F;
687 		par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
688 		par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
689 		par |= wr->s2 ? SYS_PAR_EL1_S : 0;
690 	} else if (wr->level == S1_MMU_DISABLED) {
691 		/* MMU off or HCR_EL2.DC == 1 */
692 		par  = SYS_PAR_EL1_NSE;
693 		par |= wr->pa & GENMASK_ULL(47, 12);
694 
695 		if (regime == TR_EL10 &&
696 		    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
697 			par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
698 					  MEMATTR(WbRaWa, WbRaWa));
699 			par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH);
700 		} else {
701 			par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
702 			par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH);
703 		}
704 	} else {
705 		u64 mair, sctlr;
706 		u8 sh;
707 
708 		par  = SYS_PAR_EL1_NSE;
709 
710 		mair = (regime == TR_EL10 ?
711 			vcpu_read_sys_reg(vcpu, MAIR_EL1) :
712 			vcpu_read_sys_reg(vcpu, MAIR_EL2));
713 
714 		mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
715 		mair &= 0xff;
716 
717 		sctlr = (regime == TR_EL10 ?
718 			 vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
719 			 vcpu_read_sys_reg(vcpu, SCTLR_EL2));
720 
721 		/* Force NC for memory if SCTLR_ELx.C is clear */
722 		if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
723 			mair = MEMATTR(NC, NC);
724 
725 		par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
726 		par |= wr->pa & GENMASK_ULL(47, 12);
727 
728 		sh = compute_sh(mair, wr->desc);
729 		par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
730 	}
731 
732 	return par;
733 }
734 
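/*
 * FEAT_PAN3: with SCTLR_ELx.EPAN set, PAN also applies to privileged
 * accesses to pages that are executable at EL0.
 */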
735 static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
736 {
737 	u64 sctlr;
738 
739 	if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
740 		return false;
741 
742 	if (regime == TR_EL10)
743 		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
744 	else
745 		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
746 
747 	return sctlr & SCTLR_EL1_EPAN;
748 }
749 
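/*
 * Full software emulation of the AT instruction: set up and perform
 * the S1 walk, derive the resulting permissions (per
 * AArch64.S1DirectBasePermissions(), with PAN applied for the *P
 * variants), and encode the outcome as a PAR_EL1 value.
 */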
750 static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
751 {
752 	bool perm_fail, ur, uw, ux, pr, pw, px;
753 	struct s1_walk_result wr = {};
754 	struct s1_walk_info wi = {};
755 	int ret, idx;
756 
757 	ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr);
758 	if (ret)
759 		goto compute_par;
760 
761 	if (wr.level == S1_MMU_DISABLED)
762 		goto compute_par;
763 
764 	idx = srcu_read_lock(&vcpu->kvm->srcu);
765 
766 	ret = walk_s1(vcpu, &wi, &wr, vaddr);
767 
768 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
769 
770 	if (ret)
771 		goto compute_par;
772 
773 	/* FIXME: revisit when adding indirect permission support */
774 	/* AArch64.S1DirectBasePermissions() */
775 	if (wi.regime != TR_EL2) {
776 		switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr.desc)) {
777 		case 0b00:
778 			pr = pw = true;
779 			ur = uw = false;
780 			break;
781 		case 0b01:
782 			pr = pw = ur = uw = true;
783 			break;
784 		case 0b10:
785 			pr = true;
786 			pw = ur = uw = false;
787 			break;
788 		case 0b11:
789 			pr = ur = true;
790 			pw = uw = false;
791 			break;
792 		}
793 
794 		switch (wr.APTable) {
795 		case 0b00:
796 			break;
797 		case 0b01:
798 			ur = uw = false;
799 			break;
800 		case 0b10:
801 			pw = uw = false;
802 			break;
803 		case 0b11:
804 			pw = ur = uw = false;
805 			break;
806 		}
807 
808 		/* We don't use px for anything yet, but hey... */
809 		px = !((wr.desc & PTE_PXN) || wr.PXNTable || uw);
810 		ux = !((wr.desc & PTE_UXN) || wr.UXNTable);
811 
812 		if (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) {
813 			bool pan;
814 
815 			pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT;
816 			pan &= ur || uw || (pan3_enabled(vcpu, wi.regime) && ux);
817 			pw &= !pan;
818 			pr &= !pan;
819 		}
820 	} else {
821 		ur = uw = ux = false;
822 
823 		if (!(wr.desc & PTE_RDONLY)) {
824 			pr = pw = true;
825 		} else {
826 			pr = true;
827 			pw = false;
828 		}
829 
830 		if (wr.APTable & BIT(1))
831 			pw = false;
832 
833 		/* XN maps to UXN */
834 		px = !((wr.desc & PTE_UXN) || wr.UXNTable);
835 	}
836 
837 	perm_fail = false;
838 
839 	switch (op) {
840 	case OP_AT_S1E1RP:
841 	case OP_AT_S1E1R:
842 	case OP_AT_S1E2R:
843 		perm_fail = !pr;
844 		break;
845 	case OP_AT_S1E1WP:
846 	case OP_AT_S1E1W:
847 	case OP_AT_S1E2W:
848 		perm_fail = !pw;
849 		break;
850 	case OP_AT_S1E0R:
851 		perm_fail = !ur;
852 		break;
853 	case OP_AT_S1E0W:
854 		perm_fail = !uw;
855 		break;
856 	case OP_AT_S1E1A:
857 	case OP_AT_S1E2A:
858 		break;
859 	default:
860 		BUG();
861 	}
862 
863 	if (perm_fail)
864 		fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false);
865 
866 compute_par:
867 	return compute_par_s1(vcpu, &wr, wi.regime);
868 }
869 
870 /*
871  * Return the PAR_EL1 value as the result of a valid translation.
872  *
873  * If the translation is unsuccessful, the value may only contain
874  * PAR_EL1.F and cannot be taken at face value: it does not mean
875  * that the translation has failed, only that the fast path did not
876  * succeed, *unless* it reports an S1 permission fault.
877  */
878 static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
879 {
880 	struct mmu_config config;
881 	struct kvm_s2_mmu *mmu;
882 	bool fail;
883 	u64 par;
884 
885 	par = SYS_PAR_EL1_F;
886 
887 	/*
888 	 * We've trapped, so everything is live on the CPU. As we will
889 	 * be switching contexts behind everybody's back, disable
890 	 * interrupts while holding the mmu lock.
891 	 */
892 	guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock);
893 
894 	/*
895 	 * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already
896 	 * the right one (as we trapped from vEL2). If not, save the
897 	 * full MMU context.
898 	 */
899 	if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))
900 		goto skip_mmu_switch;
901 
902 	/*
903 	 * Obtaining the S2 MMU for an L2 is horribly racy, and we may not
904 	 * find it (recycled by another vcpu, for example). When this
905 	 * happens, admit defeat immediately and use the SW (slow) path.
906 	 */
907 	mmu = lookup_s2_mmu(vcpu);
908 	if (!mmu)
909 		return par;
910 
911 	__mmu_config_save(&config);
912 
913 	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1),	SYS_TTBR0);
914 	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1),	SYS_TTBR1);
915 	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1),	SYS_TCR);
916 	write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1),	SYS_MAIR);
917 	write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1),	SYS_SCTLR);
918 	__load_stage2(mmu, mmu->arch);
919 
920 skip_mmu_switch:
921 	/* Clear TGE, enable S2 translation, we're rolling */
922 	write_sysreg((read_sysreg(hcr_el2) & ~HCR_TGE) | HCR_VM, hcr_el2);
923 	isb();
924 
925 	switch (op) {
926 	case OP_AT_S1E1RP:
927 	case OP_AT_S1E1WP:
928 		fail = at_s1e1p_fast(vcpu, op, vaddr);
929 		break;
930 	case OP_AT_S1E1R:
931 		fail = __kvm_at(OP_AT_S1E1R, vaddr);
932 		break;
933 	case OP_AT_S1E1W:
934 		fail = __kvm_at(OP_AT_S1E1W, vaddr);
935 		break;
936 	case OP_AT_S1E0R:
937 		fail = __kvm_at(OP_AT_S1E0R, vaddr);
938 		break;
939 	case OP_AT_S1E0W:
940 		fail = __kvm_at(OP_AT_S1E0W, vaddr);
941 		break;
942 	case OP_AT_S1E1A:
943 		fail = __kvm_at(OP_AT_S1E1A, vaddr);
944 		break;
945 	default:
946 		WARN_ON_ONCE(1);
947 		fail = true;
948 		break;
949 	}
950 
951 	if (!fail)
952 		par = read_sysreg_par();
953 
954 	if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)))
955 		__mmu_config_restore(&config);
956 
957 	return par;
958 }
959 
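/*
 * A stage-1 permission fault (i.e. one not reported against stage-2)
 * is the one failure the fast path can be trusted to report
 * accurately.
 */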
960 static bool par_check_s1_perm_fault(u64 par)
961 {
962 	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
963 
964 	return  ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
965 		 !(par & SYS_PAR_EL1_S));
966 }
967 
968 void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
969 {
970 	u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
971 
972 	/*
973  * If PAR_EL1 reports that AT failed on an S1 permission fault, we
974 	 * know for sure that the PTW was able to walk the S1 tables and
975 	 * there's nothing else to do.
976 	 *
977 	 * If AT failed for any other reason, then we must walk the guest S1
978 	 * to emulate the instruction.
979 	 */
980 	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
981 		par = handle_at_slow(vcpu, op, vaddr);
982 
983 	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
984 }
985 
986 void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
987 {
988 	u64 par;
989 
990 	/*
991 	 * We've trapped, so everything is live on the CPU. As we will be
992 	 * switching context behind everybody's back, disable interrupts...
993 	 */
994 	scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) {
995 		struct kvm_s2_mmu *mmu;
996 		u64 val, hcr;
997 		bool fail;
998 
999 		mmu = &vcpu->kvm->arch.mmu;
1000 
1001 		val = hcr = read_sysreg(hcr_el2);
1002 		val &= ~HCR_TGE;
1003 		val |= HCR_VM;
1004 
1005 		if (!vcpu_el2_e2h_is_set(vcpu))
1006 			val |= HCR_NV | HCR_NV1;
1007 
1008 		write_sysreg(val, hcr_el2);
1009 		isb();
1010 
1011 		par = SYS_PAR_EL1_F;
1012 
1013 		switch (op) {
1014 		case OP_AT_S1E2R:
1015 			fail = __kvm_at(OP_AT_S1E1R, vaddr);
1016 			break;
1017 		case OP_AT_S1E2W:
1018 			fail = __kvm_at(OP_AT_S1E1W, vaddr);
1019 			break;
1020 		case OP_AT_S1E2A:
1021 			fail = __kvm_at(OP_AT_S1E1A, vaddr);
1022 			break;
1023 		default:
1024 			WARN_ON_ONCE(1);
1025 			fail = true;
1026 		}
1027 
1028 		isb();
1029 
1030 		if (!fail)
1031 			par = read_sysreg_par();
1032 
1033 		write_sysreg(hcr, hcr_el2);
1034 		isb();
1035 	}
1036 
1037 	/* We failed the translation, let's replay it in slow motion */
1038 	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
1039 		par = handle_at_slow(vcpu, op, vaddr);
1040 
1041 	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
1042 }
1043 
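/*
 * AT S12E* emulation: run the stage-1 part through the S1E* handler,
 * then (when a stage-2 is in effect) walk the guest's stage-2 in
 * software, check the access permission, and merge the result into
 * PAR_EL1.
 */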
1044 void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
1045 {
1046 	struct kvm_s2_trans out = {};
1047 	u64 ipa, par;
1048 	bool write;
1049 	int ret;
1050 
1051 	/* Do the stage-1 translation */
1052 	switch (op) {
1053 	case OP_AT_S12E1R:
1054 		op = OP_AT_S1E1R;
1055 		write = false;
1056 		break;
1057 	case OP_AT_S12E1W:
1058 		op = OP_AT_S1E1W;
1059 		write = true;
1060 		break;
1061 	case OP_AT_S12E0R:
1062 		op = OP_AT_S1E0R;
1063 		write = false;
1064 		break;
1065 	case OP_AT_S12E0W:
1066 		op = OP_AT_S1E0W;
1067 		write = true;
1068 		break;
1069 	default:
1070 		WARN_ON_ONCE(1);
1071 		return;
1072 	}
1073 
1074 	__kvm_at_s1e01(vcpu, op, vaddr);
1075 	par = vcpu_read_sys_reg(vcpu, PAR_EL1);
1076 	if (par & SYS_PAR_EL1_F)
1077 		return;
1078 
1079 	/*
1080 	 * If we only have a single stage of translation (E2H=0 or
1081 	 * TGE=1), exit early. Same thing if {VM,DC}=={0,0}.
1082 	 */
1083 	if (!vcpu_el2_e2h_is_set(vcpu) || vcpu_el2_tge_is_set(vcpu) ||
1084 	    !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
1085 		return;
1086 
1087 	/* Do the stage-2 translation */
1088 	ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
1089 	out.esr = 0;
1090 	ret = kvm_walk_nested_s2(vcpu, ipa, &out);
1091 	if (ret < 0)
1092 		return;
1093 
1094 	/* Check the access permission */
1095 	if (!out.esr &&
1096 	    ((!write && !out.readable) || (write && !out.writable)))
1097 		out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);
1098 
1099 	par = compute_par_s12(vcpu, par, &out);
1100 	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
1101 }
1102