// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 - Linaro Ltd
 * Author: Jintack Lim <jintack.lim@linaro.org>
 */

#include <linux/kvm_host.h>

#include <asm/esr.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>

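/*
 * Translation regime an AT instruction can target: the EL1&0 regime,
 * the EL2&0 regime (E2H=1), or the plain EL2 regime (E2H=0).
 */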
enum trans_regime {
	TR_EL10,
	TR_EL20,
	TR_EL2,
};

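/*
 * Everything setup_s1_walk() derives to describe the stage-1 walk: table
 * base address, regime, output/input size limits, granule shift, start
 * level, and whether the walk uses hierarchical permissions, big-endian
 * descriptors, or is itself subject to stage-2 translation.
 */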
struct s1_walk_info {
	u64			baddr;
	enum trans_regime	regime;
	unsigned int		max_oa_bits;
	unsigned int		pgshift;
	unsigned int		txsz;
	int			sl;
	bool			hpd;
	bool			be;
	bool			s2;
};

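/*
 * Outcome of the walk: either a successful translation (final descriptor,
 * PA, level, accumulated hierarchical permissions) or a fault (fault
 * status code, whether it hit on a stage-2 walk of a stage-1 table access,
 * and whether it is a stage-2 fault), plus a global failed flag.
 */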
struct s1_walk_result {
	union {
		struct {
			u64	desc;
			u64	pa;
			s8	level;
			u8	APTable;
			bool	UXNTable;
			bool	PXNTable;
		};
		struct {
			u8	fst;
			bool	ptw;
			bool	s2;
		};
	};
	bool	failed;
};

static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
{
	wr->fst = fst;
	wr->ptw = ptw;
	wr->s2 = s2;
	wr->failed = true;
}

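/*
 * Sentinel stored in wr->level when the stage-1 MMU is off (or forced off
 * by HCR_EL2.{DC,TGE}): the VA maps flat to the PA and no walk is needed.
 */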
#define S1_MMU_DISABLED		(-127)

static int get_ia_size(struct s1_walk_info *wi)
{
	return 64 - wi->txsz;
}

/* Return true if the IPA is out of the OA range */
static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
{
	return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
}

/* Return the translation regime that applies to an AT instruction */
static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
{
	/*
	 * We only get here from guest EL2, so the translation
	 * regime AT applies to is solely defined by {E2H,TGE}.
	 */
	switch (op) {
	case OP_AT_S1E2R:
	case OP_AT_S1E2W:
	case OP_AT_S1E2A:
		return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
	default:
		return (vcpu_el2_e2h_is_set(vcpu) &&
			vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
	}
}

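/*
 * Decode the registers controlling the walk (SCTLR, TCR, TTBRx for the
 * relevant regime) and run the architectural sanity checks: TBI handling,
 * TxSZ bounds, VA range, EPDx/E0PD, start level, and table base address
 * size/alignment. On success, @wi describes the walk to perform; on
 * failure, @wr carries the fault and -EFAULT is returned.
 */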
static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
			 struct s1_walk_result *wr, u64 va)
{
	u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr;
	unsigned int stride, x;
	bool va55, tbi, lva, as_el0;

	hcr = __vcpu_sys_reg(vcpu, HCR_EL2);

	wi->regime = compute_translation_regime(vcpu, op);
	as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);

	va55 = va & BIT(55);

	if (wi->regime == TR_EL2 && va55)
		goto addrsz;

	wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));

	switch (wi->regime) {
	case TR_EL10:
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
		tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
		ttbr = (va55 ?
			vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
			vcpu_read_sys_reg(vcpu, TTBR0_EL1));
		break;
	case TR_EL2:
	case TR_EL20:
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
		tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
		ttbr = (va55 ?
			vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
			vcpu_read_sys_reg(vcpu, TTBR0_EL2));
		break;
	default:
		BUG();
	}

	tbi = (wi->regime == TR_EL2 ?
	       FIELD_GET(TCR_EL2_TBI, tcr) :
	       (va55 ?
		FIELD_GET(TCR_TBI1, tcr) :
		FIELD_GET(TCR_TBI0, tcr)));

	if (!tbi && (u64)sign_extend64(va, 55) != va)
		goto addrsz;

	va = (u64)sign_extend64(va, 55);

	/* Let's put the MMU disabled case aside immediately */
	switch (wi->regime) {
	case TR_EL10:
		/*
		 * If dealing with the EL1&0 translation regime, 3 things
		 * can disable the S1 translation:
		 *
		 * - HCR_EL2.DC = 1
		 * - HCR_EL2.{E2H,TGE} = {0,1}
		 * - SCTLR_EL1.M = 0
		 *
		 * The TGE part is interesting. If we have decided that this
		 * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
		 * {0,x}, and we only need to test for TGE == 1.
		 */
		if (hcr & (HCR_DC | HCR_TGE)) {
			wr->level = S1_MMU_DISABLED;
			break;
		}
		fallthrough;
	case TR_EL2:
	case TR_EL20:
		if (!(sctlr & SCTLR_ELx_M))
			wr->level = S1_MMU_DISABLED;
		break;
	}

	if (wr->level == S1_MMU_DISABLED) {
		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
			goto addrsz;

		wr->pa = va;
		return 0;
	}

	wi->be = sctlr & SCTLR_ELx_EE;

	wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
	wi->hpd &= (wi->regime == TR_EL2 ?
		    FIELD_GET(TCR_EL2_HPD, tcr) :
		    (va55 ?
		     FIELD_GET(TCR_HPD1, tcr) :
		     FIELD_GET(TCR_HPD0, tcr)));

	/* Someone was silly enough to encode TG0/TG1 differently */
	if (va55) {
		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
		tg = FIELD_GET(TCR_TG1_MASK, tcr);

		switch (tg << TCR_TG1_SHIFT) {
		case TCR_TG1_4K:
			wi->pgshift = 12; break;
		case TCR_TG1_16K:
			wi->pgshift = 14; break;
		case TCR_TG1_64K:
		default:	/* IMPDEF: treat any other value as 64k */
			wi->pgshift = 16; break;
		}
	} else {
		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
		tg = FIELD_GET(TCR_TG0_MASK, tcr);

		switch (tg << TCR_TG0_SHIFT) {
		case TCR_TG0_4K:
			wi->pgshift = 12; break;
		case TCR_TG0_16K:
			wi->pgshift = 14; break;
		case TCR_TG0_64K:
		default:	/* IMPDEF: treat any other value as 64k */
			wi->pgshift = 16; break;
		}
	}

	/* R_PLCGL, R_YXNYW */
	if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
		if (wi->txsz > 39)
			goto transfault_l0;
	} else {
		if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47))
			goto transfault_l0;
	}

	/* R_GTJBY, R_SXWGM */
	switch (BIT(wi->pgshift)) {
	case SZ_4K:
		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT);
		lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
		break;
	case SZ_16K:
		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT);
		lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
		break;
	case SZ_64K:
		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
		break;
	}

	if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
		goto transfault_l0;

	ia_bits = get_ia_size(wi);

	/* R_YYVYV, I_THCZK */
	if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
	    (va55 && va < GENMASK(63, ia_bits)))
		goto transfault_l0;

	/* I_ZFSYQ */
	if (wi->regime != TR_EL2 &&
	    (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
		goto transfault_l0;

	/* R_BNDVG and following statements */
	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
	    as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0)))
		goto transfault_l0;

	/* AArch64.S1StartLevel() */
	stride = wi->pgshift - 3;
	wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
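	/*
	 * e.g. with a 4K granule (stride == 9): ia_bits == 48 gives sl == 0,
	 * ia_bits == 39 gives sl == 1.
	 */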

	ps = (wi->regime == TR_EL2 ?
	      FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));

	wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps));

	/* Compute minimal alignment */
	x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
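	/*
	 * x is the alignment of the starting level table, i.e. the number of
	 * low bits that must be zero in TTBRx.BADDR: that table holds
	 * 2^(ia_bits - (3 - sl) * stride - pgshift) 8-byte entries. e.g. 4K
	 * granule, ia_bits == 48, sl == 0: x == 12, a 4K-aligned L0 table.
	 */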

	wi->baddr = ttbr & TTBRx_EL1_BADDR;

	/* R_VPBBF */
	if (check_output_size(wi->baddr, wi))
		goto addrsz;

	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);

	return 0;

addrsz:	/* Address Size Fault level 0 */
	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false);
	return -EFAULT;

transfault_l0:	/* Translation Fault level 0 */
	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false);
	return -EFAULT;
}

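/*
 * Perform the stage-1 walk described by @wi: descend from the start level
 * to the final block/page, translating each table address through stage-2
 * when required, reading descriptors from guest memory, and accumulating
 * the hierarchical (table) permissions unless HPD is in effect.
 */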
static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
		   struct s1_walk_result *wr, u64 va)
{
	u64 va_top, va_bottom, baddr, desc;
	int level, stride, ret;

	level = wi->sl;
	stride = wi->pgshift - 3;
	baddr = wi->baddr;

	va_top = get_ia_size(wi) - 1;

	while (1) {
		u64 index, ipa;

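		/*
		 * Bits [va_top:va_bottom] of the VA index into the current
		 * table; scale by the 8-byte descriptor size (hence the
		 * shift by va_bottom - 3) to get the offset from baddr.
		 */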
		va_bottom = (3 - level) * stride + wi->pgshift;
		index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);

		ipa = baddr | index;

		if (wi->s2) {
			struct kvm_s2_trans s2_trans = {};

			ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
			if (ret) {
				fail_s1_walk(wr,
					     (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
					     true, true);
				return ret;
			}

			if (!kvm_s2_trans_readable(&s2_trans)) {
				fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
					     true, true);

				return -EPERM;
			}

			ipa = kvm_s2_trans_output(&s2_trans);
		}

		ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
		if (ret) {
			fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level),
				     true, false);
			return ret;
		}

		if (wi->be)
			desc = be64_to_cpu((__force __be64)desc);
		else
			desc = le64_to_cpu((__force __le64)desc);

		/* Invalid descriptor */
		if (!(desc & BIT(0)))
			goto transfault;

		/* Block mapping, check validity down the line */
		if (!(desc & BIT(1)))
			break;

		/* Page mapping */
		if (level == 3)
			break;

		/* Table handling */
		if (!wi->hpd) {
			wr->APTable |= FIELD_GET(S1_TABLE_AP, desc);
			wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
			wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
		}

		baddr = desc & GENMASK_ULL(47, wi->pgshift);

		/* Check for out-of-range OA */
		if (check_output_size(baddr, wi))
			goto addrsz;

		/* Prepare for next round */
		va_top = va_bottom - 1;
		level++;
	}

	/* Block mapping, check the validity of the level */
	if (!(desc & BIT(1))) {
		bool valid_block = false;

		switch (BIT(wi->pgshift)) {
		case SZ_4K:
			valid_block = level == 1 || level == 2;
			break;
		case SZ_16K:
		case SZ_64K:
			valid_block = level == 2;
			break;
		}

		if (!valid_block)
			goto transfault;
	}

	if (check_output_size(desc & GENMASK(47, va_bottom), wi))
		goto addrsz;

	va_bottom += contiguous_bit_shift(desc, wi, level);

	wr->failed = false;
	wr->level = level;
	wr->desc = desc;
	wr->pa = desc & GENMASK(47, va_bottom);
	wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);

	return 0;

addrsz:
	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false);
	return -EINVAL;
transfault:
	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false);
	return -ENOENT;
}

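/*
 * Snapshot of the translation registers that the fast AT path clobbers
 * when it loads the guest's EL1&0 and stage-2 context onto the hardware;
 * restored once the AT instruction has run.
 */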
struct mmu_config {
	u64	ttbr0;
	u64	ttbr1;
	u64	tcr;
	u64	mair;
	u64	sctlr;
	u64	vttbr;
	u64	vtcr;
	u64	hcr;
};

static void __mmu_config_save(struct mmu_config *config)
{
	config->ttbr0 = read_sysreg_el1(SYS_TTBR0);
	config->ttbr1 = read_sysreg_el1(SYS_TTBR1);
	config->tcr = read_sysreg_el1(SYS_TCR);
	config->mair = read_sysreg_el1(SYS_MAIR);
	config->sctlr = read_sysreg_el1(SYS_SCTLR);
	config->vttbr = read_sysreg(vttbr_el2);
	config->vtcr = read_sysreg(vtcr_el2);
	config->hcr = read_sysreg(hcr_el2);
}

static void __mmu_config_restore(struct mmu_config *config)
{
	write_sysreg(config->hcr, hcr_el2);

	/*
	 * ARM errata 1165522 and 1530923 require TGE to be 1 before
	 * we update the guest state.
	 */
	asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));

	write_sysreg_el1(config->ttbr0, SYS_TTBR0);
	write_sysreg_el1(config->ttbr1, SYS_TTBR1);
	write_sysreg_el1(config->tcr, SYS_TCR);
	write_sysreg_el1(config->mair, SYS_MAIR);
	write_sysreg_el1(config->sctlr, SYS_SCTLR);
	write_sysreg(config->vttbr, vttbr_el2);
	write_sysreg(config->vtcr, vtcr_el2);
}

static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	u64 host_pan;
	bool fail;

	host_pan = read_sysreg_s(SYS_PSTATE_PAN);
	write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN);

	switch (op) {
	case OP_AT_S1E1RP:
		fail = __kvm_at(OP_AT_S1E1RP, vaddr);
		break;
	case OP_AT_S1E1WP:
		fail = __kvm_at(OP_AT_S1E1WP, vaddr);
		break;
	}

	write_sysreg_s(host_pan, SYS_PSTATE_PAN);

	return fail;
}

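/*
 * MAIR-style attribute encoding used below: the inner attribute lives in
 * bits [3:0] and the outer attribute in bits [7:4], so MEMATTR(ic, oc)
 * builds a full 8-bit Normal memory attribute, while an all-zero outer
 * nibble denotes Device memory.
 */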
#define MEMATTR(ic, oc)		(MEMATTR_##oc << 4 | MEMATTR_##ic)
#define MEMATTR_NC		0b0100
#define MEMATTR_Wt		0b1000
#define MEMATTR_Wb		0b1100
#define MEMATTR_WbRaWa		0b1111

#define MEMATTR_IS_DEVICE(m)	(((m) & GENMASK(7, 4)) == 0)

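/*
 * Convert a stage-2 MemAttr[3:0] field (without FWB) into the MAIR-style
 * 8-bit attribute used above, mapping reserved encodings to NC.
 */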
static u8 s2_memattr_to_attr(u8 memattr)
{
	memattr &= 0b1111;

	switch (memattr) {
	case 0b0000:
	case 0b0001:
	case 0b0010:
	case 0b0011:
		return memattr << 2;
	case 0b0100:
		return MEMATTR(Wb, Wb);
	case 0b0101:
		return MEMATTR(NC, NC);
	case 0b0110:
		return MEMATTR(Wt, NC);
	case 0b0111:
		return MEMATTR(Wb, NC);
	case 0b1000:
		/* Reserved, assume NC */
		return MEMATTR(NC, NC);
	case 0b1001:
		return MEMATTR(NC, Wt);
	case 0b1010:
		return MEMATTR(Wt, Wt);
	case 0b1011:
		return MEMATTR(Wb, Wt);
	case 0b1100:
		/* Reserved, assume NC */
		return MEMATTR(NC, NC);
	case 0b1101:
		return MEMATTR(NC, Wb);
	case 0b1110:
		return MEMATTR(Wt, Wb);
	case 0b1111:
		return MEMATTR(Wb, Wb);
	default:
		unreachable();
	}
}

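/*
 * S2CombineS1AttrHints(): combine one nibble (inner or outer) of the S1
 * and S2 attributes. The weaker cacheability wins; the allocation hints
 * and transience come from S1.
 */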
static u8 combine_s1_s2_attr(u8 s1, u8 s2)
{
	bool transient;
	u8 final = 0;

	/* Upgrade transient s1 to non-transient to simplify things */
	switch (s1) {
	case 0b0001 ... 0b0011:	/* Normal, Write-Through Transient */
		transient = true;
		s1 = MEMATTR_Wt | (s1 & GENMASK(1, 0));
		break;
	case 0b0101 ... 0b0111:	/* Normal, Write-Back Transient */
		transient = true;
		s1 = MEMATTR_Wb | (s1 & GENMASK(1, 0));
		break;
	default:
		transient = false;
	}

	/* S2CombineS1AttrHints() */
	if ((s1 & GENMASK(3, 2)) == MEMATTR_NC ||
	    (s2 & GENMASK(3, 2)) == MEMATTR_NC)
		final = MEMATTR_NC;
	else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt ||
		 (s2 & GENMASK(3, 2)) == MEMATTR_Wt)
		final = MEMATTR_Wt;
	else
		final = MEMATTR_Wb;

	if (final != MEMATTR_NC) {
		/* Inherit RaWa hints from S1 */
		if (transient) {
			switch (s1 & GENMASK(3, 2)) {
			case MEMATTR_Wt:
				final = 0;
				break;
			case MEMATTR_Wb:
				final = MEMATTR_NC;
				break;
			}
		}

		final |= s1 & GENMASK(1, 0);
	}

	return final;
}


#define ATTR_NSH	0b00
#define ATTR_RSV	0b01
#define ATTR_OSH	0b10
#define ATTR_ISH	0b11

static u8 compute_sh(u8 attr, u64 desc)
{
	u8 sh;

	/* Any form of device, as well as NC has SH[1:0]=0b10 */
	if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC))
		return ATTR_OSH;

	sh = FIELD_GET(PTE_SHARED, desc);
	if (sh == ATTR_RSV)	/* Reserved, mapped to NSH */
		sh = ATTR_NSH;

	return sh;
}

static u8 combine_sh(u8 s1_sh, u8 s2_sh)
{
	if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
		return ATTR_OSH;
	if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH)
		return ATTR_ISH;

	return ATTR_NSH;
}

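/*
 * Fold a stage-2 translation result into an already computed stage-1
 * PAR_EL1 value: report S2 faults, combine the memory attributes (FWB
 * and non-FWB flavours) and shareability, and substitute the final PA.
 */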
static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
			   struct kvm_s2_trans *tr)
{
	u8 s1_parattr, s2_memattr, final_attr;
	u64 par;

	/* If S2 has failed to translate, report the damage */
	if (tr->esr) {
		par = SYS_PAR_EL1_RES1;
		par |= SYS_PAR_EL1_F;
		par |= SYS_PAR_EL1_S;
		par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr);
		return par;
	}

	s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par);
	s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc);

	if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) {
		if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP))
			s2_memattr &= ~BIT(3);

		/* Combination of R_VRJSW and R_RHWZM */
		switch (s2_memattr) {
		case 0b0101:
			if (MEMATTR_IS_DEVICE(s1_parattr))
				final_attr = s1_parattr;
			else
				final_attr = MEMATTR(NC, NC);
			break;
		case 0b0110:
		case 0b1110:
			final_attr = MEMATTR(WbRaWa, WbRaWa);
			break;
		case 0b0111:
		case 0b1111:
			/* Preserve S1 attribute */
			final_attr = s1_parattr;
			break;
		case 0b0100:
		case 0b1100:
		case 0b1101:
			/* Reserved, do something non-silly */
			final_attr = s1_parattr;
			break;
		default:
			/* MemAttr[2]=0, Device from S2 */
			final_attr = (s2_memattr & GENMASK(1, 0)) << 2;
		}
	} else {
		/* Combination of R_HMNDG, R_TNHFM and R_GQFSF */
		u8 s2_parattr = s2_memattr_to_attr(s2_memattr);

		if (MEMATTR_IS_DEVICE(s1_parattr) ||
		    MEMATTR_IS_DEVICE(s2_parattr)) {
			final_attr = min(s1_parattr, s2_parattr);
		} else {
			/* At this stage, this is memory vs memory */
			final_attr = combine_s1_s2_attr(s1_parattr & 0xf,
							s2_parattr & 0xf);
			final_attr |= combine_s1_s2_attr(s1_parattr >> 4,
							 s2_parattr >> 4) << 4;
		}
	}

	if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) &&
	    !MEMATTR_IS_DEVICE(final_attr))
		final_attr = MEMATTR(NC, NC);

	par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
	par |= tr->output & GENMASK(47, 12);
	par |= FIELD_PREP(SYS_PAR_EL1_SH,
			  combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
				     compute_sh(final_attr, tr->desc)));

	return par;
}

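/*
 * Turn a stage-1 walk result into a PAR_EL1 value: a fault encoding if the
 * walk failed, a flat mapping with fixed attributes if the S1 MMU is off
 * (Normal WB if HCR_EL2.DC is set, Device-nGnRnE otherwise), or the PA
 * plus attributes derived from MAIR/SCTLR and the final descriptor.
 */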
static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr,
			  enum trans_regime regime)
{
	u64 par;

	if (wr->failed) {
		par = SYS_PAR_EL1_RES1;
		par |= SYS_PAR_EL1_F;
		par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
		par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
		par |= wr->s2 ? SYS_PAR_EL1_S : 0;
	} else if (wr->level == S1_MMU_DISABLED) {
		/* MMU off or HCR_EL2.DC == 1 */
		par = SYS_PAR_EL1_NSE;
		par |= wr->pa & GENMASK_ULL(47, 12);

		if (regime == TR_EL10 &&
		    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
			par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
					  MEMATTR(WbRaWa, WbRaWa));
			par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH);
		} else {
			par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
			par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH);
		}
	} else {
		u64 mair, sctlr;
		u8 sh;

		par = SYS_PAR_EL1_NSE;

		mair = (regime == TR_EL10 ?
			vcpu_read_sys_reg(vcpu, MAIR_EL1) :
			vcpu_read_sys_reg(vcpu, MAIR_EL2));

		mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
		mair &= 0xff;

		sctlr = (regime == TR_EL10 ?
			 vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
			 vcpu_read_sys_reg(vcpu, SCTLR_EL2));

		/* Force NC for memory if SCTLR_ELx.C is clear */
		if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
			mair = MEMATTR(NC, NC);

		par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
		par |= wr->pa & GENMASK_ULL(47, 12);

		sh = compute_sh(mair, wr->desc);
		par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
	}

	return par;
}

static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
{
	u64 sctlr;

	if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
		return false;

	if (regime == TR_EL10)
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
	else
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);

	return sctlr & SCTLR_EL1_EPAN;
}

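/*
 * Software emulation of the AT instruction: set up and perform the guest's
 * stage-1 walk by hand, apply the direct permission model (including PAN
 * for the *P variants), and return the resulting PAR_EL1 value.
 */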
static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	bool perm_fail, ur, uw, ux, pr, pw, px;
	struct s1_walk_result wr = {};
	struct s1_walk_info wi = {};
	int ret, idx;

	ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr);
	if (ret)
		goto compute_par;

	if (wr.level == S1_MMU_DISABLED)
		goto compute_par;

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	ret = walk_s1(vcpu, &wi, &wr, vaddr);

	srcu_read_unlock(&vcpu->kvm->srcu, idx);

	if (ret)
		goto compute_par;

	/* FIXME: revisit when adding indirect permission support */
	/* AArch64.S1DirectBasePermissions() */
	if (wi.regime != TR_EL2) {
		switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr.desc)) {
		case 0b00:
			pr = pw = true;
			ur = uw = false;
			break;
		case 0b01:
			pr = pw = ur = uw = true;
			break;
		case 0b10:
			pr = true;
			pw = ur = uw = false;
			break;
		case 0b11:
			pr = ur = true;
			pw = uw = false;
			break;
		}

		switch (wr.APTable) {
		case 0b00:
			break;
		case 0b01:
			ur = uw = false;
			break;
		case 0b10:
			pw = uw = false;
			break;
		case 0b11:
			pw = ur = uw = false;
			break;
		}

		/* We don't use px for anything yet, but hey... */
		px = !((wr.desc & PTE_PXN) || wr.PXNTable || uw);
		ux = !((wr.desc & PTE_UXN) || wr.UXNTable);

		if (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) {
			bool pan;

			pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT;
			pan &= ur || uw || (pan3_enabled(vcpu, wi.regime) && ux);
			pw &= !pan;
			pr &= !pan;
		}
	} else {
		ur = uw = ux = false;

		if (!(wr.desc & PTE_RDONLY)) {
			pr = pw = true;
		} else {
			pr = true;
			pw = false;
		}

		if (wr.APTable & BIT(1))
			pw = false;

		/* XN maps to UXN */
		px = !((wr.desc & PTE_UXN) || wr.UXNTable);
	}

	perm_fail = false;

	switch (op) {
	case OP_AT_S1E1RP:
	case OP_AT_S1E1R:
	case OP_AT_S1E2R:
		perm_fail = !pr;
		break;
	case OP_AT_S1E1WP:
	case OP_AT_S1E1W:
	case OP_AT_S1E2W:
		perm_fail = !pw;
		break;
	case OP_AT_S1E0R:
		perm_fail = !ur;
		break;
	case OP_AT_S1E0W:
		perm_fail = !uw;
		break;
	case OP_AT_S1E1A:
	case OP_AT_S1E2A:
		break;
	default:
		BUG();
	}

	if (perm_fail)
		fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false);

compute_par:
	return compute_par_s1(vcpu, &wr, wi.regime);
}

/*
 * Return the PAR_EL1 value as the result of a valid translation.
 *
 * If the translation is unsuccessful, the value may only contain
 * PAR_EL1.F, and cannot be taken at face value. It isn't an
 * indication of the translation having failed, only that the fast
 * path did not succeed, *unless* it indicates a S1 permission fault.
 */
static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	struct mmu_config config;
	struct kvm_s2_mmu *mmu;
	bool fail;
	u64 par;

	par = SYS_PAR_EL1_F;

	/*
	 * We've trapped, so everything is live on the CPU. As we will
	 * be switching contexts behind everybody's back, disable
	 * interrupts while holding the mmu lock.
	 */
	guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock);

	/*
	 * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already
	 * the right one (as we trapped from vEL2). If not, save the
	 * full MMU context.
	 */
	if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))
		goto skip_mmu_switch;

	/*
	 * Obtaining the S2 MMU for a L2 is horribly racy, and we may not
	 * find it (recycled by another vcpu, for example). When this
	 * happens, admit defeat immediately and use the SW (slow) path.
	 */
	mmu = lookup_s2_mmu(vcpu);
	if (!mmu)
		return par;

	__mmu_config_save(&config);

	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0);
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1);
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR);
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR);
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR);
	__load_stage2(mmu, mmu->arch);

skip_mmu_switch:
	/* Clear TGE, enable S2 translation, we're rolling */
	write_sysreg((read_sysreg(hcr_el2) & ~HCR_TGE) | HCR_VM, hcr_el2);
	isb();

	switch (op) {
	case OP_AT_S1E1RP:
	case OP_AT_S1E1WP:
		fail = at_s1e1p_fast(vcpu, op, vaddr);
		break;
	case OP_AT_S1E1R:
		fail = __kvm_at(OP_AT_S1E1R, vaddr);
		break;
	case OP_AT_S1E1W:
		fail = __kvm_at(OP_AT_S1E1W, vaddr);
		break;
	case OP_AT_S1E0R:
		fail = __kvm_at(OP_AT_S1E0R, vaddr);
		break;
	case OP_AT_S1E0W:
		fail = __kvm_at(OP_AT_S1E0W, vaddr);
		break;
	case OP_AT_S1E1A:
		fail = __kvm_at(OP_AT_S1E1A, vaddr);
		break;
	default:
		WARN_ON_ONCE(1);
		fail = true;
		break;
	}

	if (!fail)
		par = read_sysreg_par();

	if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)))
		__mmu_config_restore(&config);

	return par;
}

static bool par_check_s1_perm_fault(u64 par)
{
	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);

	return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
		!(par & SYS_PAR_EL1_S));
}

void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);

	/*
	 * If PAR_EL1 reports that AT failed on a S1 permission fault, we
	 * know for sure that the PTW was able to walk the S1 tables and
	 * there's nothing else to do.
	 *
	 * If AT failed for any other reason, then we must walk the guest S1
	 * to emulate the instruction.
	 */
	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
		par = handle_at_slow(vcpu, op, vaddr);

	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
}

void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	u64 par;

	/*
	 * We've trapped, so everything is live on the CPU. As we will be
	 * switching context behind everybody's back, disable interrupts...
	 */
	scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) {
		struct kvm_s2_mmu *mmu;
		u64 val, hcr;
		bool fail;

		mmu = &vcpu->kvm->arch.mmu;

		val = hcr = read_sysreg(hcr_el2);
		val &= ~HCR_TGE;
		val |= HCR_VM;

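		/*
		 * When vEL2 is not in VHE mode, its translation regime is the
		 * EL2 one: with HCR_EL2.{NV,NV1} == {1,1} the EL1&0 regime
		 * reinterprets the page-table descriptors much as the EL2
		 * regime would, so the S1E1 AT below can walk the guest's
		 * EL2 tables.
		 */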
		if (!vcpu_el2_e2h_is_set(vcpu))
			val |= HCR_NV | HCR_NV1;

		write_sysreg(val, hcr_el2);
		isb();

		par = SYS_PAR_EL1_F;

		switch (op) {
		case OP_AT_S1E2R:
			fail = __kvm_at(OP_AT_S1E1R, vaddr);
			break;
		case OP_AT_S1E2W:
			fail = __kvm_at(OP_AT_S1E1W, vaddr);
			break;
		case OP_AT_S1E2A:
			fail = __kvm_at(OP_AT_S1E1A, vaddr);
			break;
		default:
			WARN_ON_ONCE(1);
			fail = true;
		}

		isb();

		if (!fail)
			par = read_sysreg_par();

		write_sysreg(hcr, hcr_el2);
		isb();
	}

	/* We failed the translation, let's replay it in slow motion */
	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
		par = handle_at_slow(vcpu, op, vaddr);

	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
}

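/*
 * Emulate the AT S12E* instructions: run the stage-1 AT (fast or slow),
 * then, if stage-2 is enabled for the guest, walk the guest's stage-2 for
 * the resulting IPA, check the access permission, and merge both results
 * into PAR_EL1.
 */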
void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	struct kvm_s2_trans out = {};
	u64 ipa, par;
	bool write;
	int ret;

	/* Do the stage-1 translation */
	switch (op) {
	case OP_AT_S12E1R:
		op = OP_AT_S1E1R;
		write = false;
		break;
	case OP_AT_S12E1W:
		op = OP_AT_S1E1W;
		write = true;
		break;
	case OP_AT_S12E0R:
		op = OP_AT_S1E0R;
		write = false;
		break;
	case OP_AT_S12E0W:
		op = OP_AT_S1E0W;
		write = true;
		break;
	default:
		WARN_ON_ONCE(1);
		return;
	}

	__kvm_at_s1e01(vcpu, op, vaddr);
	par = vcpu_read_sys_reg(vcpu, PAR_EL1);
	if (par & SYS_PAR_EL1_F)
		return;

	/*
	 * If we only have a single stage of translation (E2H=0 or
	 * TGE=1), exit early. Same thing if {VM,DC}=={0,0}.
	 */
	if (!vcpu_el2_e2h_is_set(vcpu) || vcpu_el2_tge_is_set(vcpu) ||
	    !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
		return;

	/* Do the stage-2 translation */
	ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
	out.esr = 0;
	ret = kvm_walk_nested_s2(vcpu, ipa, &out);
	if (ret < 0)
		return;

	/* Check the access permission */
	if (!out.esr &&
	    ((!write && !out.readable) || (write && !out.writable)))
		out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);

	par = compute_par_s12(vcpu, par, &out);
	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
}