// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 - Linaro Ltd
 * Author: Jintack Lim <jintack.lim@linaro.org>
 */

#include <linux/kvm_host.h>

#include <asm/esr.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>

static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw)
{
	wr->fst = fst;
	wr->ptw = s1ptw;
	wr->s2 = s1ptw;
	wr->failed = true;
}

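/*
 * Sentinel stored in wr->level (well outside the architectural 0..3
 * range) to flag a walk that completed with stage-1 translation
 * disabled, so later stages skip the table-derived attributes.
 */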
#define S1_MMU_DISABLED (-127)

static int get_ia_size(struct s1_walk_info *wi)
{
	return 64 - wi->txsz;
}

/* Return true if the IPA is out of the OA range */
static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
{
	if (wi->pa52bit)
		return wi->max_oa_bits < 52 && (ipa & GENMASK_ULL(51, wi->max_oa_bits));
	return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
}

static bool has_52bit_pa(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, u64 tcr)
{
	switch (BIT(wi->pgshift)) {
	case SZ_64K:
	default: /* IMPDEF: treat any other value as 64k */
		if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52))
			return false;
		return ((wi->regime == TR_EL2 ?
			 FIELD_GET(TCR_EL2_PS_MASK, tcr) :
			 FIELD_GET(TCR_IPS_MASK, tcr)) == 0b0110);
	case SZ_16K:
		if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT))
			return false;
		break;
	case SZ_4K:
		if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT))
			return false;
		break;
	}

	return (tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS));
}

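/*
 * Extract the output address from a descriptor. With 52bit PAs, the
 * top bits live in different descriptor fields depending on the
 * granule: bits [51:50] for 4k/16k (LPA2), bits [51:48] for 64k.
 */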
static u64 desc_to_oa(struct s1_walk_info *wi, u64 desc)
{
	u64 addr;

	if (!wi->pa52bit)
		return desc & GENMASK_ULL(47, wi->pgshift);

	switch (BIT(wi->pgshift)) {
	case SZ_4K:
	case SZ_16K:
		addr = desc & GENMASK_ULL(49, wi->pgshift);
		addr |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, desc) << 50;
		break;
	case SZ_64K:
	default: /* IMPDEF: treat any other value as 64k */
		addr = desc & GENMASK_ULL(47, wi->pgshift);
		addr |= FIELD_GET(KVM_PTE_ADDR_51_48, desc) << 48;
		break;
	}

	return addr;
}

/* Return the translation regime that applies to an AT instruction */
static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
{
	/*
	 * We only get here from guest EL2, so the translation
	 * regime AT applies to is solely defined by {E2H,TGE}.
	 */
	switch (op) {
	case OP_AT_S1E2R:
	case OP_AT_S1E2W:
	case OP_AT_S1E2A:
		return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
		break;
	default:
		return (vcpu_el2_e2h_is_set(vcpu) &&
			vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
	}
}

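/*
 * TCR2_EL1 only takes effect for the EL1&0 regime if the guest
 * hypervisor has enabled it via HCRX_EL2.TCR2En; otherwise it reads
 * as 0 for the purpose of the walk.
 */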
static u64 effective_tcr2(struct kvm_vcpu *vcpu, enum trans_regime regime)
{
	if (regime == TR_EL10) {
		if (vcpu_has_nv(vcpu) &&
		    !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En))
			return 0;

		return vcpu_read_sys_reg(vcpu, TCR2_EL1);
	}

	return vcpu_read_sys_reg(vcpu, TCR2_EL2);
}

static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
{
	if (!kvm_has_s1pie(vcpu->kvm))
		return false;

	/* Abuse TCR2_EL1_PIE and use it for EL2 as well */
	return effective_tcr2(vcpu, regime) & TCR2_EL1_PIE;
}

static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi)
{
	u64 val;

	if (!kvm_has_s1poe(vcpu->kvm)) {
		wi->poe = wi->e0poe = false;
		return;
	}

	val = effective_tcr2(vcpu, wi->regime);

	/* Abuse TCR2_EL1_* for EL2 */
	wi->poe = val & TCR2_EL1_POE;
	wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE);
}

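/*
 * Decode the register state (SCTLR/TCR/TTBR) that applies to the walk,
 * sanity-check the VA and TxSZ against the configured granule, and
 * compute the starting level and table base address. Any failure is
 * reported through fail_s1_walk() and reflected in the walk result.
 */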
static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
			 struct s1_walk_result *wr, u64 va)
{
	u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr;
	unsigned int stride, x;
	bool va55, tbi, lva;

	va55 = va & BIT(55);

	if (vcpu_has_nv(vcpu)) {
		hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
		wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));
	} else {
		WARN_ON_ONCE(wi->regime != TR_EL10);
		wi->s2 = false;
		hcr = 0;
	}

	switch (wi->regime) {
	case TR_EL10:
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
		tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
		ttbr = (va55 ?
			vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
			vcpu_read_sys_reg(vcpu, TTBR0_EL1));
		break;
	case TR_EL2:
	case TR_EL20:
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
		tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
		ttbr = (va55 ?
			vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
			vcpu_read_sys_reg(vcpu, TTBR0_EL2));
		break;
	default:
		BUG();
	}

	/* Someone was silly enough to encode TG0/TG1 differently */
	if (va55 && wi->regime != TR_EL2) {
		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
		tg = FIELD_GET(TCR_TG1_MASK, tcr);

		switch (tg << TCR_TG1_SHIFT) {
		case TCR_TG1_4K:
			wi->pgshift = 12; break;
		case TCR_TG1_16K:
			wi->pgshift = 14; break;
		case TCR_TG1_64K:
		default: /* IMPDEF: treat any other value as 64k */
			wi->pgshift = 16; break;
		}
	} else {
		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
		tg = FIELD_GET(TCR_TG0_MASK, tcr);

		switch (tg << TCR_TG0_SHIFT) {
		case TCR_TG0_4K:
			wi->pgshift = 12; break;
		case TCR_TG0_16K:
			wi->pgshift = 14; break;
		case TCR_TG0_64K:
		default: /* IMPDEF: treat any other value as 64k */
			wi->pgshift = 16; break;
		}
	}

	wi->pa52bit = has_52bit_pa(vcpu, wi, tcr);

	ia_bits = get_ia_size(wi);

	/* AArch64.S1StartLevel() */
	stride = wi->pgshift - 3;
	wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);

	if (wi->regime == TR_EL2 && va55)
		goto addrsz;

	tbi = (wi->regime == TR_EL2 ?
	       FIELD_GET(TCR_EL2_TBI, tcr) :
	       (va55 ?
		FIELD_GET(TCR_TBI1, tcr) :
		FIELD_GET(TCR_TBI0, tcr)));

	if (!tbi && (u64)sign_extend64(va, 55) != va)
		goto addrsz;

	wi->sh = (wi->regime == TR_EL2 ?
		  FIELD_GET(TCR_EL2_SH0_MASK, tcr) :
		  (va55 ?
		   FIELD_GET(TCR_SH1_MASK, tcr) :
		   FIELD_GET(TCR_SH0_MASK, tcr)));

	va = (u64)sign_extend64(va, 55);

	/* Let's put the MMU disabled case aside immediately */
	switch (wi->regime) {
	case TR_EL10:
		/*
		 * If dealing with the EL1&0 translation regime, 3 things
		 * can disable the S1 translation:
		 *
		 * - HCR_EL2.DC = 1
		 * - HCR_EL2.{E2H,TGE} = {0,1}
		 * - SCTLR_EL1.M = 0
		 *
		 * The TGE part is interesting. If we have decided that this
		 * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
		 * {0,x}, and we only need to test for TGE == 1.
		 */
		if (hcr & (HCR_DC | HCR_TGE)) {
			wr->level = S1_MMU_DISABLED;
			break;
		}
		fallthrough;
	case TR_EL2:
	case TR_EL20:
		if (!(sctlr & SCTLR_ELx_M))
			wr->level = S1_MMU_DISABLED;
		break;
	}

	if (wr->level == S1_MMU_DISABLED) {
		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
			goto addrsz;

		wr->pa = va;
		return 0;
	}

	wi->be = sctlr & SCTLR_ELx_EE;

	wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
	wi->hpd &= (wi->regime == TR_EL2 ?
		    FIELD_GET(TCR_EL2_HPD, tcr) :
		    (va55 ?
		     FIELD_GET(TCR_HPD1, tcr) :
		     FIELD_GET(TCR_HPD0, tcr)));
	/* R_JHSVW */
	wi->hpd |= s1pie_enabled(vcpu, wi->regime);

	/* Do we have POE? */
	compute_s1poe(vcpu, wi);

	/* R_BVXDG */
	wi->hpd |= (wi->poe || wi->e0poe);

	/* R_PLCGL, R_YXNYW */
	if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
		if (wi->txsz > 39)
			goto transfault;
	} else {
		if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47))
			goto transfault;
	}

	/* R_GTJBY, R_SXWGM */
	switch (BIT(wi->pgshift)) {
	case SZ_4K:
	case SZ_16K:
		lva = wi->pa52bit;
		break;
	case SZ_64K:
		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
		break;
	}

	if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
		goto transfault;

	/* R_YYVYV, I_THCZK */
	if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
	    (va55 && va < GENMASK(63, ia_bits)))
		goto transfault;

	/* I_ZFSYQ */
	if (wi->regime != TR_EL2 &&
	    (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
		goto transfault;

	/* R_BNDVG and following statements */
	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
	    wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0)))
		goto transfault;

	ps = (wi->regime == TR_EL2 ?
	      FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));

	wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps, wi->pa52bit));

	/* Compute minimal alignment */
	x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);

	wi->baddr = ttbr & TTBRx_EL1_BADDR;
	if (wi->pa52bit) {
		/*
		 * Force the alignment on 64 bytes for top-level tables
		 * smaller than 8 entries, since TTBR.BADDR[5:2] are used to
		 * store bits [51:48] of the first level of lookup.
		 */
		x = max(x, 6);

		wi->baddr |= FIELD_GET(GENMASK_ULL(5, 2), ttbr) << 48;
	}

	/* R_VPBBF */
	if (check_output_size(wi->baddr, wi))
		goto addrsz;

	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);

	return 0;

addrsz:
	/*
	 * Address Size Fault level 0 to indicate it comes from TTBR.
	 * yes, this is an oddity.
	 */
	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false);
	return -EFAULT;

transfault:
	/* Translation Fault on start level */
	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(wi->sl), false);
	return -EFAULT;
}

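/*
 * Perform the actual table walk, starting at wi->sl and stopping at
 * either a valid leaf (block or page) or a faulting condition. When
 * the EL1&0 regime runs under an active stage-2, each table address is
 * itself translated (and checked for readability) through the guest
 * hypervisor's stage-2 via kvm_walk_nested_s2().
 */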
static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
		   struct s1_walk_result *wr, u64 va)
{
	u64 va_top, va_bottom, baddr, desc;
	int level, stride, ret;

	level = wi->sl;
	stride = wi->pgshift - 3;
	baddr = wi->baddr;

	va_top = get_ia_size(wi) - 1;

	while (1) {
		u64 index, ipa;

		va_bottom = (3 - level) * stride + wi->pgshift;
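		/* Descriptors are 8 bytes, hence the shift by (va_bottom - 3) */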
		index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);

		ipa = baddr | index;

		if (wi->s2) {
			struct kvm_s2_trans s2_trans = {};

			ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
			if (ret) {
				fail_s1_walk(wr,
					     (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
					     true);
				return ret;
			}

			if (!kvm_s2_trans_readable(&s2_trans)) {
				fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
					     true);

				return -EPERM;
			}

			ipa = kvm_s2_trans_output(&s2_trans);
		}

		if (wi->filter) {
			ret = wi->filter->fn(&(struct s1_walk_context)
					     {
						.wi = wi,
						.table_ipa = baddr,
						.level = level,
					     }, wi->filter->priv);
			if (ret)
				return ret;
		}

		ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
		if (ret) {
			fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false);
			return ret;
		}

		if (wi->be)
			desc = be64_to_cpu((__force __be64)desc);
		else
			desc = le64_to_cpu((__force __le64)desc);

		/* Invalid descriptor */
		if (!(desc & BIT(0)))
			goto transfault;

		/* Block mapping, check validity down the line */
		if (!(desc & BIT(1)))
			break;

		/* Page mapping */
		if (level == 3)
			break;

		/* Table handling */
		if (!wi->hpd) {
			wr->APTable |= FIELD_GET(S1_TABLE_AP, desc);
			wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
			wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
		}

		baddr = desc_to_oa(wi, desc);

		/* Check for out-of-range OA */
		if (check_output_size(baddr, wi))
			goto addrsz;

		/* Prepare for next round */
		va_top = va_bottom - 1;
		level++;
	}

	/* Block mapping, check the validity of the level */
	if (!(desc & BIT(1))) {
		bool valid_block = false;

		switch (BIT(wi->pgshift)) {
		case SZ_4K:
			valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0);
			break;
		case SZ_16K:
		case SZ_64K:
			valid_block = level == 2 || (wi->pa52bit && level == 1);
			break;
		}

		if (!valid_block)
			goto transfault;
	}

	baddr = desc_to_oa(wi, desc);
	if (check_output_size(baddr & GENMASK(52, va_bottom), wi))
		goto addrsz;

	if (!(desc & PTE_AF)) {
		fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false);
		return -EACCES;
	}

	va_bottom += contiguous_bit_shift(desc, wi, level);

	wr->failed = false;
	wr->level = level;
	wr->desc = desc;
	wr->pa = baddr & GENMASK(52, va_bottom);
	wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);

	wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG);
	if (wr->nG) {
		u64 asid_ttbr, tcr;

		switch (wi->regime) {
		case TR_EL10:
			tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
			asid_ttbr = ((tcr & TCR_A1) ?
				     vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
				     vcpu_read_sys_reg(vcpu, TTBR0_EL1));
			break;
		case TR_EL20:
			tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
			asid_ttbr = ((tcr & TCR_A1) ?
				     vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
				     vcpu_read_sys_reg(vcpu, TTBR0_EL2));
			break;
		default:
			BUG();
		}

		wr->asid = FIELD_GET(TTBR_ASID_MASK, asid_ttbr);
		if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) ||
		    !(tcr & TCR_ASID16))
			wr->asid &= GENMASK(7, 0);
	}

	return 0;

addrsz:
	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), false);
	return -EINVAL;
transfault:
	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), false);
	return -ENOENT;
}

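/*
 * Snapshot of the EL1 (and stage-2) MMU registers that get clobbered
 * when we temporarily load the guest's translation context to run a
 * hardware AT instruction on its behalf.
 */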
struct mmu_config {
	u64 ttbr0;
	u64 ttbr1;
	u64 tcr;
	u64 mair;
	u64 tcr2;
	u64 pir;
	u64 pire0;
	u64 por_el0;
	u64 por_el1;
	u64 sctlr;
	u64 vttbr;
	u64 vtcr;
};

static void __mmu_config_save(struct mmu_config *config)
{
	config->ttbr0 = read_sysreg_el1(SYS_TTBR0);
	config->ttbr1 = read_sysreg_el1(SYS_TTBR1);
	config->tcr = read_sysreg_el1(SYS_TCR);
	config->mair = read_sysreg_el1(SYS_MAIR);
	if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
		config->tcr2 = read_sysreg_el1(SYS_TCR2);
		if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
			config->pir = read_sysreg_el1(SYS_PIR);
			config->pire0 = read_sysreg_el1(SYS_PIRE0);
		}
		if (system_supports_poe()) {
			config->por_el1 = read_sysreg_el1(SYS_POR);
			config->por_el0 = read_sysreg_s(SYS_POR_EL0);
		}
	}
	config->sctlr = read_sysreg_el1(SYS_SCTLR);
	config->vttbr = read_sysreg(vttbr_el2);
	config->vtcr = read_sysreg(vtcr_el2);
}

static void __mmu_config_restore(struct mmu_config *config)
{
	/*
	 * ARM errata 1165522 and 1530923 require TGE to be 1 before
	 * we update the guest state.
	 */
	asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));

	write_sysreg_el1(config->ttbr0, SYS_TTBR0);
	write_sysreg_el1(config->ttbr1, SYS_TTBR1);
	write_sysreg_el1(config->tcr, SYS_TCR);
	write_sysreg_el1(config->mair, SYS_MAIR);
	if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
		write_sysreg_el1(config->tcr2, SYS_TCR2);
		if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
			write_sysreg_el1(config->pir, SYS_PIR);
			write_sysreg_el1(config->pire0, SYS_PIRE0);
		}
		if (system_supports_poe()) {
			write_sysreg_el1(config->por_el1, SYS_POR);
			write_sysreg_s(config->por_el0, SYS_POR_EL0);
		}
	}
	write_sysreg_el1(config->sctlr, SYS_SCTLR);
	write_sysreg(config->vttbr, vttbr_el2);
	write_sysreg(config->vtcr, vtcr_el2);
}

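/*
 * AT S1E1{R,W}P honour PSTATE.PAN, so temporarily mirror the guest's
 * PAN bit into the host PSTATE around the AT instruction and restore
 * it afterwards.
 */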
static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	u64 host_pan;
	bool fail;

	host_pan = read_sysreg_s(SYS_PSTATE_PAN);
	write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN);

	switch (op) {
	case OP_AT_S1E1RP:
		fail = __kvm_at(OP_AT_S1E1RP, vaddr);
		break;
	case OP_AT_S1E1WP:
		fail = __kvm_at(OP_AT_S1E1WP, vaddr);
		break;
	}

	write_sysreg_s(host_pan, SYS_PSTATE_PAN);

	return fail;
}

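/*
 * MAIR-style attribute encoding: inner attribute in bits [3:0], outer
 * attribute in bits [7:4]. Device memory has the outer nibble set to 0.
 */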
#define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic)
#define MEMATTR_NC 0b0100
#define MEMATTR_Wt 0b1000
#define MEMATTR_Wb 0b1100
#define MEMATTR_WbRaWa 0b1111

#define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0)

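/* Expand the 4-bit stage-2 MemAttr field into an 8-bit MAIR-like attribute */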
static u8 s2_memattr_to_attr(u8 memattr)
{
	memattr &= 0b1111;

	switch (memattr) {
	case 0b0000:
	case 0b0001:
	case 0b0010:
	case 0b0011:
		return memattr << 2;
	case 0b0100:
		return MEMATTR(Wb, Wb);
	case 0b0101:
		return MEMATTR(NC, NC);
	case 0b0110:
		return MEMATTR(Wt, NC);
	case 0b0111:
		return MEMATTR(Wb, NC);
	case 0b1000:
		/* Reserved, assume NC */
		return MEMATTR(NC, NC);
	case 0b1001:
		return MEMATTR(NC, Wt);
	case 0b1010:
		return MEMATTR(Wt, Wt);
	case 0b1011:
		return MEMATTR(Wb, Wt);
	case 0b1100:
		/* Reserved, assume NC */
		return MEMATTR(NC, NC);
	case 0b1101:
		return MEMATTR(NC, Wb);
	case 0b1110:
		return MEMATTR(Wt, Wb);
	case 0b1111:
		return MEMATTR(Wb, Wb);
	default:
		unreachable();
	}
}

static u8 combine_s1_s2_attr(u8 s1, u8 s2)
{
	bool transient;
	u8 final = 0;

	/* Upgrade transient s1 to non-transient to simplify things */
	switch (s1) {
	case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */
		transient = true;
		s1 = MEMATTR_Wt | (s1 & GENMASK(1,0));
		break;
	case 0b0101 ... 0b0111: /* Normal, Write-Back Transient */
		transient = true;
		s1 = MEMATTR_Wb | (s1 & GENMASK(1,0));
		break;
	default:
		transient = false;
	}

	/* S2CombineS1AttrHints() */
	if ((s1 & GENMASK(3, 2)) == MEMATTR_NC ||
	    (s2 & GENMASK(3, 2)) == MEMATTR_NC)
		final = MEMATTR_NC;
	else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt ||
		 (s2 & GENMASK(3, 2)) == MEMATTR_Wt)
		final = MEMATTR_Wt;
	else
		final = MEMATTR_Wb;

	if (final != MEMATTR_NC) {
		/* Inherit RaWa hints from S1 */
		if (transient) {
			switch (s1 & GENMASK(3, 2)) {
			case MEMATTR_Wt:
				final = 0;
				break;
			case MEMATTR_Wb:
				final = MEMATTR_NC;
				break;
			}
		}

		final |= s1 & GENMASK(1, 0);
	}

	return final;
}

#define ATTR_NSH 0b00
#define ATTR_RSV 0b01
#define ATTR_OSH 0b10
#define ATTR_ISH 0b11

static u8 compute_final_sh(u8 attr, u8 sh)
{
	/* Any form of device, as well as NC has SH[1:0]=0b10 */
	if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC))
		return ATTR_OSH;

	if (sh == ATTR_RSV) /* Reserved, mapped to NSH */
		sh = ATTR_NSH;

	return sh;
}

static u8 compute_s1_sh(struct s1_walk_info *wi, struct s1_walk_result *wr,
			u8 attr)
{
	u8 sh;

	/*
	 * non-52bit and LPA have their basic shareability described in the
	 * descriptor. LPA2 gets it from the corresponding field in TCR,
	 * conveniently recorded in the walk info.
	 */
	if (!wi->pa52bit || BIT(wi->pgshift) == SZ_64K)
		sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_SH, wr->desc);
	else
		sh = wi->sh;

	return compute_final_sh(attr, sh);
}

static u8 combine_sh(u8 s1_sh, u8 s2_sh)
{
	if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
		return ATTR_OSH;
	if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH)
		return ATTR_ISH;

	return ATTR_NSH;
}

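/*
 * Combine the stage-1 PAR with the stage-2 translation result to
 * produce the final PAR_EL1 value, merging memory attributes and
 * shareability (taking HCR_EL2.FWB and CD into account).
 */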
static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
			   struct kvm_s2_trans *tr)
{
	u8 s1_parattr, s2_memattr, final_attr, s2_sh;
	u64 par;

	/* If S2 has failed to translate, report the damage */
	if (tr->esr) {
		par = SYS_PAR_EL1_RES1;
		par |= SYS_PAR_EL1_F;
		par |= SYS_PAR_EL1_S;
		par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr);
		return par;
	}

	s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par);
	s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc);

	if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) {
		if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP))
			s2_memattr &= ~BIT(3);

		/* Combination of R_VRJSW and R_RHWZM */
		switch (s2_memattr) {
		case 0b0101:
			if (MEMATTR_IS_DEVICE(s1_parattr))
				final_attr = s1_parattr;
			else
				final_attr = MEMATTR(NC, NC);
			break;
		case 0b0110:
		case 0b1110:
			final_attr = MEMATTR(WbRaWa, WbRaWa);
			break;
		case 0b0111:
		case 0b1111:
			/* Preserve S1 attribute */
			final_attr = s1_parattr;
			break;
		case 0b0100:
		case 0b1100:
		case 0b1101:
			/* Reserved, do something non-silly */
			final_attr = s1_parattr;
			break;
		default:
			/*
			 * MemAttr[2]=0, Device from S2.
			 *
			 * FWB does not influence the way that stage 1
			 * memory types and attributes are combined
			 * with stage 2 Device type and attributes.
			 */
			final_attr = min(s2_memattr_to_attr(s2_memattr),
					 s1_parattr);
		}
	} else {
		/* Combination of R_HMNDG, R_TNHFM and R_GQFSF */
		u8 s2_parattr = s2_memattr_to_attr(s2_memattr);

		if (MEMATTR_IS_DEVICE(s1_parattr) ||
		    MEMATTR_IS_DEVICE(s2_parattr)) {
			final_attr = min(s1_parattr, s2_parattr);
		} else {
			/* At this stage, this is memory vs memory */
			final_attr = combine_s1_s2_attr(s1_parattr & 0xf,
							s2_parattr & 0xf);
			final_attr |= combine_s1_s2_attr(s1_parattr >> 4,
							 s2_parattr >> 4) << 4;
		}
	}

	if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) &&
	    !MEMATTR_IS_DEVICE(final_attr))
		final_attr = MEMATTR(NC, NC);

	s2_sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_SH, tr->desc);

	par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
	par |= tr->output & GENMASK(47, 12);
	par |= FIELD_PREP(SYS_PAR_EL1_SH,
			  combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
				     compute_final_sh(final_attr, s2_sh)));

	return par;
}

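/*
 * Build the stage-1 only PAR_EL1 value: either a fault encoding, the
 * flat MMU-off mapping, or the translated PA with attributes taken
 * from MAIR and the walk result.
 */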
static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
			  struct s1_walk_result *wr)
{
	u64 par;

	if (wr->failed) {
		par = SYS_PAR_EL1_RES1;
		par |= SYS_PAR_EL1_F;
		par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
		par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
		par |= wr->s2 ? SYS_PAR_EL1_S : 0;
	} else if (wr->level == S1_MMU_DISABLED) {
		/* MMU off or HCR_EL2.DC == 1 */
		par = SYS_PAR_EL1_NSE;
		par |= wr->pa & SYS_PAR_EL1_PA;

		if (wi->regime == TR_EL10 && vcpu_has_nv(vcpu) &&
		    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
			par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
					  MEMATTR(WbRaWa, WbRaWa));
			par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH);
		} else {
			par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
			par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH);
		}
	} else {
		u64 mair, sctlr;
		u8 sh;

		par = SYS_PAR_EL1_NSE;

		mair = (wi->regime == TR_EL10 ?
			vcpu_read_sys_reg(vcpu, MAIR_EL1) :
			vcpu_read_sys_reg(vcpu, MAIR_EL2));

		mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
		mair &= 0xff;

		sctlr = (wi->regime == TR_EL10 ?
			 vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
			 vcpu_read_sys_reg(vcpu, SCTLR_EL2));

		/* Force NC for memory if SCTLR_ELx.C is clear */
		if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
			mair = MEMATTR(NC, NC);

		par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
		par |= wr->pa & SYS_PAR_EL1_PA;

		sh = compute_s1_sh(wi, wr, mair);
		par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
	}

	return par;
}

static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
{
	u64 sctlr;

	if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
		return false;

	if (s1pie_enabled(vcpu, regime))
		return true;

	if (regime == TR_EL10)
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
	else
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);

	return sctlr & SCTLR_EL1_EPAN;
}

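/*
 * Direct (non-indirect) permission model: data permissions come from
 * the AP[2:1] descriptor bits, execute permissions from {P,U}XN, with
 * the WXN and overlay enables recorded for later processing.
 */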
static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu,
					  struct s1_walk_info *wi,
					  struct s1_walk_result *wr)
{
	bool wxn;

	/* Non-hierarchical part of AArch64.S1DirectBasePermissions() */
	if (wi->regime != TR_EL2) {
		switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) {
		case 0b00:
			wr->pr = wr->pw = true;
			wr->ur = wr->uw = false;
			break;
		case 0b01:
			wr->pr = wr->pw = wr->ur = wr->uw = true;
			break;
		case 0b10:
			wr->pr = true;
			wr->pw = wr->ur = wr->uw = false;
			break;
		case 0b11:
			wr->pr = wr->ur = true;
			wr->pw = wr->uw = false;
			break;
		}

		/* We don't use px for anything yet, but hey... */
		wr->px = !((wr->desc & PTE_PXN) || wr->uw);
		wr->ux = !(wr->desc & PTE_UXN);
	} else {
		wr->ur = wr->uw = wr->ux = false;

		if (!(wr->desc & PTE_RDONLY)) {
			wr->pr = wr->pw = true;
		} else {
			wr->pr = true;
			wr->pw = false;
		}

		/* XN maps to UXN */
		wr->px = !(wr->desc & PTE_UXN);
	}

	switch (wi->regime) {
	case TR_EL2:
	case TR_EL20:
		wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN);
		break;
	case TR_EL10:
		wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN);
		break;
	}

	wr->pwxn = wr->uwxn = wxn;
	wr->pov = wi->poe;
	wr->uov = wi->e0poe;
}

static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu,
						struct s1_walk_info *wi,
						struct s1_walk_result *wr)
{
	/* Hierarchical part of AArch64.S1DirectBasePermissions() */
	if (wi->regime != TR_EL2) {
		switch (wr->APTable) {
		case 0b00:
			break;
		case 0b01:
			wr->ur = wr->uw = false;
			break;
		case 0b10:
			wr->pw = wr->uw = false;
			break;
		case 0b11:
			wr->pw = wr->ur = wr->uw = false;
			break;
		}

		wr->px &= !wr->PXNTable;
		wr->ux &= !wr->UXNTable;
	} else {
		if (wr->APTable & BIT(1))
			wr->pw = false;

		/* XN maps to UXN */
		wr->px &= !wr->UXNTable;
	}
}

#define perm_idx(v, r, i) ((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf)

#define set_priv_perms(wr, r, w, x) \
	do { \
		(wr)->pr = (r); \
		(wr)->pw = (w); \
		(wr)->px = (x); \
	} while (0)

#define set_unpriv_perms(wr, r, w, x) \
	do { \
		(wr)->ur = (r); \
		(wr)->uw = (w); \
		(wr)->ux = (x); \
	} while (0)

#define set_priv_wxn(wr, v) \
	do { \
		(wr)->pwxn = (v); \
	} while (0)

#define set_unpriv_wxn(wr, v) \
	do { \
		(wr)->uwxn = (v); \
	} while (0)

/* Similar to AArch64.S1IndirectBasePermissions(), without GCS */
#define set_perms(w, wr, ip) \
	do { \
		/* R_LLZDZ */ \
		switch ((ip)) { \
		case 0b0000: \
			set_ ## w ## _perms((wr), false, false, false); \
			break; \
		case 0b0001: \
			set_ ## w ## _perms((wr), true , false, false); \
			break; \
		case 0b0010: \
			set_ ## w ## _perms((wr), false, false, true ); \
			break; \
		case 0b0011: \
			set_ ## w ## _perms((wr), true , false, true ); \
			break; \
		case 0b0100: \
			set_ ## w ## _perms((wr), false, false, false); \
			break; \
		case 0b0101: \
			set_ ## w ## _perms((wr), true , true , false); \
			break; \
		case 0b0110: \
			set_ ## w ## _perms((wr), true , true , true ); \
			break; \
		case 0b0111: \
			set_ ## w ## _perms((wr), true , true , true ); \
			break; \
		case 0b1000: \
			set_ ## w ## _perms((wr), true , false, false); \
			break; \
		case 0b1001: \
			set_ ## w ## _perms((wr), true , false, false); \
			break; \
		case 0b1010: \
			set_ ## w ## _perms((wr), true , false, true ); \
			break; \
		case 0b1011: \
			set_ ## w ## _perms((wr), false, false, false); \
			break; \
		case 0b1100: \
			set_ ## w ## _perms((wr), true , true , false); \
			break; \
		case 0b1101: \
			set_ ## w ## _perms((wr), false, false, false); \
			break; \
		case 0b1110: \
			set_ ## w ## _perms((wr), true , true , true ); \
			break; \
		case 0b1111: \
			set_ ## w ## _perms((wr), false, false, false); \
			break; \
		} \
		\
		/* R_HJYGR */ \
		set_ ## w ## _wxn((wr), ((ip) == 0b0110)); \
		\
	} while (0)

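/*
 * S1PIE: permissions come from the PIR{E0}_ELx entry selected by the
 * permission index encoded in the descriptor.
 */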
static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu,
					    struct s1_walk_info *wi,
					    struct s1_walk_result *wr)
{
	u8 up, pp, idx;

	idx = pte_pi_index(wr->desc);

	switch (wi->regime) {
	case TR_EL10:
		pp = perm_idx(vcpu, PIR_EL1, idx);
		up = perm_idx(vcpu, PIRE0_EL1, idx);
		break;
	case TR_EL20:
		pp = perm_idx(vcpu, PIR_EL2, idx);
		up = perm_idx(vcpu, PIRE0_EL2, idx);
		break;
	case TR_EL2:
		pp = perm_idx(vcpu, PIR_EL2, idx);
		up = 0;
		break;
	}

	set_perms(priv, wr, pp);

	if (wi->regime != TR_EL2)
		set_perms(unpriv, wr, up);
	else
		set_unpriv_perms(wr, false, false, false);

	wr->pov = wi->poe && !(pp & BIT(3));
	wr->uov = wi->e0poe && !(up & BIT(3));

	/* R_VFPJF */
	if (wr->px && wr->uw) {
		set_priv_perms(wr, false, false, false);
		set_unpriv_perms(wr, false, false, false);
	}
}

static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu,
					   struct s1_walk_info *wi,
					   struct s1_walk_result *wr)
{
	u8 idx, pov_perms, uov_perms;

	idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc);

	if (wr->pov) {
		switch (wi->regime) {
		case TR_EL10:
			pov_perms = perm_idx(vcpu, POR_EL1, idx);
			break;
		case TR_EL20:
			pov_perms = perm_idx(vcpu, POR_EL2, idx);
			break;
		case TR_EL2:
			pov_perms = perm_idx(vcpu, POR_EL2, idx);
			break;
		}

		if (pov_perms & ~POE_RWX)
			pov_perms = POE_NONE;

		/* R_QXXPC, S1PrivOverlay enabled */
		if (wr->pwxn && (pov_perms & POE_X))
			pov_perms &= ~POE_W;

		wr->pr &= pov_perms & POE_R;
		wr->pw &= pov_perms & POE_W;
		wr->px &= pov_perms & POE_X;
	}

	if (wr->uov) {
		switch (wi->regime) {
		case TR_EL10:
			uov_perms = perm_idx(vcpu, POR_EL0, idx);
			break;
		case TR_EL20:
			uov_perms = perm_idx(vcpu, POR_EL0, idx);
			break;
		case TR_EL2:
			uov_perms = 0;
			break;
		}

		if (uov_perms & ~POE_RWX)
			uov_perms = POE_NONE;

		/* R_NPBXC, S1UnprivOverlay enabled */
		if (wr->uwxn && (uov_perms & POE_X))
			uov_perms &= ~POE_W;

		wr->ur &= uov_perms & POE_R;
		wr->uw &= uov_perms & POE_W;
		wr->ux &= uov_perms & POE_X;
	}
}

static void compute_s1_permissions(struct kvm_vcpu *vcpu,
				   struct s1_walk_info *wi,
				   struct s1_walk_result *wr)
{
	bool pan;

	if (!s1pie_enabled(vcpu, wi->regime))
		compute_s1_direct_permissions(vcpu, wi, wr);
	else
		compute_s1_indirect_permissions(vcpu, wi, wr);

	if (!wi->hpd)
		compute_s1_hierarchical_permissions(vcpu, wi, wr);

	compute_s1_overlay_permissions(vcpu, wi, wr);

	/* R_QXXPC, S1PrivOverlay disabled */
	if (!wr->pov)
		wr->px &= !(wr->pwxn && wr->pw);

	/* R_NPBXC, S1UnprivOverlay disabled */
	if (!wr->uov)
		wr->ux &= !(wr->uwxn && wr->uw);

	pan = wi->pan && (wr->ur || wr->uw ||
			  (pan3_enabled(vcpu, wi->regime) && wr->ux));
	wr->pw &= !pan;
	wr->pr &= !pan;
}

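/*
 * Software emulation of the AT instruction: walk the guest's stage-1
 * tables by hand, compute the resulting permissions, and synthesise a
 * PAR_EL1 value (including the permission check the AT would perform).
 */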
static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	struct s1_walk_result wr = {};
	struct s1_walk_info wi = {};
	bool perm_fail = false;
	int ret, idx;

	wi.regime = compute_translation_regime(vcpu, op);
	wi.as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
	wi.pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) &&
		 (*vcpu_cpsr(vcpu) & PSR_PAN_BIT);

	ret = setup_s1_walk(vcpu, &wi, &wr, vaddr);
	if (ret)
		goto compute_par;

	if (wr.level == S1_MMU_DISABLED)
		goto compute_par;

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	ret = walk_s1(vcpu, &wi, &wr, vaddr);

	srcu_read_unlock(&vcpu->kvm->srcu, idx);

	if (ret)
		goto compute_par;

	compute_s1_permissions(vcpu, &wi, &wr);

	switch (op) {
	case OP_AT_S1E1RP:
	case OP_AT_S1E1R:
	case OP_AT_S1E2R:
		perm_fail = !wr.pr;
		break;
	case OP_AT_S1E1WP:
	case OP_AT_S1E1W:
	case OP_AT_S1E2W:
		perm_fail = !wr.pw;
		break;
	case OP_AT_S1E0R:
		perm_fail = !wr.ur;
		break;
	case OP_AT_S1E0W:
		perm_fail = !wr.uw;
		break;
	case OP_AT_S1E1A:
	case OP_AT_S1E2A:
		break;
	default:
		BUG();
	}

	if (perm_fail)
		fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false);

compute_par:
	return compute_par_s1(vcpu, &wi, &wr);
}

/*
 * Return the PAR_EL1 value as the result of a valid translation.
 *
 * If the translation is unsuccessful, the value may only contain
 * PAR_EL1.F, and cannot be taken at face value. It isn't an
 * indication of the translation having failed, only that the fast
 * path did not succeed, *unless* it indicates a S1 permission or
 * access fault.
 */
static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	struct mmu_config config;
	struct kvm_s2_mmu *mmu;
	bool fail, mmu_cs;
	u64 par;

	par = SYS_PAR_EL1_F;

	/*
	 * We've trapped, so everything is live on the CPU. As we will
	 * be switching contexts behind everybody's back, disable
	 * interrupts while holding the mmu lock.
	 */
	guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock);

	/*
	 * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already
	 * the right one (as we trapped from vEL2). If not, save the
	 * full MMU context.
	 *
	 * We are also guaranteed to be in the correct context if
	 * we're not in a nested VM.
	 */
	mmu_cs = (vcpu_has_nv(vcpu) &&
		  !(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)));
	if (!mmu_cs)
		goto skip_mmu_switch;

	/*
	 * Obtaining the S2 MMU for a L2 is horribly racy, and we may not
	 * find it (recycled by another vcpu, for example). When this
	 * happens, admit defeat immediately and use the SW (slow) path.
	 */
	mmu = lookup_s2_mmu(vcpu);
	if (!mmu)
		return par;

	__mmu_config_save(&config);

	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0);
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1);
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR);
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR);
	if (kvm_has_tcr2(vcpu->kvm)) {
		write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2);
		if (kvm_has_s1pie(vcpu->kvm)) {
			write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR);
			write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0);
		}
		if (kvm_has_s1poe(vcpu->kvm)) {
			write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR);
			write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0);
		}
	}
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR);
	__load_stage2(mmu, mmu->arch);

skip_mmu_switch:
	/* Temporarily switch back to guest context */
	write_sysreg_hcr(vcpu->arch.hcr_el2);
	isb();

	switch (op) {
	case OP_AT_S1E1RP:
	case OP_AT_S1E1WP:
		fail = at_s1e1p_fast(vcpu, op, vaddr);
		break;
	case OP_AT_S1E1R:
		fail = __kvm_at(OP_AT_S1E1R, vaddr);
		break;
	case OP_AT_S1E1W:
		fail = __kvm_at(OP_AT_S1E1W, vaddr);
		break;
	case OP_AT_S1E0R:
		fail = __kvm_at(OP_AT_S1E0R, vaddr);
		break;
	case OP_AT_S1E0W:
		fail = __kvm_at(OP_AT_S1E0W, vaddr);
		break;
	case OP_AT_S1E1A:
		fail = __kvm_at(OP_AT_S1E1A, vaddr);
		break;
	default:
		WARN_ON_ONCE(1);
		fail = true;
		break;
	}

	if (!fail)
		par = read_sysreg_par();

	write_sysreg_hcr(HCR_HOST_VHE_FLAGS);

	if (mmu_cs)
		__mmu_config_restore(&config);

	return par;
}

static bool par_check_s1_perm_fault(u64 par)
{
	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);

	return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
		!(par & SYS_PAR_EL1_S));
}

static bool par_check_s1_access_fault(u64 par)
{
	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);

	return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS &&
		!(par & SYS_PAR_EL1_S));
}

void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);

	/*
	 * If PAR_EL1 reports that AT failed on a S1 permission or access
	 * fault, we know for sure that the PTW was able to walk the S1
	 * tables and there's nothing else to do.
	 *
	 * If AT failed for any other reason, then we must walk the guest S1
	 * to emulate the instruction.
	 */
	if ((par & SYS_PAR_EL1_F) &&
	    !par_check_s1_perm_fault(par) &&
	    !par_check_s1_access_fault(par))
		par = handle_at_slow(vcpu, op, vaddr);

	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
}

void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	u64 par;

	/*
	 * We've trapped, so everything is live on the CPU. As we will be
	 * switching context behind everybody's back, disable interrupts...
	 */
	scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) {
		u64 val, hcr;
		bool fail;

		val = hcr = read_sysreg(hcr_el2);
		val &= ~HCR_TGE;
		val |= HCR_VM;

		if (!vcpu_el2_e2h_is_set(vcpu))
			val |= HCR_NV | HCR_NV1;

		write_sysreg_hcr(val);
		isb();

		par = SYS_PAR_EL1_F;

		switch (op) {
		case OP_AT_S1E2R:
			fail = __kvm_at(OP_AT_S1E1R, vaddr);
			break;
		case OP_AT_S1E2W:
			fail = __kvm_at(OP_AT_S1E1W, vaddr);
			break;
		case OP_AT_S1E2A:
			fail = __kvm_at(OP_AT_S1E1A, vaddr);
			break;
		default:
			WARN_ON_ONCE(1);
			fail = true;
		}

		isb();

		if (!fail)
			par = read_sysreg_par();

		write_sysreg_hcr(hcr);
		isb();
	}

	/* We failed the translation, let's replay it in slow motion */
	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
		par = handle_at_slow(vcpu, op, vaddr);

	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
}

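/*
 * Emulate AT S12E{0,1}{R,W}: do the stage-1 part through the S1E*
 * handler, then walk the guest hypervisor's stage-2 and merge the two
 * results into PAR_EL1.
 */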
void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	struct kvm_s2_trans out = {};
	u64 ipa, par;
	bool write;
	int ret;

	/* Do the stage-1 translation */
	switch (op) {
	case OP_AT_S12E1R:
		op = OP_AT_S1E1R;
		write = false;
		break;
	case OP_AT_S12E1W:
		op = OP_AT_S1E1W;
		write = true;
		break;
	case OP_AT_S12E0R:
		op = OP_AT_S1E0R;
		write = false;
		break;
	case OP_AT_S12E0W:
		op = OP_AT_S1E0W;
		write = true;
		break;
	default:
		WARN_ON_ONCE(1);
		return;
	}

	__kvm_at_s1e01(vcpu, op, vaddr);
	par = vcpu_read_sys_reg(vcpu, PAR_EL1);
	if (par & SYS_PAR_EL1_F)
		return;

	/*
	 * If we only have a single stage of translation (EL2&0), exit
	 * early. Same thing if {VM,DC}=={0,0}.
	 */
	if (compute_translation_regime(vcpu, op) == TR_EL20 ||
	    !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
		return;

	/* Do the stage-2 translation */
	ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
	out.esr = 0;
	ret = kvm_walk_nested_s2(vcpu, ipa, &out);
	if (ret < 0)
		return;

	/* Check the access permission */
	if (!out.esr &&
	    ((!write && !out.readable) || (write && !out.writable)))
		out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);

	par = compute_par_s12(vcpu, par, &out);
	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
}

/*
 * Translate a VA for a given EL in a given translation regime, with
 * or without PAN. This requires wi->{regime, as_el0, pan} to be
 * set. The rest of the wi and wr should be 0-initialised.
 */
int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
		       struct s1_walk_result *wr, u64 va)
{
	int ret;

	ret = setup_s1_walk(vcpu, wi, wr, va);
	if (ret)
		return ret;

	if (wr->level == S1_MMU_DISABLED) {
		wr->ur = wr->uw = wr->ux = true;
		wr->pr = wr->pw = wr->px = true;
	} else {
		ret = walk_s1(vcpu, wi, wr, va);
		if (ret)
			return ret;

		compute_s1_permissions(vcpu, wi, wr);
	}

	return 0;
}

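/*
 * Walk filter used by __kvm_find_s1_desc_level(): stop the walk (via
 * -EINTR) as soon as a table at the given IPA is reached, recording
 * the level at which it was found.
 */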
struct desc_match {
	u64 ipa;
	int level;
};

static int match_s1_desc(struct s1_walk_context *ctxt, void *priv)
{
	struct desc_match *dm = priv;
	u64 ipa = dm->ipa;

	/* Use S1 granule alignment */
	ipa &= GENMASK(51, ctxt->wi->pgshift);

	/* Not the IPA we're looking for? Continue. */
	if (ipa != ctxt->table_ipa)
		return 0;

	/* Note the level and interrupt the walk */
	dm->level = ctxt->level;
	return -EINTR;
}

int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level)
{
	struct desc_match dm = {
		.ipa = ipa,
	};
	struct s1_walk_info wi = {
		.filter = &(struct s1_walk_filter){
			.fn = match_s1_desc,
			.priv = &dm,
		},
		.regime = TR_EL10,
		.as_el0 = false,
		.pan = false,
	};
	struct s1_walk_result wr = {};
	int ret;

	ret = setup_s1_walk(vcpu, &wi, &wr, va);
	if (ret)
		return ret;

	/* We really expect the S1 MMU to be on here... */
	if (WARN_ON_ONCE(wr.level == S1_MMU_DISABLED)) {
		*level = 0;
		return 0;
	}

	/* Walk the guest's PT, looking for a match along the way */
	ret = walk_s1(vcpu, &wi, &wr, va);
	switch (ret) {
	case -EINTR:
		/* We interrupted the walk on a match, return the level */
		*level = dm.level;
		return 0;
	case 0:
		/* The walk completed, we failed to find the entry */
		return -ENOENT;
	default:
		/* Any other error... */
		return ret;
	}
}