1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (C) 2017 - Linaro Ltd
4 * Author: Jintack Lim <jintack.lim@linaro.org>
5 */
6
7 #include <linux/kvm_host.h>
8
9 #include <asm/esr.h>
10 #include <asm/kvm_hyp.h>
11 #include <asm/kvm_mmu.h>
12
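/*
 * Record a stage-1 walk failure: stash the fault status code and note
 * whether the fault was taken on a stage-2 walk of a stage-1 table.
 */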
13 static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw)
14 {
15 wr->fst = fst;
16 wr->ptw = s1ptw;
17 wr->s2 = s1ptw;
18 wr->failed = true;
19 }
20
21 #define S1_MMU_DISABLED (-127)
22
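/* Input address size in bits for the configured range, i.e. 64 - TxSZ */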
23 static int get_ia_size(struct s1_walk_info *wi)
24 {
25 return 64 - wi->txsz;
26 }
27
28 /* Return true if the IPA is out of the OA range */
29 static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
30 {
31 if (wi->pa52bit)
32 return wi->max_oa_bits < 52 && (ipa & GENMASK_ULL(51, wi->max_oa_bits));
33 return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
34 }
35
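/*
 * Does this translation regime use 52bit PAs? This requires either
 * FEAT_LPA (64k granule, PS == 0b0110) or FEAT_LPA2 (4k/16k granule
 * with TCR_ELx.DS set), backed by the corresponding ID register fields.
 */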
36 static bool has_52bit_pa(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, u64 tcr)
37 {
38 switch (BIT(wi->pgshift)) {
39 case SZ_64K:
40 default: /* IMPDEF: treat any other value as 64k */
41 if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52))
42 return false;
43 return ((wi->regime == TR_EL2 ?
44 FIELD_GET(TCR_EL2_PS_MASK, tcr) :
45 FIELD_GET(TCR_IPS_MASK, tcr)) == 0b0110);
46 case SZ_16K:
47 if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT))
48 return false;
49 break;
50 case SZ_4K:
51 if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT))
52 return false;
53 break;
54 }
55
56 return (tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS));
57 }
58
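/*
 * Extract the output address from a descriptor, taking the 52bit PA
 * layouts into account: bits [51:50] (LPA2) or [51:48] (LPA) live in
 * dedicated fields of the descriptor.
 */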
59 static u64 desc_to_oa(struct s1_walk_info *wi, u64 desc)
60 {
61 u64 addr;
62
63 if (!wi->pa52bit)
64 return desc & GENMASK_ULL(47, wi->pgshift);
65
66 switch (BIT(wi->pgshift)) {
67 case SZ_4K:
68 case SZ_16K:
69 addr = desc & GENMASK_ULL(49, wi->pgshift);
70 addr |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, desc) << 50;
71 break;
72 case SZ_64K:
73 default: /* IMPDEF: treat any other value as 64k */
74 addr = desc & GENMASK_ULL(47, wi->pgshift);
75 addr |= FIELD_GET(KVM_PTE_ADDR_51_48, desc) << 48;
76 break;
77 }
78
79 return addr;
80 }
81
82 /* Return the translation regime that applies to an AT instruction */
83 static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
84 {
85 /*
86 * We only get here from guest EL2, so the translation
87 * regime AT applies to is solely defined by {E2H,TGE}.
88 */
89 switch (op) {
90 case OP_AT_S1E2R:
91 case OP_AT_S1E2W:
92 case OP_AT_S1E2A:
93 return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
94 default:
95 return (vcpu_el2_e2h_is_set(vcpu) &&
96 vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
97 }
98 }
99
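/*
 * TCR2_ELx as seen by the guest's translation regime, which reads as 0
 * for EL1&0 when HCRX_EL2.TCR2En is clear.
 */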
100 static u64 effective_tcr2(struct kvm_vcpu *vcpu, enum trans_regime regime)
101 {
102 if (regime == TR_EL10) {
103 if (vcpu_has_nv(vcpu) &&
104 !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En))
105 return 0;
106
107 return vcpu_read_sys_reg(vcpu, TCR2_EL1);
108 }
109
110 return vcpu_read_sys_reg(vcpu, TCR2_EL2);
111 }
112
113 static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
114 {
115 if (!kvm_has_s1pie(vcpu->kvm))
116 return false;
117
118 /* Abuse TCR2_EL1_PIE and use it for EL2 as well */
119 return effective_tcr2(vcpu, regime) & TCR2_EL1_PIE;
120 }
121
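/* Work out whether the S1 permission overlay applies to privileged and EL0 accesses */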
122 static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi)
123 {
124 u64 val;
125
126 if (!kvm_has_s1poe(vcpu->kvm)) {
127 wi->poe = wi->e0poe = false;
128 return;
129 }
130
131 val = effective_tcr2(vcpu, wi->regime);
132
133 /* Abuse TCR2_EL1_* for EL2 */
134 wi->poe = val & TCR2_EL1_POE;
135 wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE);
136 }
137
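/*
 * Decode SCTLR/TCR/TTBR for the translation regime and populate the
 * walk info, performing the architectural configuration checks along
 * the way. Returns -EFAULT (with wr describing the fault) if the walk
 * cannot even start.
 */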
138 static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
139 struct s1_walk_result *wr, u64 va)
140 {
141 u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr;
142 unsigned int stride, x;
143 bool va55, tbi, lva;
144
145 va55 = va & BIT(55);
146
147 if (vcpu_has_nv(vcpu)) {
148 hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
149 wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));
150 } else {
151 WARN_ON_ONCE(wi->regime != TR_EL10);
152 wi->s2 = false;
153 hcr = 0;
154 }
155
156 switch (wi->regime) {
157 case TR_EL10:
158 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
159 tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
160 ttbr = (va55 ?
161 vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
162 vcpu_read_sys_reg(vcpu, TTBR0_EL1));
163 break;
164 case TR_EL2:
165 case TR_EL20:
166 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
167 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
168 ttbr = (va55 ?
169 vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
170 vcpu_read_sys_reg(vcpu, TTBR0_EL2));
171 break;
172 default:
173 BUG();
174 }
175
176 /* Someone was silly enough to encode TG0/TG1 differently */
177 if (va55 && wi->regime != TR_EL2) {
178 wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
179 tg = FIELD_GET(TCR_TG1_MASK, tcr);
180
181 switch (tg << TCR_TG1_SHIFT) {
182 case TCR_TG1_4K:
183 wi->pgshift = 12; break;
184 case TCR_TG1_16K:
185 wi->pgshift = 14; break;
186 case TCR_TG1_64K:
187 default: /* IMPDEF: treat any other value as 64k */
188 wi->pgshift = 16; break;
189 }
190 } else {
191 wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
192 tg = FIELD_GET(TCR_TG0_MASK, tcr);
193
194 switch (tg << TCR_TG0_SHIFT) {
195 case TCR_TG0_4K:
196 wi->pgshift = 12; break;
197 case TCR_TG0_16K:
198 wi->pgshift = 14; break;
199 case TCR_TG0_64K:
200 default: /* IMPDEF: treat any other value as 64k */
201 wi->pgshift = 16; break;
202 }
203 }
204
205 wi->pa52bit = has_52bit_pa(vcpu, wi, tcr);
206
207 ia_bits = get_ia_size(wi);
208
209 /* AArch64.S1StartLevel() */
210 stride = wi->pgshift - 3;
211 wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
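/*
 * For example, 4k pages (pgshift = 12, stride = 9) with T0SZ = 25
 * give ia_bits = 39 and sl = 3 - ((38 - 12) / 9) = 1, i.e. the walk
 * starts at level 1.
 */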
212
213 if (wi->regime == TR_EL2 && va55)
214 goto addrsz;
215
216 tbi = (wi->regime == TR_EL2 ?
217 FIELD_GET(TCR_EL2_TBI, tcr) :
218 (va55 ?
219 FIELD_GET(TCR_TBI1, tcr) :
220 FIELD_GET(TCR_TBI0, tcr)));
221
222 if (!tbi && (u64)sign_extend64(va, 55) != va)
223 goto addrsz;
224
225 wi->sh = (wi->regime == TR_EL2 ?
226 FIELD_GET(TCR_EL2_SH0_MASK, tcr) :
227 (va55 ?
228 FIELD_GET(TCR_SH1_MASK, tcr) :
229 FIELD_GET(TCR_SH0_MASK, tcr)));
230
231 va = (u64)sign_extend64(va, 55);
232
233 /* Let's put the MMU disabled case aside immediately */
234 switch (wi->regime) {
235 case TR_EL10:
236 /*
237 * If dealing with the EL1&0 translation regime, 3 things
238 * can disable the S1 translation:
239 *
240 * - HCR_EL2.DC = 1
241 * - HCR_EL2.{E2H,TGE} = {0,1}
242 * - SCTLR_EL1.M = 0
243 *
244 * The TGE part is interesting. If we have decided that this
245 * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
246 * {0,x}, and we only need to test for TGE == 1.
247 */
248 if (hcr & (HCR_DC | HCR_TGE)) {
249 wr->level = S1_MMU_DISABLED;
250 break;
251 }
252 fallthrough;
253 case TR_EL2:
254 case TR_EL20:
255 if (!(sctlr & SCTLR_ELx_M))
256 wr->level = S1_MMU_DISABLED;
257 break;
258 }
259
260 if (wr->level == S1_MMU_DISABLED) {
261 if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
262 goto addrsz;
263
264 wr->pa = va;
265 return 0;
266 }
267
268 wi->be = sctlr & SCTLR_ELx_EE;
269
270 wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
271 wi->hpd &= (wi->regime == TR_EL2 ?
272 FIELD_GET(TCR_EL2_HPD, tcr) :
273 (va55 ?
274 FIELD_GET(TCR_HPD1, tcr) :
275 FIELD_GET(TCR_HPD0, tcr)));
276 /* R_JHSVW */
277 wi->hpd |= s1pie_enabled(vcpu, wi->regime);
278
279 /* Do we have POE? */
280 compute_s1poe(vcpu, wi);
281
282 /* R_BVXDG */
283 wi->hpd |= (wi->poe || wi->e0poe);
284
285 /* R_PLCGL, R_YXNYW */
286 if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
287 if (wi->txsz > 39)
288 goto transfault;
289 } else {
290 if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47))
291 goto transfault;
292 }
293
294 /* R_GTJBY, R_SXWGM */
295 switch (BIT(wi->pgshift)) {
296 case SZ_4K:
297 case SZ_16K:
298 lva = wi->pa52bit;
299 break;
300 case SZ_64K:
301 lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
302 break;
303 }
304
305 if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
306 goto transfault;
307
308 /* R_YYVYV, I_THCZK */
309 if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
310 (va55 && va < GENMASK(63, ia_bits)))
311 goto transfault;
312
313 /* I_ZFSYQ */
314 if (wi->regime != TR_EL2 &&
315 (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
316 goto transfault;
317
318 /* R_BNDVG and following statements */
319 if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
320 wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0)))
321 goto transfault;
322
323 ps = (wi->regime == TR_EL2 ?
324 FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
325
326 wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps, wi->pa52bit));
327
328 /* Compute minimal alignment */
329 x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
330
331 wi->baddr = ttbr & TTBRx_EL1_BADDR;
332 if (wi->pa52bit) {
333 /*
334 * Force the alignment on 64 bytes for top-level tables
335 * smaller than 8 entries, since TTBR.BADDR[5:2] are used to
336 * store bits [51:48] of the first level of lookup.
337 */
338 x = max(x, 6);
339
340 wi->baddr |= FIELD_GET(GENMASK_ULL(5, 2), ttbr) << 48;
341 }
342
343 /* R_VPBBF */
344 if (check_output_size(wi->baddr, wi))
345 goto addrsz;
346
347 wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
348
349 return 0;
350
351 addrsz:
352 /*
353 * Address Size Fault level 0 to indicate it comes from TTBR.
354 * Yes, this is an oddity.
355 */
356 fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false);
357 return -EFAULT;
358
359 transfault:
360 /* Translation Fault on start level */
361 fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(wi->sl), false);
362 return -EFAULT;
363 }
364
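/*
 * Perform the actual table walk: one descriptor per level, each table
 * address translated through stage-2 when applicable, until a valid
 * block or page mapping is found.
 */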
365 static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
366 struct s1_walk_result *wr, u64 va)
367 {
368 u64 va_top, va_bottom, baddr, desc;
369 int level, stride, ret;
370
371 level = wi->sl;
372 stride = wi->pgshift - 3;
373 baddr = wi->baddr;
374
375 va_top = get_ia_size(wi) - 1;
376
377 while (1) {
378 u64 index, ipa;
379
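/*
 * va_bottom is the lowest VA bit translated at this level; since
 * descriptors are 8 bytes, shifting by (va_bottom - 3) turns the
 * index field straight into a byte offset within the table.
 */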
380 va_bottom = (3 - level) * stride + wi->pgshift;
381 index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
382
383 ipa = baddr | index;
384
385 if (wi->s2) {
386 struct kvm_s2_trans s2_trans = {};
387
388 ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
389 if (ret) {
390 fail_s1_walk(wr,
391 (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
392 true);
393 return ret;
394 }
395
396 if (!kvm_s2_trans_readable(&s2_trans)) {
397 fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
398 true);
399
400 return -EPERM;
401 }
402
403 ipa = kvm_s2_trans_output(&s2_trans);
404 }
405
406 if (wi->filter) {
407 ret = wi->filter->fn(&(struct s1_walk_context)
408 {
409 .wi = wi,
410 .table_ipa = baddr,
411 .level = level,
412 }, wi->filter->priv);
413 if (ret)
414 return ret;
415 }
416
417 ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
418 if (ret) {
419 fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false);
420 return ret;
421 }
422
423 if (wi->be)
424 desc = be64_to_cpu((__force __be64)desc);
425 else
426 desc = le64_to_cpu((__force __le64)desc);
427
428 /* Invalid descriptor */
429 if (!(desc & BIT(0)))
430 goto transfault;
431
432 /* Block mapping, check validity down the line */
433 if (!(desc & BIT(1)))
434 break;
435
436 /* Page mapping */
437 if (level == 3)
438 break;
439
440 /* Table handling */
441 if (!wi->hpd) {
442 wr->APTable |= FIELD_GET(S1_TABLE_AP, desc);
443 wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
444 wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
445 }
446
447 baddr = desc_to_oa(wi, desc);
448
449 /* Check for out-of-range OA */
450 if (check_output_size(baddr, wi))
451 goto addrsz;
452
453 /* Prepare for next round */
454 va_top = va_bottom - 1;
455 level++;
456 }
457
458 /* Block mapping, check the validity of the level */
459 if (!(desc & BIT(1))) {
460 bool valid_block = false;
461
462 switch (BIT(wi->pgshift)) {
463 case SZ_4K:
464 valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0);
465 break;
466 case SZ_16K:
467 case SZ_64K:
468 valid_block = level == 2 || (wi->pa52bit && level == 1);
469 break;
470 }
471
472 if (!valid_block)
473 goto transfault;
474 }
475
476 baddr = desc_to_oa(wi, desc);
477 if (check_output_size(baddr & GENMASK(52, va_bottom), wi))
478 goto addrsz;
479
480 if (!(desc & PTE_AF)) {
481 fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false);
482 return -EACCES;
483 }
484
485 va_bottom += contiguous_bit_shift(desc, wi, level);
486
487 wr->failed = false;
488 wr->level = level;
489 wr->desc = desc;
490 wr->pa = baddr & GENMASK(52, va_bottom);
491 wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);
492
493 wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG);
494 if (wr->nG) {
495 u64 asid_ttbr, tcr;
496
497 switch (wi->regime) {
498 case TR_EL10:
499 tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
500 asid_ttbr = ((tcr & TCR_A1) ?
501 vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
502 vcpu_read_sys_reg(vcpu, TTBR0_EL1));
503 break;
504 case TR_EL20:
505 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
506 asid_ttbr = ((tcr & TCR_A1) ?
507 vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
508 vcpu_read_sys_reg(vcpu, TTBR0_EL2));
509 break;
510 default:
511 BUG();
512 }
513
514 wr->asid = FIELD_GET(TTBR_ASID_MASK, asid_ttbr);
515 if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) ||
516 !(tcr & TCR_ASID16))
517 wr->asid &= GENMASK(7, 0);
518 }
519
520 return 0;
521
522 addrsz:
523 fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), false);
524 return -EINVAL;
525 transfault:
526 fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), false);
527 return -ENOENT;
528 }
529
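/* Host S1/S2 MMU state saved/restored around the hardware-assisted AT path */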
530 struct mmu_config {
531 u64 ttbr0;
532 u64 ttbr1;
533 u64 tcr;
534 u64 mair;
535 u64 tcr2;
536 u64 pir;
537 u64 pire0;
538 u64 por_el0;
539 u64 por_el1;
540 u64 sctlr;
541 u64 vttbr;
542 u64 vtcr;
543 };
544
545 static void __mmu_config_save(struct mmu_config *config)
546 {
547 config->ttbr0 = read_sysreg_el1(SYS_TTBR0);
548 config->ttbr1 = read_sysreg_el1(SYS_TTBR1);
549 config->tcr = read_sysreg_el1(SYS_TCR);
550 config->mair = read_sysreg_el1(SYS_MAIR);
551 if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
552 config->tcr2 = read_sysreg_el1(SYS_TCR2);
553 if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
554 config->pir = read_sysreg_el1(SYS_PIR);
555 config->pire0 = read_sysreg_el1(SYS_PIRE0);
556 }
557 if (system_supports_poe()) {
558 config->por_el1 = read_sysreg_el1(SYS_POR);
559 config->por_el0 = read_sysreg_s(SYS_POR_EL0);
560 }
561 }
562 config->sctlr = read_sysreg_el1(SYS_SCTLR);
563 config->vttbr = read_sysreg(vttbr_el2);
564 config->vtcr = read_sysreg(vtcr_el2);
565 }
566
567 static void __mmu_config_restore(struct mmu_config *config)
568 {
569 /*
570 * ARM errata 1165522 and 1530923 require TGE to be 1 before
571 * we update the guest state.
572 */
573 asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
574
575 write_sysreg_el1(config->ttbr0, SYS_TTBR0);
576 write_sysreg_el1(config->ttbr1, SYS_TTBR1);
577 write_sysreg_el1(config->tcr, SYS_TCR);
578 write_sysreg_el1(config->mair, SYS_MAIR);
579 if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
580 write_sysreg_el1(config->tcr2, SYS_TCR2);
581 if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
582 write_sysreg_el1(config->pir, SYS_PIR);
583 write_sysreg_el1(config->pire0, SYS_PIRE0);
584 }
585 if (system_supports_poe()) {
586 write_sysreg_el1(config->por_el1, SYS_POR);
587 write_sysreg_s(config->por_el0, SYS_POR_EL0);
588 }
589 }
590 write_sysreg_el1(config->sctlr, SYS_SCTLR);
591 write_sysreg(config->vttbr, vttbr_el2);
592 write_sysreg(config->vtcr, vtcr_el2);
593 }
594
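/*
 * Run AT S1E1{R,W}P with PSTATE.PAN temporarily reflecting the guest's
 * PAN bit, and restore the host's value afterwards.
 */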
595 static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
596 {
597 u64 host_pan;
598 bool fail;
599
600 host_pan = read_sysreg_s(SYS_PSTATE_PAN);
601 write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN);
602
603 switch (op) {
604 case OP_AT_S1E1RP:
605 fail = __kvm_at(OP_AT_S1E1RP, vaddr);
606 break;
607 case OP_AT_S1E1WP:
608 fail = __kvm_at(OP_AT_S1E1WP, vaddr);
609 break;
610 }
611
612 write_sysreg_s(host_pan, SYS_PSTATE_PAN);
613
614 return fail;
615 }
616
617 #define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic)
618 #define MEMATTR_NC 0b0100
619 #define MEMATTR_Wt 0b1000
620 #define MEMATTR_Wb 0b1100
621 #define MEMATTR_WbRaWa 0b1111
622
623 #define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0)
624
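/*
 * Expand a 4bit stage-2 MemAttr field into the MAIR-style 8bit
 * encoding used when combining it with the stage-1 attributes.
 */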
625 static u8 s2_memattr_to_attr(u8 memattr)
626 {
627 memattr &= 0b1111;
628
629 switch (memattr) {
630 case 0b0000:
631 case 0b0001:
632 case 0b0010:
633 case 0b0011:
634 return memattr << 2;
635 case 0b0100:
636 return MEMATTR(Wb, Wb);
637 case 0b0101:
638 return MEMATTR(NC, NC);
639 case 0b0110:
640 return MEMATTR(Wt, NC);
641 case 0b0111:
642 return MEMATTR(Wb, NC);
643 case 0b1000:
644 /* Reserved, assume NC */
645 return MEMATTR(NC, NC);
646 case 0b1001:
647 return MEMATTR(NC, Wt);
648 case 0b1010:
649 return MEMATTR(Wt, Wt);
650 case 0b1011:
651 return MEMATTR(Wb, Wt);
652 case 0b1100:
653 /* Reserved, assume NC */
654 return MEMATTR(NC, NC);
655 case 0b1101:
656 return MEMATTR(NC, Wb);
657 case 0b1110:
658 return MEMATTR(Wt, Wb);
659 case 0b1111:
660 return MEMATTR(Wb, Wb);
661 default:
662 unreachable();
663 }
664 }
665
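/* Combine S1 and S2 cacheability for a single (Inner or Outer) nibble */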
666 static u8 combine_s1_s2_attr(u8 s1, u8 s2)
667 {
668 bool transient;
669 u8 final = 0;
670
671 /* Upgrade transient s1 to non-transient to simplify things */
672 switch (s1) {
673 case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */
674 transient = true;
675 s1 = MEMATTR_Wt | (s1 & GENMASK(1,0));
676 break;
677 case 0b0101 ... 0b0111: /* Normal, Write-Back Transient */
678 transient = true;
679 s1 = MEMATTR_Wb | (s1 & GENMASK(1,0));
680 break;
681 default:
682 transient = false;
683 }
684
685 /* S2CombineS1AttrHints() */
686 if ((s1 & GENMASK(3, 2)) == MEMATTR_NC ||
687 (s2 & GENMASK(3, 2)) == MEMATTR_NC)
688 final = MEMATTR_NC;
689 else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt ||
690 (s2 & GENMASK(3, 2)) == MEMATTR_Wt)
691 final = MEMATTR_Wt;
692 else
693 final = MEMATTR_Wb;
694
695 if (final != MEMATTR_NC) {
696 /* Inherit RaWa hints from S1 */
697 if (transient) {
698 switch (s1 & GENMASK(3, 2)) {
699 case MEMATTR_Wt:
700 final = 0;
701 break;
702 case MEMATTR_Wb:
703 final = MEMATTR_NC;
704 break;
705 }
706 }
707
708 final |= s1 & GENMASK(1, 0);
709 }
710
711 return final;
712 }
713
714 #define ATTR_NSH 0b00
715 #define ATTR_RSV 0b01
716 #define ATTR_OSH 0b10
717 #define ATTR_ISH 0b11
718
719 static u8 compute_final_sh(u8 attr, u8 sh)
720 {
721 /* Any form of device, as well as NC, has SH[1:0]=0b10 */
722 if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC))
723 return ATTR_OSH;
724
725 if (sh == ATTR_RSV) /* Reserved, mapped to NSH */
726 sh = ATTR_NSH;
727
728 return sh;
729 }
730
731 static u8 compute_s1_sh(struct s1_walk_info *wi, struct s1_walk_result *wr,
732 u8 attr)
733 {
734 u8 sh;
735
736 /*
737 * non-52bit and LPA have their basic shareability described in the
738 * descriptor. LPA2 gets it from the corresponding field in TCR,
739 * conveniently recorded in the walk info.
740 */
741 if (!wi->pa52bit || BIT(wi->pgshift) == SZ_64K)
742 sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_SH, wr->desc);
743 else
744 sh = wi->sh;
745
746 return compute_final_sh(attr, sh);
747 }
748
749 static u8 combine_sh(u8 s1_sh, u8 s2_sh)
750 {
751 if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
752 return ATTR_OSH;
753 if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH)
754 return ATTR_ISH;
755
756 return ATTR_NSH;
757 }
758
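/*
 * Fold the stage-2 translation result into the stage-1 PAR_EL1 value:
 * report S2 faults, and otherwise combine memory attributes and
 * shareability, taking HCR_EL2.{FWB,CD} into account.
 */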
759 static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
760 struct kvm_s2_trans *tr)
761 {
762 u8 s1_parattr, s2_memattr, final_attr, s2_sh;
763 u64 par;
764
765 /* If S2 has failed to translate, report the damage */
766 if (tr->esr) {
767 par = SYS_PAR_EL1_RES1;
768 par |= SYS_PAR_EL1_F;
769 par |= SYS_PAR_EL1_S;
770 par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr);
771 return par;
772 }
773
774 s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par);
775 s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc);
776
777 if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) {
778 if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP))
779 s2_memattr &= ~BIT(3);
780
781 /* Combination of R_VRJSW and R_RHWZM */
782 switch (s2_memattr) {
783 case 0b0101:
784 if (MEMATTR_IS_DEVICE(s1_parattr))
785 final_attr = s1_parattr;
786 else
787 final_attr = MEMATTR(NC, NC);
788 break;
789 case 0b0110:
790 case 0b1110:
791 final_attr = MEMATTR(WbRaWa, WbRaWa);
792 break;
793 case 0b0111:
794 case 0b1111:
795 /* Preserve S1 attribute */
796 final_attr = s1_parattr;
797 break;
798 case 0b0100:
799 case 0b1100:
800 case 0b1101:
801 /* Reserved, do something non-silly */
802 final_attr = s1_parattr;
803 break;
804 default:
805 /*
806 * MemAttr[2]=0, Device from S2.
807 *
808 * FWB does not influence the way that stage 1
809 * memory types and attributes are combined
810 * with stage 2 Device type and attributes.
811 */
812 final_attr = min(s2_memattr_to_attr(s2_memattr),
813 s1_parattr);
814 }
815 } else {
816 /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */
817 u8 s2_parattr = s2_memattr_to_attr(s2_memattr);
818
819 if (MEMATTR_IS_DEVICE(s1_parattr) ||
820 MEMATTR_IS_DEVICE(s2_parattr)) {
821 final_attr = min(s1_parattr, s2_parattr);
822 } else {
823 /* At this stage, this is memory vs memory */
824 final_attr = combine_s1_s2_attr(s1_parattr & 0xf,
825 s2_parattr & 0xf);
826 final_attr |= combine_s1_s2_attr(s1_parattr >> 4,
827 s2_parattr >> 4) << 4;
828 }
829 }
830
831 if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) &&
832 !MEMATTR_IS_DEVICE(final_attr))
833 final_attr = MEMATTR(NC, NC);
834
835 s2_sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_SH, tr->desc);
836
837 par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
838 par |= tr->output & GENMASK(47, 12);
839 par |= FIELD_PREP(SYS_PAR_EL1_SH,
840 combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
841 compute_final_sh(final_attr, s2_sh)));
842
843 return par;
844 }
845
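/*
 * Turn the result of a stage-1 software walk into a PAR_EL1 value:
 * either a fault encoding, or the output address together with the
 * attributes taken from MAIR_ELx and the computed shareability.
 */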
846 static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
847 struct s1_walk_result *wr)
848 {
849 u64 par;
850
851 if (wr->failed) {
852 par = SYS_PAR_EL1_RES1;
853 par |= SYS_PAR_EL1_F;
854 par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
855 par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
856 par |= wr->s2 ? SYS_PAR_EL1_S : 0;
857 } else if (wr->level == S1_MMU_DISABLED) {
858 /* MMU off or HCR_EL2.DC == 1 */
859 par = SYS_PAR_EL1_NSE;
860 par |= wr->pa & SYS_PAR_EL1_PA;
861
862 if (wi->regime == TR_EL10 && vcpu_has_nv(vcpu) &&
863 (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
864 par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
865 MEMATTR(WbRaWa, WbRaWa));
866 par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH);
867 } else {
868 par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
869 par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH);
870 }
871 } else {
872 u64 mair, sctlr;
873 u8 sh;
874
875 par = SYS_PAR_EL1_NSE;
876
877 mair = (wi->regime == TR_EL10 ?
878 vcpu_read_sys_reg(vcpu, MAIR_EL1) :
879 vcpu_read_sys_reg(vcpu, MAIR_EL2));
880
881 mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
882 mair &= 0xff;
883
884 sctlr = (wi->regime == TR_EL10 ?
885 vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
886 vcpu_read_sys_reg(vcpu, SCTLR_EL2));
887
888 /* Force NC for memory if SCTLR_ELx.C is clear */
889 if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
890 mair = MEMATTR(NC, NC);
891
892 par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
893 par |= wr->pa & SYS_PAR_EL1_PA;
894
895 sh = compute_s1_sh(wi, wr, mair);
896 par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
897 }
898
899 return par;
900 }
901
902 static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
903 {
904 u64 sctlr;
905
906 if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
907 return false;
908
909 if (s1pie_enabled(vcpu, regime))
910 return true;
911
912 if (regime == TR_EL10)
913 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
914 else
915 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
916
917 return sctlr & SCTLR_EL1_EPAN;
918 }
919
920 static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu,
921 struct s1_walk_info *wi,
922 struct s1_walk_result *wr)
923 {
924 bool wxn;
925
926 /* Non-hierarchical part of AArch64.S1DirectBasePermissions() */
927 if (wi->regime != TR_EL2) {
928 switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) {
929 case 0b00:
930 wr->pr = wr->pw = true;
931 wr->ur = wr->uw = false;
932 break;
933 case 0b01:
934 wr->pr = wr->pw = wr->ur = wr->uw = true;
935 break;
936 case 0b10:
937 wr->pr = true;
938 wr->pw = wr->ur = wr->uw = false;
939 break;
940 case 0b11:
941 wr->pr = wr->ur = true;
942 wr->pw = wr->uw = false;
943 break;
944 }
945
946 /* We don't use px for anything yet, but hey... */
947 wr->px = !((wr->desc & PTE_PXN) || wr->uw);
948 wr->ux = !(wr->desc & PTE_UXN);
949 } else {
950 wr->ur = wr->uw = wr->ux = false;
951
952 if (!(wr->desc & PTE_RDONLY)) {
953 wr->pr = wr->pw = true;
954 } else {
955 wr->pr = true;
956 wr->pw = false;
957 }
958
959 /* XN maps to UXN */
960 wr->px = !(wr->desc & PTE_UXN);
961 }
962
963 switch (wi->regime) {
964 case TR_EL2:
965 case TR_EL20:
966 wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN);
967 break;
968 case TR_EL10:
969 wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN);
970 break;
971 }
972
973 wr->pwxn = wr->uwxn = wxn;
974 wr->pov = wi->poe;
975 wr->uov = wi->e0poe;
976 }
977
978 static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu,
979 struct s1_walk_info *wi,
980 struct s1_walk_result *wr)
981 {
982 /* Hierarchical part of AArch64.S1DirectBasePermissions() */
983 if (wi->regime != TR_EL2) {
984 switch (wr->APTable) {
985 case 0b00:
986 break;
987 case 0b01:
988 wr->ur = wr->uw = false;
989 break;
990 case 0b10:
991 wr->pw = wr->uw = false;
992 break;
993 case 0b11:
994 wr->pw = wr->ur = wr->uw = false;
995 break;
996 }
997
998 wr->px &= !wr->PXNTable;
999 wr->ux &= !wr->UXNTable;
1000 } else {
1001 if (wr->APTable & BIT(1))
1002 wr->pw = false;
1003
1004 /* XN maps to UXN */
1005 wr->px &= !wr->UXNTable;
1006 }
1007 }
1008
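/* Extract the 4bit permission field at index 'i' from the PIR/POR register 'r' */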
1009 #define perm_idx(v, r, i) ((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf)
1010
1011 #define set_priv_perms(wr, r, w, x) \
1012 do { \
1013 (wr)->pr = (r); \
1014 (wr)->pw = (w); \
1015 (wr)->px = (x); \
1016 } while (0)
1017
1018 #define set_unpriv_perms(wr, r, w, x) \
1019 do { \
1020 (wr)->ur = (r); \
1021 (wr)->uw = (w); \
1022 (wr)->ux = (x); \
1023 } while (0)
1024
1025 #define set_priv_wxn(wr, v) \
1026 do { \
1027 (wr)->pwxn = (v); \
1028 } while (0)
1029
1030 #define set_unpriv_wxn(wr, v) \
1031 do { \
1032 (wr)->uwxn = (v); \
1033 } while (0)
1034
1035 /* Similar to AArch64.S1IndirectBasePermissions(), without GCS */
1036 #define set_perms(w, wr, ip) \
1037 do { \
1038 /* R_LLZDZ */ \
1039 switch ((ip)) { \
1040 case 0b0000: \
1041 set_ ## w ## _perms((wr), false, false, false); \
1042 break; \
1043 case 0b0001: \
1044 set_ ## w ## _perms((wr), true , false, false); \
1045 break; \
1046 case 0b0010: \
1047 set_ ## w ## _perms((wr), false, false, true ); \
1048 break; \
1049 case 0b0011: \
1050 set_ ## w ## _perms((wr), true , false, true ); \
1051 break; \
1052 case 0b0100: \
1053 set_ ## w ## _perms((wr), false, false, false); \
1054 break; \
1055 case 0b0101: \
1056 set_ ## w ## _perms((wr), true , true , false); \
1057 break; \
1058 case 0b0110: \
1059 set_ ## w ## _perms((wr), true , true , true ); \
1060 break; \
1061 case 0b0111: \
1062 set_ ## w ## _perms((wr), true , true , true ); \
1063 break; \
1064 case 0b1000: \
1065 set_ ## w ## _perms((wr), true , false, false); \
1066 break; \
1067 case 0b1001: \
1068 set_ ## w ## _perms((wr), true , false, false); \
1069 break; \
1070 case 0b1010: \
1071 set_ ## w ## _perms((wr), true , false, true ); \
1072 break; \
1073 case 0b1011: \
1074 set_ ## w ## _perms((wr), false, false, false); \
1075 break; \
1076 case 0b1100: \
1077 set_ ## w ## _perms((wr), true , true , false); \
1078 break; \
1079 case 0b1101: \
1080 set_ ## w ## _perms((wr), false, false, false); \
1081 break; \
1082 case 0b1110: \
1083 set_ ## w ## _perms((wr), true , true , true ); \
1084 break; \
1085 case 0b1111: \
1086 set_ ## w ## _perms((wr), false, false, false); \
1087 break; \
1088 } \
1089 \
1090 /* R_HJYGR */ \
1091 set_ ## w ## _wxn((wr), ((ip) == 0b0110)); \
1092 \
1093 } while (0)
1094
1095 static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu,
1096 struct s1_walk_info *wi,
1097 struct s1_walk_result *wr)
1098 {
1099 u8 up, pp, idx;
1100
1101 idx = pte_pi_index(wr->desc);
1102
1103 switch (wi->regime) {
1104 case TR_EL10:
1105 pp = perm_idx(vcpu, PIR_EL1, idx);
1106 up = perm_idx(vcpu, PIRE0_EL1, idx);
1107 break;
1108 case TR_EL20:
1109 pp = perm_idx(vcpu, PIR_EL2, idx);
1110 up = perm_idx(vcpu, PIRE0_EL2, idx);
1111 break;
1112 case TR_EL2:
1113 pp = perm_idx(vcpu, PIR_EL2, idx);
1114 up = 0;
1115 break;
1116 }
1117
1118 set_perms(priv, wr, pp);
1119
1120 if (wi->regime != TR_EL2)
1121 set_perms(unpriv, wr, up);
1122 else
1123 set_unpriv_perms(wr, false, false, false);
1124
1125 wr->pov = wi->poe && !(pp & BIT(3));
1126 wr->uov = wi->e0poe && !(up & BIT(3));
1127
1128 /* R_VFPJF */
1129 if (wr->px && wr->uw) {
1130 set_priv_perms(wr, false, false, false);
1131 set_unpriv_perms(wr, false, false, false);
1132 }
1133 }
1134
1135 static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu,
1136 struct s1_walk_info *wi,
1137 struct s1_walk_result *wr)
1138 {
1139 u8 idx, pov_perms, uov_perms;
1140
1141 idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc);
1142
1143 if (wr->pov) {
1144 switch (wi->regime) {
1145 case TR_EL10:
1146 pov_perms = perm_idx(vcpu, POR_EL1, idx);
1147 break;
1148 case TR_EL20:
1149 pov_perms = perm_idx(vcpu, POR_EL2, idx);
1150 break;
1151 case TR_EL2:
1152 pov_perms = perm_idx(vcpu, POR_EL2, idx);
1153 break;
1154 }
1155
1156 if (pov_perms & ~POE_RWX)
1157 pov_perms = POE_NONE;
1158
1159 /* R_QXXPC, S1PrivOverlay enabled */
1160 if (wr->pwxn && (pov_perms & POE_X))
1161 pov_perms &= ~POE_W;
1162
1163 wr->pr &= pov_perms & POE_R;
1164 wr->pw &= pov_perms & POE_W;
1165 wr->px &= pov_perms & POE_X;
1166 }
1167
1168 if (wr->uov) {
1169 switch (wi->regime) {
1170 case TR_EL10:
1171 uov_perms = perm_idx(vcpu, POR_EL0, idx);
1172 break;
1173 case TR_EL20:
1174 uov_perms = perm_idx(vcpu, POR_EL0, idx);
1175 break;
1176 case TR_EL2:
1177 uov_perms = 0;
1178 break;
1179 }
1180
1181 if (uov_perms & ~POE_RWX)
1182 uov_perms = POE_NONE;
1183
1184 /* R_NPBXC, S1UnprivOverlay enabled */
1185 if (wr->uwxn && (uov_perms & POE_X))
1186 uov_perms &= ~POE_W;
1187
1188 wr->ur &= uov_perms & POE_R;
1189 wr->uw &= uov_perms & POE_W;
1190 wr->ux &= uov_perms & POE_X;
1191 }
1192 }
1193
1194 static void compute_s1_permissions(struct kvm_vcpu *vcpu,
1195 struct s1_walk_info *wi,
1196 struct s1_walk_result *wr)
1197 {
1198 bool pan;
1199
1200 if (!s1pie_enabled(vcpu, wi->regime))
1201 compute_s1_direct_permissions(vcpu, wi, wr);
1202 else
1203 compute_s1_indirect_permissions(vcpu, wi, wr);
1204
1205 if (!wi->hpd)
1206 compute_s1_hierarchical_permissions(vcpu, wi, wr);
1207
1208 compute_s1_overlay_permissions(vcpu, wi, wr);
1209
1210 /* R_QXXPC, S1PrivOverlay disabled */
1211 if (!wr->pov)
1212 wr->px &= !(wr->pwxn && wr->pw);
1213
1214 /* R_NPBXC, S1UnprivOverlay disabled */
1215 if (!wr->uov)
1216 wr->ux &= !(wr->uwxn && wr->uw);
1217
1218 pan = wi->pan && (wr->ur || wr->uw ||
1219 (pan3_enabled(vcpu, wi->regime) && wr->ux));
1220 wr->pw &= !pan;
1221 wr->pr &= !pan;
1222 }
1223
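/*
 * Software emulation of the AT instruction: walk the guest's stage-1
 * tables and synthesise the corresponding PAR_EL1 value. Used whenever
 * the hardware fast path couldn't deliver a trustworthy result.
 */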
1224 static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
1225 {
1226 struct s1_walk_result wr = {};
1227 struct s1_walk_info wi = {};
1228 bool perm_fail = false;
1229 int ret, idx;
1230
1231 wi.regime = compute_translation_regime(vcpu, op);
1232 wi.as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
1233 wi.pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) &&
1234 (*vcpu_cpsr(vcpu) & PSR_PAN_BIT);
1235
1236 ret = setup_s1_walk(vcpu, &wi, &wr, vaddr);
1237 if (ret)
1238 goto compute_par;
1239
1240 if (wr.level == S1_MMU_DISABLED)
1241 goto compute_par;
1242
1243 idx = srcu_read_lock(&vcpu->kvm->srcu);
1244
1245 ret = walk_s1(vcpu, &wi, &wr, vaddr);
1246
1247 srcu_read_unlock(&vcpu->kvm->srcu, idx);
1248
1249 if (ret)
1250 goto compute_par;
1251
1252 compute_s1_permissions(vcpu, &wi, &wr);
1253
1254 switch (op) {
1255 case OP_AT_S1E1RP:
1256 case OP_AT_S1E1R:
1257 case OP_AT_S1E2R:
1258 perm_fail = !wr.pr;
1259 break;
1260 case OP_AT_S1E1WP:
1261 case OP_AT_S1E1W:
1262 case OP_AT_S1E2W:
1263 perm_fail = !wr.pw;
1264 break;
1265 case OP_AT_S1E0R:
1266 perm_fail = !wr.ur;
1267 break;
1268 case OP_AT_S1E0W:
1269 perm_fail = !wr.uw;
1270 break;
1271 case OP_AT_S1E1A:
1272 case OP_AT_S1E2A:
1273 break;
1274 default:
1275 BUG();
1276 }
1277
1278 if (perm_fail)
1279 fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false);
1280
1281 compute_par:
1282 return compute_par_s1(vcpu, &wi, &wr);
1283 }
1284
1285 /*
1286 * Return the PAR_EL1 value as the result of a valid translation.
1287 *
1288 * If the translation is unsuccessful, the value may only contain
1289 * PAR_EL1.F, and cannot be taken at face value. It isn't an
1290 * indication of the translation having failed, only that the fast
1291 * path did not succeed, *unless* it indicates a S1 permission or
1292 * access fault.
1293 */
1294 static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
1295 {
1296 struct mmu_config config;
1297 struct kvm_s2_mmu *mmu;
1298 bool fail, mmu_cs;
1299 u64 par;
1300
1301 par = SYS_PAR_EL1_F;
1302
1303 /*
1304 * We've trapped, so everything is live on the CPU. As we will
1305 * be switching contexts behind everybody's back, disable
1306 * interrupts while holding the mmu lock.
1307 */
1308 guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock);
1309
1310 /*
1311 * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already
1312 * the right one (as we trapped from vEL2). If not, save the
1313 * full MMU context.
1314 *
1315 * We are also guaranteed to be in the correct context if
1316 * we're not in a nested VM.
1317 */
1318 mmu_cs = (vcpu_has_nv(vcpu) &&
1319 !(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)));
1320 if (!mmu_cs)
1321 goto skip_mmu_switch;
1322
1323 /*
1324 * Obtaining the S2 MMU for a L2 is horribly racy, and we may not
1325 * find it (recycled by another vcpu, for example). When this
1326 * happens, admit defeat immediately and use the SW (slow) path.
1327 */
1328 mmu = lookup_s2_mmu(vcpu);
1329 if (!mmu)
1330 return par;
1331
1332 __mmu_config_save(&config);
1333
1334 write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0);
1335 write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1);
1336 write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR);
1337 write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR);
1338 if (kvm_has_tcr2(vcpu->kvm)) {
1339 write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2);
1340 if (kvm_has_s1pie(vcpu->kvm)) {
1341 write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR);
1342 write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0);
1343 }
1344 if (kvm_has_s1poe(vcpu->kvm)) {
1345 write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR);
1346 write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0);
1347 }
1348 }
1349 write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR);
1350 __load_stage2(mmu, mmu->arch);
1351
1352 skip_mmu_switch:
1353 /* Temporarily switch back to guest context */
1354 write_sysreg_hcr(vcpu->arch.hcr_el2);
1355 isb();
1356
1357 switch (op) {
1358 case OP_AT_S1E1RP:
1359 case OP_AT_S1E1WP:
1360 fail = at_s1e1p_fast(vcpu, op, vaddr);
1361 break;
1362 case OP_AT_S1E1R:
1363 fail = __kvm_at(OP_AT_S1E1R, vaddr);
1364 break;
1365 case OP_AT_S1E1W:
1366 fail = __kvm_at(OP_AT_S1E1W, vaddr);
1367 break;
1368 case OP_AT_S1E0R:
1369 fail = __kvm_at(OP_AT_S1E0R, vaddr);
1370 break;
1371 case OP_AT_S1E0W:
1372 fail = __kvm_at(OP_AT_S1E0W, vaddr);
1373 break;
1374 case OP_AT_S1E1A:
1375 fail = __kvm_at(OP_AT_S1E1A, vaddr);
1376 break;
1377 default:
1378 WARN_ON_ONCE(1);
1379 fail = true;
1380 break;
1381 }
1382
1383 if (!fail)
1384 par = read_sysreg_par();
1385
1386 write_sysreg_hcr(HCR_HOST_VHE_FLAGS);
1387
1388 if (mmu_cs)
1389 __mmu_config_restore(&config);
1390
1391 return par;
1392 }
1393
1394 static bool par_check_s1_perm_fault(u64 par)
1395 {
1396 u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
1397
1398 return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
1399 !(par & SYS_PAR_EL1_S));
1400 }
1401
1402 static bool par_check_s1_access_fault(u64 par)
1403 {
1404 u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
1405
1406 return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS &&
1407 !(par & SYS_PAR_EL1_S));
1408 }
1409
1410 void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
1411 {
1412 u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
1413
1414 /*
1415 * If PAR_EL1 reports that AT failed on a S1 permission or access
1416 * fault, we know for sure that the PTW was able to walk the S1
1417 * tables and there's nothing else to do.
1418 *
1419 * If AT failed for any other reason, then we must walk the guest S1
1420 * to emulate the instruction.
1421 */
1422 if ((par & SYS_PAR_EL1_F) &&
1423 !par_check_s1_perm_fault(par) &&
1424 !par_check_s1_access_fault(par))
1425 par = handle_at_slow(vcpu, op, vaddr);
1426
1427 vcpu_write_sys_reg(vcpu, par, PAR_EL1);
1428 }
1429
1430 void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
1431 {
1432 u64 par;
1433
1434 /*
1435 * We've trapped, so everything is live on the CPU. As we will be
1436 * switching context behind everybody's back, disable interrupts...
1437 */
1438 scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) {
1439 u64 val, hcr;
1440 bool fail;
1441
1442 val = hcr = read_sysreg(hcr_el2);
1443 val &= ~HCR_TGE;
1444 val |= HCR_VM;
1445
1446 if (!vcpu_el2_e2h_is_set(vcpu))
1447 val |= HCR_NV | HCR_NV1;
1448
1449 write_sysreg_hcr(val);
1450 isb();
1451
1452 par = SYS_PAR_EL1_F;
1453
1454 switch (op) {
1455 case OP_AT_S1E2R:
1456 fail = __kvm_at(OP_AT_S1E1R, vaddr);
1457 break;
1458 case OP_AT_S1E2W:
1459 fail = __kvm_at(OP_AT_S1E1W, vaddr);
1460 break;
1461 case OP_AT_S1E2A:
1462 fail = __kvm_at(OP_AT_S1E1A, vaddr);
1463 break;
1464 default:
1465 WARN_ON_ONCE(1);
1466 fail = true;
1467 }
1468
1469 isb();
1470
1471 if (!fail)
1472 par = read_sysreg_par();
1473
1474 write_sysreg_hcr(hcr);
1475 isb();
1476 }
1477
1478 /* We failed the translation, let's replay it in slow motion */
1479 if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
1480 par = handle_at_slow(vcpu, op, vaddr);
1481
1482 vcpu_write_sys_reg(vcpu, par, PAR_EL1);
1483 }
1484
1485 void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
1486 {
1487 struct kvm_s2_trans out = {};
1488 u64 ipa, par;
1489 bool write;
1490 int ret;
1491
1492 /* Do the stage-1 translation */
1493 switch (op) {
1494 case OP_AT_S12E1R:
1495 op = OP_AT_S1E1R;
1496 write = false;
1497 break;
1498 case OP_AT_S12E1W:
1499 op = OP_AT_S1E1W;
1500 write = true;
1501 break;
1502 case OP_AT_S12E0R:
1503 op = OP_AT_S1E0R;
1504 write = false;
1505 break;
1506 case OP_AT_S12E0W:
1507 op = OP_AT_S1E0W;
1508 write = true;
1509 break;
1510 default:
1511 WARN_ON_ONCE(1);
1512 return;
1513 }
1514
1515 __kvm_at_s1e01(vcpu, op, vaddr);
1516 par = vcpu_read_sys_reg(vcpu, PAR_EL1);
1517 if (par & SYS_PAR_EL1_F)
1518 return;
1519
1520 /*
1521 * If we only have a single stage of translation (EL2&0), exit
1522 * early. Same thing if {VM,DC}=={0,0}.
1523 */
1524 if (compute_translation_regime(vcpu, op) == TR_EL20 ||
1525 !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
1526 return;
1527
1528 /* Do the stage-2 translation */
1529 ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
1530 out.esr = 0;
1531 ret = kvm_walk_nested_s2(vcpu, ipa, &out);
1532 if (ret < 0)
1533 return;
1534
1535 /* Check the access permission */
1536 if (!out.esr &&
1537 ((!write && !out.readable) || (write && !out.writable)))
1538 out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);
1539
1540 par = compute_par_s12(vcpu, par, &out);
1541 vcpu_write_sys_reg(vcpu, par, PAR_EL1);
1542 }
1543
1544 /*
1545 * Translate a VA for a given EL in a given translation regime, with
1546 * or without PAN. This requires wi->{regime, as_el0, pan} to be
1547 * set. The rest of the wi and wr should be 0-initialised.
1548 */
1549 int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
1550 struct s1_walk_result *wr, u64 va)
1551 {
1552 int ret;
1553
1554 ret = setup_s1_walk(vcpu, wi, wr, va);
1555 if (ret)
1556 return ret;
1557
1558 if (wr->level == S1_MMU_DISABLED) {
1559 wr->ur = wr->uw = wr->ux = true;
1560 wr->pr = wr->pw = wr->px = true;
1561 } else {
1562 ret = walk_s1(vcpu, wi, wr, va);
1563 if (ret)
1564 return ret;
1565
1566 compute_s1_permissions(vcpu, wi, wr);
1567 }
1568
1569 return 0;
1570 }
1571
1572 struct desc_match {
1573 u64 ipa;
1574 int level;
1575 };
1576
1577 static int match_s1_desc(struct s1_walk_context *ctxt, void *priv)
1578 {
1579 struct desc_match *dm = priv;
1580 u64 ipa = dm->ipa;
1581
1582 /* Use S1 granule alignment */
1583 ipa &= GENMASK(51, ctxt->wi->pgshift);
1584
1585 /* Not the IPA we're looking for? Continue. */
1586 if (ipa != ctxt->table_ipa)
1587 return 0;
1588
1589 /* Note the level and interrupt the walk */
1590 dm->level = ctxt->level;
1591 return -EINTR;
1592 }
1593
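/*
 * Walk the stage-1 tables for 'va' and report the level at which 'ipa'
 * is used as a table address, using the walker's filter hook to stop
 * on the first match.
 */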
1594 int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level)
1595 {
1596 struct desc_match dm = {
1597 .ipa = ipa,
1598 };
1599 struct s1_walk_info wi = {
1600 .filter = &(struct s1_walk_filter){
1601 .fn = match_s1_desc,
1602 .priv = &dm,
1603 },
1604 .as_el0 = false,
1605 .pan = false,
1606 };
1607 struct s1_walk_result wr = {};
1608 int ret;
1609
1610 if (is_hyp_ctxt(vcpu))
1611 wi.regime = vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
1612 else
1613 wi.regime = TR_EL10;
1614
1615 ret = setup_s1_walk(vcpu, &wi, &wr, va);
1616 if (ret)
1617 return ret;
1618
1619 /* We really expect the S1 MMU to be on here... */
1620 if (WARN_ON_ONCE(wr.level == S1_MMU_DISABLED)) {
1621 *level = 0;
1622 return 0;
1623 }
1624
1625 /* Walk the guest's PT, looking for a match along the way */
1626 ret = walk_s1(vcpu, &wi, &wr, va);
1627 switch (ret) {
1628 case -EINTR:
1629 /* We interrupted the walk on a match, return the level */
1630 *level = dm.level;
1631 return 0;
1632 case 0:
1633 /* The walk completed, we failed to find the entry */
1634 return -ENOENT;
1635 default:
1636 /* Any other error... */
1637 return ret;
1638 }
1639 }
1640