1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (C) 2017 - Linaro Ltd
4 * Author: Jintack Lim <jintack.lim@linaro.org>
5 */
6
7 #include <linux/kvm_host.h>
8
9 #include <asm/esr.h>
10 #include <asm/kvm_hyp.h>
11 #include <asm/kvm_mmu.h>
12
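/*
 * Record a failed stage-1 walk: stash the fault status code, and flag
 * whether the fault was taken on the stage-2 translation of a stage-1
 * table access.
 */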
static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw)
14 {
15 wr->fst = fst;
16 wr->ptw = s1ptw;
17 wr->s2 = s1ptw;
18 wr->failed = true;
19 }
20
21 #define S1_MMU_DISABLED (-127)
22
static int get_ia_size(struct s1_walk_info *wi)
24 {
25 return 64 - wi->txsz;
26 }
27
28 /* Return true if the IPA is out of the OA range */
static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
30 {
31 if (wi->pa52bit)
32 return wi->max_oa_bits < 52 && (ipa & GENMASK_ULL(51, wi->max_oa_bits));
33 return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
34 }
35
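/*
 * Return true if the guest's configuration results in 52bit output
 * addresses: FEAT_LPA with a 64k granule (PS/IPS == 0b0110), or
 * FEAT_LPA2 with a 4k/16k granule and TCR_ELx.DS set.
 */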
static bool has_52bit_pa(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, u64 tcr)
37 {
38 switch (BIT(wi->pgshift)) {
39 case SZ_64K:
40 default: /* IMPDEF: treat any other value as 64k */
41 if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52))
42 return false;
43 return ((wi->regime == TR_EL2 ?
44 FIELD_GET(TCR_EL2_PS_MASK, tcr) :
45 FIELD_GET(TCR_IPS_MASK, tcr)) == 0b0110);
46 case SZ_16K:
47 if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT))
48 return false;
49 break;
50 case SZ_4K:
51 if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT))
52 return false;
53 break;
54 }
55
56 return (tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS));
57 }
58
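/*
 * Extract the output address from a descriptor. With 52bit PAs, the
 * top address bits live in dedicated descriptor fields that depend on
 * the granule size (bits [51:50] for 4k/16k, bits [51:48] for 64k).
 */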
static u64 desc_to_oa(struct s1_walk_info *wi, u64 desc)
60 {
61 u64 addr;
62
63 if (!wi->pa52bit)
64 return desc & GENMASK_ULL(47, wi->pgshift);
65
66 switch (BIT(wi->pgshift)) {
67 case SZ_4K:
68 case SZ_16K:
69 addr = desc & GENMASK_ULL(49, wi->pgshift);
70 addr |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, desc) << 50;
71 break;
72 case SZ_64K:
73 default: /* IMPDEF: treat any other value as 64k */
74 addr = desc & GENMASK_ULL(47, wi->pgshift);
75 addr |= FIELD_GET(KVM_PTE_ADDR_51_48, desc) << 48;
76 break;
77 }
78
79 return addr;
80 }
81
82 /* Return the translation regime that applies to an AT instruction */
static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
84 {
85 /*
86 * We only get here from guest EL2, so the translation
87 * regime AT applies to is solely defined by {E2H,TGE}.
88 */
89 switch (op) {
90 case OP_AT_S1E2R:
91 case OP_AT_S1E2W:
92 case OP_AT_S1E2A:
93 return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
94 default:
95 return (vcpu_el2_e2h_is_set(vcpu) &&
96 vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
97 }
98 }
99
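/*
 * Return the guest's effective TCR2_ELx value for the given regime,
 * which reads as 0 for the EL1&0 regime when HCRX_EL2.TCR2En is clear.
 */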
static u64 effective_tcr2(struct kvm_vcpu *vcpu, enum trans_regime regime)
101 {
102 if (regime == TR_EL10) {
103 if (vcpu_has_nv(vcpu) &&
104 !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En))
105 return 0;
106
107 return vcpu_read_sys_reg(vcpu, TCR2_EL1);
108 }
109
110 return vcpu_read_sys_reg(vcpu, TCR2_EL2);
111 }
112
static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
114 {
115 if (!kvm_has_s1pie(vcpu->kvm))
116 return false;
117
118 /* Abuse TCR2_EL1_PIE and use it for EL2 as well */
119 return effective_tcr2(vcpu, regime) & TCR2_EL1_PIE;
120 }
121
static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi)
123 {
124 u64 val;
125
126 if (!kvm_has_s1poe(vcpu->kvm)) {
127 wi->poe = wi->e0poe = false;
128 return;
129 }
130
131 val = effective_tcr2(vcpu, wi->regime);
132
133 /* Abuse TCR2_EL1_* for EL2 */
134 wi->poe = val & TCR2_EL1_POE;
135 wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE);
136 }
137
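/*
 * Set up a stage-1 walk: decode SCTLR/TCR/TTBR for the translation
 * regime, populate the walk info, and report the configurations that
 * fault before the walk even starts (Translation or Address Size
 * fault). Returns 0 with wr->level == S1_MMU_DISABLED if stage-1
 * translation is off for this regime.
 */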
static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
                         struct s1_walk_result *wr, u64 va)
140 {
141 u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr;
142 unsigned int stride, x;
143 bool va55, tbi, lva;
144
145 va55 = va & BIT(55);
146
147 if (vcpu_has_nv(vcpu)) {
148 hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
149 wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));
150 } else {
151 WARN_ON_ONCE(wi->regime != TR_EL10);
152 wi->s2 = false;
153 hcr = 0;
154 }
155
156 switch (wi->regime) {
157 case TR_EL10:
158 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
159 tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
160 ttbr = (va55 ?
161 vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
162 vcpu_read_sys_reg(vcpu, TTBR0_EL1));
163 break;
164 case TR_EL2:
165 case TR_EL20:
166 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
167 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
168 ttbr = (va55 ?
169 vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
170 vcpu_read_sys_reg(vcpu, TTBR0_EL2));
171 break;
172 default:
173 BUG();
174 }
175
176 /* Someone was silly enough to encode TG0/TG1 differently */
177 if (va55 && wi->regime != TR_EL2) {
178 wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
179 tg = FIELD_GET(TCR_TG1_MASK, tcr);
180
181 switch (tg << TCR_TG1_SHIFT) {
182 case TCR_TG1_4K:
183 wi->pgshift = 12; break;
184 case TCR_TG1_16K:
185 wi->pgshift = 14; break;
186 case TCR_TG1_64K:
187 default: /* IMPDEF: treat any other value as 64k */
188 wi->pgshift = 16; break;
189 }
190 } else {
191 wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
192 tg = FIELD_GET(TCR_TG0_MASK, tcr);
193
194 switch (tg << TCR_TG0_SHIFT) {
195 case TCR_TG0_4K:
196 wi->pgshift = 12; break;
197 case TCR_TG0_16K:
198 wi->pgshift = 14; break;
199 case TCR_TG0_64K:
200 default: /* IMPDEF: treat any other value as 64k */
201 wi->pgshift = 16; break;
202 }
203 }
204
205 wi->pa52bit = has_52bit_pa(vcpu, wi, tcr);
206
207 ia_bits = get_ia_size(wi);
208
209 /* AArch64.S1StartLevel() */
210 stride = wi->pgshift - 3;
211 wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
212
213 if (wi->regime == TR_EL2 && va55)
214 goto addrsz;
215
216 tbi = (wi->regime == TR_EL2 ?
217 FIELD_GET(TCR_EL2_TBI, tcr) :
218 (va55 ?
219 FIELD_GET(TCR_TBI1, tcr) :
220 FIELD_GET(TCR_TBI0, tcr)));
221
222 if (!tbi && (u64)sign_extend64(va, 55) != va)
223 goto addrsz;
224
225 wi->sh = (wi->regime == TR_EL2 ?
226 FIELD_GET(TCR_EL2_SH0_MASK, tcr) :
227 (va55 ?
228 FIELD_GET(TCR_SH1_MASK, tcr) :
229 FIELD_GET(TCR_SH0_MASK, tcr)));
230
231 va = (u64)sign_extend64(va, 55);
232
233 /* Let's put the MMU disabled case aside immediately */
234 switch (wi->regime) {
235 case TR_EL10:
236 /*
237 * If dealing with the EL1&0 translation regime, 3 things
238 * can disable the S1 translation:
239 *
240 * - HCR_EL2.DC = 1
241 * - HCR_EL2.{E2H,TGE} = {0,1}
242 * - SCTLR_EL1.M = 0
243 *
244 * The TGE part is interesting. If we have decided that this
245 * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
246 * {0,x}, and we only need to test for TGE == 1.
247 */
248 if (hcr & (HCR_DC | HCR_TGE)) {
249 wr->level = S1_MMU_DISABLED;
250 break;
251 }
252 fallthrough;
253 case TR_EL2:
254 case TR_EL20:
255 if (!(sctlr & SCTLR_ELx_M))
256 wr->level = S1_MMU_DISABLED;
257 break;
258 }
259
260 if (wr->level == S1_MMU_DISABLED) {
261 if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
262 goto addrsz;
263
264 wr->pa = va;
265 return 0;
266 }
267
268 wi->be = sctlr & SCTLR_ELx_EE;
269
270 wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
271 wi->hpd &= (wi->regime == TR_EL2 ?
272 FIELD_GET(TCR_EL2_HPD, tcr) :
273 (va55 ?
274 FIELD_GET(TCR_HPD1, tcr) :
275 FIELD_GET(TCR_HPD0, tcr)));
276 /* R_JHSVW */
277 wi->hpd |= s1pie_enabled(vcpu, wi->regime);
278
279 /* Do we have POE? */
280 compute_s1poe(vcpu, wi);
281
282 /* R_BVXDG */
283 wi->hpd |= (wi->poe || wi->e0poe);
284
285 /* R_PLCGL, R_YXNYW */
286 if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
287 if (wi->txsz > 39)
288 goto transfault;
289 } else {
290 if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47))
291 goto transfault;
292 }
293
294 /* R_GTJBY, R_SXWGM */
295 switch (BIT(wi->pgshift)) {
296 case SZ_4K:
297 case SZ_16K:
298 lva = wi->pa52bit;
299 break;
300 case SZ_64K:
301 lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
302 break;
303 }
304
305 if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
306 goto transfault;
307
308 /* R_YYVYV, I_THCZK */
309 if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
310 (va55 && va < GENMASK(63, ia_bits)))
311 goto transfault;
312
313 /* I_ZFSYQ */
314 if (wi->regime != TR_EL2 &&
315 (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
316 goto transfault;
317
318 /* R_BNDVG and following statements */
319 if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
320 wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0)))
321 goto transfault;
322
323 ps = (wi->regime == TR_EL2 ?
324 FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
325
326 wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps, wi->pa52bit));
327
328 /* Compute minimal alignment */
329 x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
330
331 wi->baddr = ttbr & TTBRx_EL1_BADDR;
332 if (wi->pa52bit) {
333 /*
334 * Force the alignment on 64 bytes for top-level tables
335 * smaller than 8 entries, since TTBR.BADDR[5:2] are used to
336 * store bits [51:48] of the first level of lookup.
337 */
338 x = max(x, 6);
339
340 wi->baddr |= FIELD_GET(GENMASK_ULL(5, 2), ttbr) << 48;
341 }
342
343 /* R_VPBBF */
344 if (check_output_size(wi->baddr, wi))
345 goto addrsz;
346
347 wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
348
349 wi->ha = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, AF);
350 wi->ha &= (wi->regime == TR_EL2 ?
351 FIELD_GET(TCR_EL2_HA, tcr) :
352 FIELD_GET(TCR_HA, tcr));
353
354 return 0;
355
356 addrsz:
/*
 * Address Size Fault level 0 to indicate it comes from TTBR.
 * Yes, this is an oddity.
 */
361 fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false);
362 return -EFAULT;
363
364 transfault:
365 /* Translation Fault on start level */
366 fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(wi->sl), false);
367 return -EFAULT;
368 }
369
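/* Read a stage-1 descriptor from guest memory, honouring SCTLR_ELx.EE */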
static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc,
                            struct s1_walk_info *wi)
372 {
373 u64 val;
374 int r;
375
376 r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val));
377 if (r)
378 return r;
379
380 if (wi->be)
381 *desc = be64_to_cpu((__force __be64)val);
382 else
383 *desc = le64_to_cpu((__force __le64)val);
384
385 return 0;
386 }
387
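/*
 * Atomically update a stage-1 descriptor in guest memory, converting
 * to the guest's endianness first. Used to set the Access Flag when
 * hardware AF updates (TCR_ELx.HA) are emulated.
 */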
static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new,
                            struct s1_walk_info *wi)
390 {
391 if (wi->be) {
392 old = (__force u64)cpu_to_be64(old);
393 new = (__force u64)cpu_to_be64(new);
394 } else {
395 old = (__force u64)cpu_to_le64(old);
396 new = (__force u64)cpu_to_le64(new);
397 }
398
399 return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
400 }
401
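/*
 * Walk the guest's stage-1 page tables, translating each table access
 * through the emulated stage-2 when it is enabled. On success, wr
 * describes the resulting mapping; on failure, it carries the
 * equivalent stage-1 fault information.
 */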
static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
                   struct s1_walk_result *wr, u64 va)
404 {
405 u64 va_top, va_bottom, baddr, desc, new_desc, ipa;
406 struct kvm_s2_trans s2_trans = {};
407 int level, stride, ret;
408
409 level = wi->sl;
410 stride = wi->pgshift - 3;
411 baddr = wi->baddr;
412
413 va_top = get_ia_size(wi) - 1;
414
415 while (1) {
416 u64 index;
417
418 va_bottom = (3 - level) * stride + wi->pgshift;
419 index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
420
421 ipa = baddr | index;
422
423 if (wi->s2) {
424 ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
425 if (ret) {
426 fail_s1_walk(wr,
427 (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
428 true);
429 return ret;
430 }
431
432 if (!kvm_s2_trans_readable(&s2_trans)) {
433 fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
434 true);
435
436 return -EPERM;
437 }
438
439 ipa = kvm_s2_trans_output(&s2_trans);
440 }
441
442 if (wi->filter) {
443 ret = wi->filter->fn(&(struct s1_walk_context)
444 {
445 .wi = wi,
446 .table_ipa = baddr,
447 .level = level,
448 }, wi->filter->priv);
449 if (ret)
450 return ret;
451 }
452
453 ret = kvm_read_s1_desc(vcpu, ipa, &desc, wi);
454 if (ret) {
455 fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false);
456 return ret;
457 }
458
459 new_desc = desc;
460
461 /* Invalid descriptor */
462 if (!(desc & BIT(0)))
463 goto transfault;
464
465 /* Block mapping, check validity down the line */
466 if (!(desc & BIT(1)))
467 break;
468
469 /* Page mapping */
470 if (level == 3)
471 break;
472
473 /* Table handling */
474 if (!wi->hpd) {
475 wr->APTable |= FIELD_GET(S1_TABLE_AP, desc);
476 wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
477 wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
478 }
479
480 baddr = desc_to_oa(wi, desc);
481
482 /* Check for out-of-range OA */
483 if (check_output_size(baddr, wi))
484 goto addrsz;
485
486 /* Prepare for next round */
487 va_top = va_bottom - 1;
488 level++;
489 }
490
491 /* Block mapping, check the validity of the level */
492 if (!(desc & BIT(1))) {
493 bool valid_block = false;
494
495 switch (BIT(wi->pgshift)) {
496 case SZ_4K:
497 valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0);
498 break;
499 case SZ_16K:
500 case SZ_64K:
501 valid_block = level == 2 || (wi->pa52bit && level == 1);
502 break;
503 }
504
505 if (!valid_block)
506 goto transfault;
507 }
508
509 baddr = desc_to_oa(wi, desc);
510 if (check_output_size(baddr & GENMASK(52, va_bottom), wi))
511 goto addrsz;
512
513 if (wi->ha)
514 new_desc |= PTE_AF;
515
516 if (new_desc != desc) {
517 if (wi->s2 && !kvm_s2_trans_writable(&s2_trans)) {
518 fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), true);
519 return -EPERM;
520 }
521
522 ret = kvm_swap_s1_desc(vcpu, ipa, desc, new_desc, wi);
523 if (ret)
524 return ret;
525
526 desc = new_desc;
527 }
528
529 if (!(desc & PTE_AF)) {
530 fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false);
531 return -EACCES;
532 }
533
534 va_bottom += contiguous_bit_shift(desc, wi, level);
535
536 wr->failed = false;
537 wr->level = level;
538 wr->desc = desc;
539 wr->pa = baddr & GENMASK(52, va_bottom);
540 wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);
541
542 wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG);
543 if (wr->nG) {
544 u64 asid_ttbr, tcr;
545
546 switch (wi->regime) {
547 case TR_EL10:
548 tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
549 asid_ttbr = ((tcr & TCR_A1) ?
550 vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
551 vcpu_read_sys_reg(vcpu, TTBR0_EL1));
552 break;
553 case TR_EL20:
554 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
555 asid_ttbr = ((tcr & TCR_A1) ?
556 vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
557 vcpu_read_sys_reg(vcpu, TTBR0_EL2));
558 break;
559 default:
560 BUG();
561 }
562
563 wr->asid = FIELD_GET(TTBR_ASID_MASK, asid_ttbr);
564 if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) ||
565 !(tcr & TCR_ASID16))
566 wr->asid &= GENMASK(7, 0);
567 }
568
569 return 0;
570
571 addrsz:
572 fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), false);
573 return -EINVAL;
574 transfault:
575 fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), false);
576 return -ENOENT;
577 }
578
579 struct mmu_config {
580 u64 ttbr0;
581 u64 ttbr1;
582 u64 tcr;
583 u64 mair;
584 u64 tcr2;
585 u64 pir;
586 u64 pire0;
587 u64 por_el0;
588 u64 por_el1;
589 u64 sctlr;
590 u64 vttbr;
591 u64 vtcr;
592 };
593
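/*
 * Save the EL1&0 translation controls (and the stage-2 VTTBR/VTCR)
 * that are currently live on the CPU, so they can be restored once the
 * AT emulation is done switching contexts.
 */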
static void __mmu_config_save(struct mmu_config *config)
595 {
596 config->ttbr0 = read_sysreg_el1(SYS_TTBR0);
597 config->ttbr1 = read_sysreg_el1(SYS_TTBR1);
598 config->tcr = read_sysreg_el1(SYS_TCR);
599 config->mair = read_sysreg_el1(SYS_MAIR);
600 if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
601 config->tcr2 = read_sysreg_el1(SYS_TCR2);
602 if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
603 config->pir = read_sysreg_el1(SYS_PIR);
604 config->pire0 = read_sysreg_el1(SYS_PIRE0);
605 }
606 if (system_supports_poe()) {
607 config->por_el1 = read_sysreg_el1(SYS_POR);
608 config->por_el0 = read_sysreg_s(SYS_POR_EL0);
609 }
610 }
611 config->sctlr = read_sysreg_el1(SYS_SCTLR);
612 config->vttbr = read_sysreg(vttbr_el2);
613 config->vtcr = read_sysreg(vtcr_el2);
614 }
615
static void __mmu_config_restore(struct mmu_config *config)
617 {
618 /*
619 * ARM errata 1165522 and 1530923 require TGE to be 1 before
620 * we update the guest state.
621 */
622 asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
623
624 write_sysreg_el1(config->ttbr0, SYS_TTBR0);
625 write_sysreg_el1(config->ttbr1, SYS_TTBR1);
626 write_sysreg_el1(config->tcr, SYS_TCR);
627 write_sysreg_el1(config->mair, SYS_MAIR);
628 if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
629 write_sysreg_el1(config->tcr2, SYS_TCR2);
630 if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
631 write_sysreg_el1(config->pir, SYS_PIR);
632 write_sysreg_el1(config->pire0, SYS_PIRE0);
633 }
634 if (system_supports_poe()) {
635 write_sysreg_el1(config->por_el1, SYS_POR);
636 write_sysreg_s(config->por_el0, SYS_POR_EL0);
637 }
638 }
639 write_sysreg_el1(config->sctlr, SYS_SCTLR);
640 write_sysreg(config->vttbr, vttbr_el2);
641 write_sysreg(config->vtcr, vtcr_el2);
642 }
643
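/*
 * Issue a PAN-aware AT (S1E1RP/S1E1WP) with PSTATE.PAN temporarily
 * reflecting the guest's PAN bit, restoring the host value afterwards.
 */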
static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
645 {
646 u64 host_pan;
647 bool fail;
648
649 host_pan = read_sysreg_s(SYS_PSTATE_PAN);
650 write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN);
651
652 switch (op) {
653 case OP_AT_S1E1RP:
654 fail = __kvm_at(OP_AT_S1E1RP, vaddr);
655 break;
656 case OP_AT_S1E1WP:
657 fail = __kvm_at(OP_AT_S1E1WP, vaddr);
658 break;
659 }
660
661 write_sysreg_s(host_pan, SYS_PSTATE_PAN);
662
663 return fail;
664 }
665
666 #define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic)
667 #define MEMATTR_NC 0b0100
668 #define MEMATTR_Wt 0b1000
669 #define MEMATTR_Wb 0b1100
670 #define MEMATTR_WbRaWa 0b1111
671
672 #define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0)
673
static u8 s2_memattr_to_attr(u8 memattr)
675 {
676 memattr &= 0b1111;
677
678 switch (memattr) {
679 case 0b0000:
680 case 0b0001:
681 case 0b0010:
682 case 0b0011:
683 return memattr << 2;
684 case 0b0100:
685 return MEMATTR(Wb, Wb);
686 case 0b0101:
687 return MEMATTR(NC, NC);
688 case 0b0110:
689 return MEMATTR(Wt, NC);
690 case 0b0111:
691 return MEMATTR(Wb, NC);
692 case 0b1000:
693 /* Reserved, assume NC */
694 return MEMATTR(NC, NC);
695 case 0b1001:
696 return MEMATTR(NC, Wt);
697 case 0b1010:
698 return MEMATTR(Wt, Wt);
699 case 0b1011:
700 return MEMATTR(Wb, Wt);
701 case 0b1100:
702 /* Reserved, assume NC */
703 return MEMATTR(NC, NC);
704 case 0b1101:
705 return MEMATTR(NC, Wb);
706 case 0b1110:
707 return MEMATTR(Wt, Wb);
708 case 0b1111:
709 return MEMATTR(Wb, Wb);
710 default:
711 unreachable();
712 }
713 }
714
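/*
 * Combine stage-1 and stage-2 Normal memory attributes: the least
 * cacheable of the two wins, and the allocation hints (and transience)
 * are inherited from stage 1.
 */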
static u8 combine_s1_s2_attr(u8 s1, u8 s2)
716 {
717 bool transient;
718 u8 final = 0;
719
720 /* Upgrade transient s1 to non-transient to simplify things */
721 switch (s1) {
722 case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */
723 transient = true;
724 s1 = MEMATTR_Wt | (s1 & GENMASK(1,0));
725 break;
726 case 0b0101 ... 0b0111: /* Normal, Write-Back Transient */
727 transient = true;
728 s1 = MEMATTR_Wb | (s1 & GENMASK(1,0));
729 break;
730 default:
731 transient = false;
732 }
733
734 /* S2CombineS1AttrHints() */
735 if ((s1 & GENMASK(3, 2)) == MEMATTR_NC ||
736 (s2 & GENMASK(3, 2)) == MEMATTR_NC)
737 final = MEMATTR_NC;
738 else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt ||
739 (s2 & GENMASK(3, 2)) == MEMATTR_Wt)
740 final = MEMATTR_Wt;
741 else
742 final = MEMATTR_Wb;
743
744 if (final != MEMATTR_NC) {
/* Inherit RaWa hints from S1 */
746 if (transient) {
747 switch (s1 & GENMASK(3, 2)) {
748 case MEMATTR_Wt:
749 final = 0;
750 break;
751 case MEMATTR_Wb:
752 final = MEMATTR_NC;
753 break;
754 }
755 }
756
757 final |= s1 & GENMASK(1, 0);
758 }
759
760 return final;
761 }
762
763 #define ATTR_NSH 0b00
764 #define ATTR_RSV 0b01
765 #define ATTR_OSH 0b10
766 #define ATTR_ISH 0b11
767
static u8 compute_final_sh(u8 attr, u8 sh)
769 {
770 /* Any form of device, as well as NC has SH[1:0]=0b10 */
771 if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC))
772 return ATTR_OSH;
773
774 if (sh == ATTR_RSV) /* Reserved, mapped to NSH */
775 sh = ATTR_NSH;
776
777 return sh;
778 }
779
static u8 compute_s1_sh(struct s1_walk_info *wi, struct s1_walk_result *wr,
                        u8 attr)
782 {
783 u8 sh;
784
785 /*
786 * non-52bit and LPA have their basic shareability described in the
787 * descriptor. LPA2 gets it from the corresponding field in TCR,
788 * conveniently recorded in the walk info.
789 */
790 if (!wi->pa52bit || BIT(wi->pgshift) == SZ_64K)
791 sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_SH, wr->desc);
792 else
793 sh = wi->sh;
794
795 return compute_final_sh(attr, sh);
796 }
797
static u8 combine_sh(u8 s1_sh, u8 s2_sh)
799 {
800 if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
801 return ATTR_OSH;
802 if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH)
803 return ATTR_ISH;
804
805 return ATTR_NSH;
806 }
807
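/*
 * Compute the PAR_EL1 value for a combined stage-1+2 translation,
 * merging the stage-1 PAR with the stage-2 memory attributes and
 * shareability, and taking HCR_EL2.{FWB,CD} into account.
 */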
static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
                           struct kvm_s2_trans *tr)
810 {
811 u8 s1_parattr, s2_memattr, final_attr, s2_sh;
812 u64 par;
813
814 /* If S2 has failed to translate, report the damage */
815 if (tr->esr) {
816 par = SYS_PAR_EL1_RES1;
817 par |= SYS_PAR_EL1_F;
818 par |= SYS_PAR_EL1_S;
819 par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr);
820 return par;
821 }
822
823 s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par);
824 s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc);
825
826 if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) {
827 if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP))
828 s2_memattr &= ~BIT(3);
829
830 /* Combination of R_VRJSW and R_RHWZM */
831 switch (s2_memattr) {
832 case 0b0101:
833 if (MEMATTR_IS_DEVICE(s1_parattr))
834 final_attr = s1_parattr;
835 else
836 final_attr = MEMATTR(NC, NC);
837 break;
838 case 0b0110:
839 case 0b1110:
840 final_attr = MEMATTR(WbRaWa, WbRaWa);
841 break;
842 case 0b0111:
843 case 0b1111:
844 /* Preserve S1 attribute */
845 final_attr = s1_parattr;
846 break;
847 case 0b0100:
848 case 0b1100:
849 case 0b1101:
850 /* Reserved, do something non-silly */
851 final_attr = s1_parattr;
852 break;
853 default:
854 /*
855 * MemAttr[2]=0, Device from S2.
856 *
857 * FWB does not influence the way that stage 1
858 * memory types and attributes are combined
859 * with stage 2 Device type and attributes.
860 */
861 final_attr = min(s2_memattr_to_attr(s2_memattr),
862 s1_parattr);
863 }
864 } else {
865 /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */
866 u8 s2_parattr = s2_memattr_to_attr(s2_memattr);
867
868 if (MEMATTR_IS_DEVICE(s1_parattr) ||
869 MEMATTR_IS_DEVICE(s2_parattr)) {
870 final_attr = min(s1_parattr, s2_parattr);
871 } else {
872 /* At this stage, this is memory vs memory */
873 final_attr = combine_s1_s2_attr(s1_parattr & 0xf,
874 s2_parattr & 0xf);
875 final_attr |= combine_s1_s2_attr(s1_parattr >> 4,
876 s2_parattr >> 4) << 4;
877 }
878 }
879
880 if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) &&
881 !MEMATTR_IS_DEVICE(final_attr))
882 final_attr = MEMATTR(NC, NC);
883
884 s2_sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_SH, tr->desc);
885
886 par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
887 par |= tr->output & GENMASK(47, 12);
888 par |= FIELD_PREP(SYS_PAR_EL1_SH,
889 combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
890 compute_final_sh(final_attr, s2_sh)));
891
892 return par;
893 }
894
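/*
 * Turn the result of a stage-1-only walk into a PAR_EL1 value,
 * handling the faulting, MMU-disabled and translated cases.
 */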
static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
                          struct s1_walk_result *wr)
897 {
898 u64 par;
899
900 if (wr->failed) {
901 par = SYS_PAR_EL1_RES1;
902 par |= SYS_PAR_EL1_F;
903 par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
904 par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
905 par |= wr->s2 ? SYS_PAR_EL1_S : 0;
906 } else if (wr->level == S1_MMU_DISABLED) {
907 /* MMU off or HCR_EL2.DC == 1 */
908 par = SYS_PAR_EL1_NSE;
909 par |= wr->pa & SYS_PAR_EL1_PA;
910
911 if (wi->regime == TR_EL10 && vcpu_has_nv(vcpu) &&
912 (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
913 par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
914 MEMATTR(WbRaWa, WbRaWa));
915 par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH);
916 } else {
917 par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
918 par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH);
919 }
920 } else {
921 u64 mair, sctlr;
922 u8 sh;
923
924 par = SYS_PAR_EL1_NSE;
925
926 mair = (wi->regime == TR_EL10 ?
927 vcpu_read_sys_reg(vcpu, MAIR_EL1) :
928 vcpu_read_sys_reg(vcpu, MAIR_EL2));
929
930 mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
931 mair &= 0xff;
932
933 sctlr = (wi->regime == TR_EL10 ?
934 vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
935 vcpu_read_sys_reg(vcpu, SCTLR_EL2));
936
937 /* Force NC for memory if SCTLR_ELx.C is clear */
938 if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
939 mair = MEMATTR(NC, NC);
940
941 par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
942 par |= wr->pa & SYS_PAR_EL1_PA;
943
944 sh = compute_s1_sh(wi, wr, mair);
945 par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
946 }
947
948 return par;
949 }
950
static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
952 {
953 u64 sctlr;
954
955 if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
956 return false;
957
958 if (s1pie_enabled(vcpu, regime))
959 return true;
960
961 if (regime == TR_EL10)
962 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
963 else
964 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
965
966 return sctlr & SCTLR_EL1_EPAN;
967 }
968
static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu,
                                          struct s1_walk_info *wi,
                                          struct s1_walk_result *wr)
972 {
973 bool wxn;
974
975 /* Non-hierarchical part of AArch64.S1DirectBasePermissions() */
976 if (wi->regime != TR_EL2) {
977 switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) {
978 case 0b00:
979 wr->pr = wr->pw = true;
980 wr->ur = wr->uw = false;
981 break;
982 case 0b01:
983 wr->pr = wr->pw = wr->ur = wr->uw = true;
984 break;
985 case 0b10:
986 wr->pr = true;
987 wr->pw = wr->ur = wr->uw = false;
988 break;
989 case 0b11:
990 wr->pr = wr->ur = true;
991 wr->pw = wr->uw = false;
992 break;
993 }
994
995 /* We don't use px for anything yet, but hey... */
996 wr->px = !((wr->desc & PTE_PXN) || wr->uw);
997 wr->ux = !(wr->desc & PTE_UXN);
998 } else {
999 wr->ur = wr->uw = wr->ux = false;
1000
1001 if (!(wr->desc & PTE_RDONLY)) {
1002 wr->pr = wr->pw = true;
1003 } else {
1004 wr->pr = true;
1005 wr->pw = false;
1006 }
1007
1008 /* XN maps to UXN */
1009 wr->px = !(wr->desc & PTE_UXN);
1010 }
1011
1012 switch (wi->regime) {
1013 case TR_EL2:
1014 case TR_EL20:
1015 wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN);
1016 break;
1017 case TR_EL10:
1018 wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN);
1019 break;
1020 }
1021
1022 wr->pwxn = wr->uwxn = wxn;
1023 wr->pov = wi->poe;
1024 wr->uov = wi->e0poe;
1025 }
1026
static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu,
                                                struct s1_walk_info *wi,
                                                struct s1_walk_result *wr)
1030 {
1031 /* Hierarchical part of AArch64.S1DirectBasePermissions() */
1032 if (wi->regime != TR_EL2) {
1033 switch (wr->APTable) {
1034 case 0b00:
1035 break;
1036 case 0b01:
1037 wr->ur = wr->uw = false;
1038 break;
1039 case 0b10:
1040 wr->pw = wr->uw = false;
1041 break;
1042 case 0b11:
1043 wr->pw = wr->ur = wr->uw = false;
1044 break;
1045 }
1046
1047 wr->px &= !wr->PXNTable;
1048 wr->ux &= !wr->UXNTable;
1049 } else {
1050 if (wr->APTable & BIT(1))
1051 wr->pw = false;
1052
1053 /* XN maps to UXN */
1054 wr->px &= !wr->UXNTable;
1055 }
1056 }
1057
1058 #define perm_idx(v, r, i) ((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf)
1059
1060 #define set_priv_perms(wr, r, w, x) \
1061 do { \
1062 (wr)->pr = (r); \
1063 (wr)->pw = (w); \
1064 (wr)->px = (x); \
1065 } while (0)
1066
1067 #define set_unpriv_perms(wr, r, w, x) \
1068 do { \
1069 (wr)->ur = (r); \
1070 (wr)->uw = (w); \
1071 (wr)->ux = (x); \
1072 } while (0)
1073
1074 #define set_priv_wxn(wr, v) \
1075 do { \
1076 (wr)->pwxn = (v); \
1077 } while (0)
1078
1079 #define set_unpriv_wxn(wr, v) \
1080 do { \
1081 (wr)->uwxn = (v); \
1082 } while (0)
1083
1084 /* Similar to AArch64.S1IndirectBasePermissions(), without GCS */
1085 #define set_perms(w, wr, ip) \
1086 do { \
1087 /* R_LLZDZ */ \
1088 switch ((ip)) { \
1089 case 0b0000: \
1090 set_ ## w ## _perms((wr), false, false, false); \
1091 break; \
1092 case 0b0001: \
1093 set_ ## w ## _perms((wr), true , false, false); \
1094 break; \
1095 case 0b0010: \
1096 set_ ## w ## _perms((wr), false, false, true ); \
1097 break; \
1098 case 0b0011: \
1099 set_ ## w ## _perms((wr), true , false, true ); \
1100 break; \
1101 case 0b0100: \
1102 set_ ## w ## _perms((wr), false, false, false); \
1103 break; \
1104 case 0b0101: \
1105 set_ ## w ## _perms((wr), true , true , false); \
1106 break; \
1107 case 0b0110: \
1108 set_ ## w ## _perms((wr), true , true , true ); \
1109 break; \
1110 case 0b0111: \
1111 set_ ## w ## _perms((wr), true , true , true ); \
1112 break; \
1113 case 0b1000: \
1114 set_ ## w ## _perms((wr), true , false, false); \
1115 break; \
1116 case 0b1001: \
1117 set_ ## w ## _perms((wr), true , false, false); \
1118 break; \
1119 case 0b1010: \
1120 set_ ## w ## _perms((wr), true , false, true ); \
1121 break; \
1122 case 0b1011: \
1123 set_ ## w ## _perms((wr), false, false, false); \
1124 break; \
1125 case 0b1100: \
1126 set_ ## w ## _perms((wr), true , true , false); \
1127 break; \
1128 case 0b1101: \
1129 set_ ## w ## _perms((wr), false, false, false); \
1130 break; \
1131 case 0b1110: \
1132 set_ ## w ## _perms((wr), true , true , true ); \
1133 break; \
1134 case 0b1111: \
1135 set_ ## w ## _perms((wr), false, false, false); \
1136 break; \
1137 } \
1138 \
1139 /* R_HJYGR */ \
1140 set_ ## w ## _wxn((wr), ((ip) == 0b0110)); \
1141 \
1142 } while (0)
1143
static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu,
                                            struct s1_walk_info *wi,
                                            struct s1_walk_result *wr)
1147 {
1148 u8 up, pp, idx;
1149
1150 idx = pte_pi_index(wr->desc);
1151
1152 switch (wi->regime) {
1153 case TR_EL10:
1154 pp = perm_idx(vcpu, PIR_EL1, idx);
1155 up = perm_idx(vcpu, PIRE0_EL1, idx);
1156 break;
1157 case TR_EL20:
1158 pp = perm_idx(vcpu, PIR_EL2, idx);
1159 up = perm_idx(vcpu, PIRE0_EL2, idx);
1160 break;
1161 case TR_EL2:
1162 pp = perm_idx(vcpu, PIR_EL2, idx);
1163 up = 0;
1164 break;
1165 }
1166
1167 set_perms(priv, wr, pp);
1168
1169 if (wi->regime != TR_EL2)
1170 set_perms(unpriv, wr, up);
1171 else
1172 set_unpriv_perms(wr, false, false, false);
1173
1174 wr->pov = wi->poe && !(pp & BIT(3));
1175 wr->uov = wi->e0poe && !(up & BIT(3));
1176
1177 /* R_VFPJF */
1178 if (wr->px && wr->uw) {
1179 set_priv_perms(wr, false, false, false);
1180 set_unpriv_perms(wr, false, false, false);
1181 }
1182 }
1183
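/*
 * Apply the POR_ELx permission overlays on top of the base
 * permissions, for whichever of the privileged/unprivileged overlays
 * is enabled.
 */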
static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu,
                                           struct s1_walk_info *wi,
                                           struct s1_walk_result *wr)
1187 {
1188 u8 idx, pov_perms, uov_perms;
1189
1190 idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc);
1191
1192 if (wr->pov) {
1193 switch (wi->regime) {
1194 case TR_EL10:
1195 pov_perms = perm_idx(vcpu, POR_EL1, idx);
1196 break;
1197 case TR_EL20:
1198 pov_perms = perm_idx(vcpu, POR_EL2, idx);
1199 break;
1200 case TR_EL2:
1201 pov_perms = perm_idx(vcpu, POR_EL2, idx);
1202 break;
1203 }
1204
1205 if (pov_perms & ~POE_RWX)
1206 pov_perms = POE_NONE;
1207
/* R_QXXPC, S1PrivOverlay enabled */
1209 if (wr->pwxn && (pov_perms & POE_X))
1210 pov_perms &= ~POE_W;
1211
1212 wr->pr &= pov_perms & POE_R;
1213 wr->pw &= pov_perms & POE_W;
1214 wr->px &= pov_perms & POE_X;
1215 }
1216
1217 if (wr->uov) {
1218 switch (wi->regime) {
1219 case TR_EL10:
1220 uov_perms = perm_idx(vcpu, POR_EL0, idx);
1221 break;
1222 case TR_EL20:
1223 uov_perms = perm_idx(vcpu, POR_EL0, idx);
1224 break;
1225 case TR_EL2:
1226 uov_perms = 0;
1227 break;
1228 }
1229
1230 if (uov_perms & ~POE_RWX)
1231 uov_perms = POE_NONE;
1232
1233 /* R_NPBXC, S1UnprivOverlay enabled */
1234 if (wr->uwxn && (uov_perms & POE_X))
1235 uov_perms &= ~POE_W;
1236
1237 wr->ur &= uov_perms & POE_R;
1238 wr->uw &= uov_perms & POE_W;
1239 wr->ux &= uov_perms & POE_X;
1240 }
1241 }
1242
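/*
 * Compute the final stage-1 permissions: direct or indirect (S1PIE)
 * base permissions, hierarchical restrictions, overlays (S1POE), and
 * the WXN and PAN overrides.
 */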
static void compute_s1_permissions(struct kvm_vcpu *vcpu,
                                   struct s1_walk_info *wi,
                                   struct s1_walk_result *wr)
1246 {
1247 bool pan;
1248
1249 if (!s1pie_enabled(vcpu, wi->regime))
1250 compute_s1_direct_permissions(vcpu, wi, wr);
1251 else
1252 compute_s1_indirect_permissions(vcpu, wi, wr);
1253
1254 if (!wi->hpd)
1255 compute_s1_hierarchical_permissions(vcpu, wi, wr);
1256
1257 compute_s1_overlay_permissions(vcpu, wi, wr);
1258
1259 /* R_QXXPC, S1PrivOverlay disabled */
1260 if (!wr->pov)
1261 wr->px &= !(wr->pwxn && wr->pw);
1262
1263 /* R_NPBXC, S1UnprivOverlay disabled */
1264 if (!wr->uov)
1265 wr->ux &= !(wr->uwxn && wr->uw);
1266
1267 pan = wi->pan && (wr->ur || wr->uw ||
1268 (pan3_enabled(vcpu, wi->regime) && wr->ux));
1269 wr->pw &= !pan;
1270 wr->pr &= !pan;
1271 }
1272
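/*
 * Emulate an AT instruction entirely in software by walking the
 * guest's stage-1 tables, returning the resulting PAR_EL1 value via
 * @par.
 */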
static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par)
1274 {
1275 struct s1_walk_result wr = {};
1276 struct s1_walk_info wi = {};
1277 bool perm_fail = false;
1278 int ret, idx;
1279
1280 wi.regime = compute_translation_regime(vcpu, op);
1281 wi.as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
1282 wi.pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) &&
1283 (*vcpu_cpsr(vcpu) & PSR_PAN_BIT);
1284
1285 ret = setup_s1_walk(vcpu, &wi, &wr, vaddr);
1286 if (ret)
1287 goto compute_par;
1288
1289 if (wr.level == S1_MMU_DISABLED)
1290 goto compute_par;
1291
1292 idx = srcu_read_lock(&vcpu->kvm->srcu);
1293
1294 ret = walk_s1(vcpu, &wi, &wr, vaddr);
1295
1296 srcu_read_unlock(&vcpu->kvm->srcu, idx);
1297
1298 /*
1299 * Race to update a descriptor -- restart the walk.
1300 */
1301 if (ret == -EAGAIN)
1302 return ret;
1303 if (ret)
1304 goto compute_par;
1305
1306 compute_s1_permissions(vcpu, &wi, &wr);
1307
1308 switch (op) {
1309 case OP_AT_S1E1RP:
1310 case OP_AT_S1E1R:
1311 case OP_AT_S1E2R:
1312 perm_fail = !wr.pr;
1313 break;
1314 case OP_AT_S1E1WP:
1315 case OP_AT_S1E1W:
1316 case OP_AT_S1E2W:
1317 perm_fail = !wr.pw;
1318 break;
1319 case OP_AT_S1E0R:
1320 perm_fail = !wr.ur;
1321 break;
1322 case OP_AT_S1E0W:
1323 perm_fail = !wr.uw;
1324 break;
1325 case OP_AT_S1E1A:
1326 case OP_AT_S1E2A:
1327 break;
1328 default:
1329 BUG();
1330 }
1331
1332 if (perm_fail)
1333 fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false);
1334
1335 compute_par:
1336 *par = compute_par_s1(vcpu, &wi, &wr);
1337 return 0;
1338 }
1339
1340 /*
1341 * Return the PAR_EL1 value as the result of a valid translation.
1342 *
1343 * If the translation is unsuccessful, the value may only contain
1344 * PAR_EL1.F, and cannot be taken at face value. It isn't an
1345 * indication of the translation having failed, only that the fast
1346 * path did not succeed, *unless* it indicates a S1 permission or
1347 * access fault.
1348 */
static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
1350 {
1351 struct mmu_config config;
1352 struct kvm_s2_mmu *mmu;
1353 bool fail, mmu_cs;
1354 u64 par;
1355
1356 par = SYS_PAR_EL1_F;
1357
1358 /*
1359 * We've trapped, so everything is live on the CPU. As we will
1360 * be switching contexts behind everybody's back, disable
1361 * interrupts while holding the mmu lock.
1362 */
1363 guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock);
1364
1365 /*
1366 * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already
1367 * the right one (as we trapped from vEL2). If not, save the
1368 * full MMU context.
1369 *
1370 * We are also guaranteed to be in the correct context if
1371 * we're not in a nested VM.
1372 */
1373 mmu_cs = (vcpu_has_nv(vcpu) &&
1374 !(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)));
1375 if (!mmu_cs)
1376 goto skip_mmu_switch;
1377
1378 /*
1379 * Obtaining the S2 MMU for a L2 is horribly racy, and we may not
1380 * find it (recycled by another vcpu, for example). When this
1381 * happens, admit defeat immediately and use the SW (slow) path.
1382 */
1383 mmu = lookup_s2_mmu(vcpu);
1384 if (!mmu)
1385 return par;
1386
1387 __mmu_config_save(&config);
1388
1389 write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0);
1390 write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1);
1391 write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR);
1392 write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR);
1393 if (kvm_has_tcr2(vcpu->kvm)) {
1394 write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2);
1395 if (kvm_has_s1pie(vcpu->kvm)) {
1396 write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR);
1397 write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0);
1398 }
1399 if (kvm_has_s1poe(vcpu->kvm)) {
1400 write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR);
1401 write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0);
1402 }
1403 }
1404 write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR);
1405 __load_stage2(mmu, mmu->arch);
1406
1407 skip_mmu_switch:
1408 /* Temporarily switch back to guest context */
1409 write_sysreg_hcr(vcpu->arch.hcr_el2);
1410 isb();
1411
1412 switch (op) {
1413 case OP_AT_S1E1RP:
1414 case OP_AT_S1E1WP:
1415 fail = at_s1e1p_fast(vcpu, op, vaddr);
1416 break;
1417 case OP_AT_S1E1R:
1418 fail = __kvm_at(OP_AT_S1E1R, vaddr);
1419 break;
1420 case OP_AT_S1E1W:
1421 fail = __kvm_at(OP_AT_S1E1W, vaddr);
1422 break;
1423 case OP_AT_S1E0R:
1424 fail = __kvm_at(OP_AT_S1E0R, vaddr);
1425 break;
1426 case OP_AT_S1E0W:
1427 fail = __kvm_at(OP_AT_S1E0W, vaddr);
1428 break;
1429 case OP_AT_S1E1A:
1430 fail = __kvm_at(OP_AT_S1E1A, vaddr);
1431 break;
1432 default:
1433 WARN_ON_ONCE(1);
1434 fail = true;
1435 break;
1436 }
1437
1438 if (!fail)
1439 par = read_sysreg_par();
1440
1441 write_sysreg_hcr(HCR_HOST_VHE_FLAGS);
1442
1443 if (mmu_cs)
1444 __mmu_config_restore(&config);
1445
1446 return par;
1447 }
1448
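/*
 * Helpers checking whether PAR_EL1 reports a stage-1 (not stage-2)
 * permission or access fault, in which case the hardware walk did
 * complete and the software walk can be skipped.
 */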
static bool par_check_s1_perm_fault(u64 par)
1450 {
1451 u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
1452
1453 return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
1454 !(par & SYS_PAR_EL1_S));
1455 }
1456
static bool par_check_s1_access_fault(u64 par)
1458 {
1459 u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
1460
1461 return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS &&
1462 !(par & SYS_PAR_EL1_S));
1463 }
1464
int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
1466 {
1467 u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
1468 int ret;
1469
1470 /*
1471 * If PAR_EL1 reports that AT failed on a S1 permission or access
1472 * fault, we know for sure that the PTW was able to walk the S1
1473 * tables and there's nothing else to do.
1474 *
1475 * If AT failed for any other reason, then we must walk the guest S1
1476 * to emulate the instruction.
1477 */
1478 if ((par & SYS_PAR_EL1_F) &&
1479 !par_check_s1_perm_fault(par) &&
1480 !par_check_s1_access_fault(par)) {
1481 ret = handle_at_slow(vcpu, op, vaddr, &par);
1482 if (ret)
1483 return ret;
1484 }
1485
1486 vcpu_write_sys_reg(vcpu, par, PAR_EL1);
1487 return 0;
1488 }
1489
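/*
 * Emulate AT S1E2*: tweak HCR_EL2 (TGE cleared, VM set, NV/NV1 set
 * when E2H is clear) and issue the corresponding S1E1 AT, falling back
 * to the software walk if the hardware translation fails for anything
 * other than a stage-1 permission fault.
 */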
int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
1491 {
1492 u64 par;
1493 int ret;
1494
1495 /*
1496 * We've trapped, so everything is live on the CPU. As we will be
1497 * switching context behind everybody's back, disable interrupts...
1498 */
1499 scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) {
1500 u64 val, hcr;
1501 bool fail;
1502
1503 val = hcr = read_sysreg(hcr_el2);
1504 val &= ~HCR_TGE;
1505 val |= HCR_VM;
1506
1507 if (!vcpu_el2_e2h_is_set(vcpu))
1508 val |= HCR_NV | HCR_NV1;
1509
1510 write_sysreg_hcr(val);
1511 isb();
1512
1513 par = SYS_PAR_EL1_F;
1514
1515 switch (op) {
1516 case OP_AT_S1E2R:
1517 fail = __kvm_at(OP_AT_S1E1R, vaddr);
1518 break;
1519 case OP_AT_S1E2W:
1520 fail = __kvm_at(OP_AT_S1E1W, vaddr);
1521 break;
1522 case OP_AT_S1E2A:
1523 fail = __kvm_at(OP_AT_S1E1A, vaddr);
1524 break;
1525 default:
1526 WARN_ON_ONCE(1);
1527 fail = true;
1528 }
1529
1530 isb();
1531
1532 if (!fail)
1533 par = read_sysreg_par();
1534
1535 write_sysreg_hcr(hcr);
1536 isb();
1537 }
1538
1539 /* We failed the translation, let's replay it in slow motion */
1540 if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) {
1541 ret = handle_at_slow(vcpu, op, vaddr, &par);
1542 if (ret)
1543 return ret;
1544 }
1545
1546 vcpu_write_sys_reg(vcpu, par, PAR_EL1);
1547 return 0;
1548 }
1549
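/*
 * Emulate an AT S12E* instruction: perform the stage-1 AT first, then
 * run the resulting IPA through the emulated stage-2 and merge both
 * results into PAR_EL1.
 */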
int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
1551 {
1552 struct kvm_s2_trans out = {};
1553 u64 ipa, par;
1554 bool write;
1555 int ret;
1556
1557 /* Do the stage-1 translation */
1558 switch (op) {
1559 case OP_AT_S12E1R:
1560 op = OP_AT_S1E1R;
1561 write = false;
1562 break;
1563 case OP_AT_S12E1W:
1564 op = OP_AT_S1E1W;
1565 write = true;
1566 break;
1567 case OP_AT_S12E0R:
1568 op = OP_AT_S1E0R;
1569 write = false;
1570 break;
1571 case OP_AT_S12E0W:
1572 op = OP_AT_S1E0W;
1573 write = true;
1574 break;
1575 default:
1576 WARN_ON_ONCE(1);
1577 return 0;
1578 }
1579
1580 __kvm_at_s1e01(vcpu, op, vaddr);
1581 par = vcpu_read_sys_reg(vcpu, PAR_EL1);
1582 if (par & SYS_PAR_EL1_F)
1583 return 0;
1584
1585 /*
1586 * If we only have a single stage of translation (EL2&0), exit
1587 * early. Same thing if {VM,DC}=={0,0}.
1588 */
1589 if (compute_translation_regime(vcpu, op) == TR_EL20 ||
1590 !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
1591 return 0;
1592
1593 /* Do the stage-2 translation */
1594 ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
1595 out.esr = 0;
1596 ret = kvm_walk_nested_s2(vcpu, ipa, &out);
1597 if (ret < 0)
1598 return ret;
1599
1600 /* Check the access permission */
1601 if (!out.esr &&
1602 ((!write && !out.readable) || (write && !out.writable)))
1603 out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);
1604
1605 par = compute_par_s12(vcpu, par, &out);
1606 vcpu_write_sys_reg(vcpu, par, PAR_EL1);
1607 return 0;
1608 }
1609
1610 /*
1611 * Translate a VA for a given EL in a given translation regime, with
1612 * or without PAN. This requires wi->{regime, as_el0, pan} to be
1613 * set. The rest of the wi and wr should be 0-initialised.
1614 */
int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
                       struct s1_walk_result *wr, u64 va)
1617 {
1618 int ret;
1619
1620 ret = setup_s1_walk(vcpu, wi, wr, va);
1621 if (ret)
1622 return ret;
1623
1624 if (wr->level == S1_MMU_DISABLED) {
1625 wr->ur = wr->uw = wr->ux = true;
1626 wr->pr = wr->pw = wr->px = true;
1627 } else {
1628 ret = walk_s1(vcpu, wi, wr, va);
1629 if (ret)
1630 return ret;
1631
1632 compute_s1_permissions(vcpu, wi, wr);
1633 }
1634
1635 return 0;
1636 }
1637
1638 struct desc_match {
1639 u64 ipa;
1640 int level;
1641 };
1642
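/*
 * Walk filter callback: interrupt the walk (by returning -EINTR) when
 * the table being accessed matches the IPA we are looking for, and
 * record the level at which it was found.
 */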
static int match_s1_desc(struct s1_walk_context *ctxt, void *priv)
1644 {
1645 struct desc_match *dm = priv;
1646 u64 ipa = dm->ipa;
1647
1648 /* Use S1 granule alignment */
1649 ipa &= GENMASK(51, ctxt->wi->pgshift);
1650
1651 /* Not the IPA we're looking for? Continue. */
1652 if (ipa != ctxt->table_ipa)
1653 return 0;
1654
1655 /* Note the level and interrupt the walk */
1656 dm->level = ctxt->level;
1657 return -EINTR;
1658 }
1659
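/*
 * Find the level at which a given table IPA is used when translating
 * @va, by walking the guest's stage-1 tables with a filter that stops
 * on the first match.
 */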
int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level)
1661 {
1662 struct desc_match dm = {
1663 .ipa = ipa,
1664 };
1665 struct s1_walk_info wi = {
1666 .filter = &(struct s1_walk_filter){
1667 .fn = match_s1_desc,
1668 .priv = &dm,
1669 },
1670 .as_el0 = false,
1671 .pan = false,
1672 };
1673 struct s1_walk_result wr = {};
1674 int ret;
1675
1676 if (is_hyp_ctxt(vcpu))
1677 wi.regime = vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
1678 else
1679 wi.regime = TR_EL10;
1680
1681 ret = setup_s1_walk(vcpu, &wi, &wr, va);
1682 if (ret)
1683 return ret;
1684
1685 /* We really expect the S1 MMU to be on here... */
1686 if (WARN_ON_ONCE(wr.level == S1_MMU_DISABLED)) {
1687 *level = 0;
1688 return 0;
1689 }
1690
1691 /* Walk the guest's PT, looking for a match along the way */
1692 ret = walk_s1(vcpu, &wi, &wr, va);
1693 switch (ret) {
1694 case -EINTR:
1695 /* We interrupted the walk on a match, return the level */
1696 *level = dm.level;
1697 return 0;
1698 case 0:
1699 /* The walk completed, we failed to find the entry */
1700 return -ENOENT;
1701 default:
1702 /* Any other error... */
1703 return ret;
1704 }
1705 }
1706
1707 #ifdef CONFIG_ARM64_LSE_ATOMICS
static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new)
1709 {
1710 u64 tmp = old;
1711 int ret = 0;
1712
1713 uaccess_enable_privileged();
1714
1715 asm volatile(__LSE_PREAMBLE
1716 "1: cas %[old], %[new], %[addr]\n"
1717 "2:\n"
1718 _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret])
1719 : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret)
1720 : [new] "r" (new)
1721 : "memory");
1722
1723 uaccess_disable_privileged();
1724
1725 if (ret)
1726 return ret;
1727 if (tmp != old)
1728 return -EAGAIN;
1729
1730 return ret;
1731 }
1732 #else
static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new)
1734 {
1735 return -EINVAL;
1736 }
1737 #endif
1738
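/*
 * Exclusive load/store fallback for CPUs without LSE atomics:
 * compare-and-swap the descriptor through the userspace mapping,
 * returning -EAGAIN if the descriptor changed under our feet.
 */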
static int __llsc_swap_desc(u64 __user *ptep, u64 old, u64 new)
1740 {
1741 int ret = 1;
1742 u64 tmp;
1743
1744 uaccess_enable_privileged();
1745
1746 asm volatile("prfm pstl1strm, %[addr]\n"
1747 "1: ldxr %[tmp], %[addr]\n"
1748 "sub %[tmp], %[tmp], %[old]\n"
1749 "cbnz %[tmp], 3f\n"
1750 "2: stlxr %w[ret], %[new], %[addr]\n"
1751 "3:\n"
1752 _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret])
1753 _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret])
1754 : [ret] "+r" (ret), [addr] "+Q" (*ptep), [tmp] "=&r" (tmp)
1755 : [old] "r" (old), [new] "r" (new)
1756 : "memory");
1757
1758 uaccess_disable_privileged();
1759
1760 /* STLXR didn't update the descriptor, or the compare failed */
1761 if (ret == 1)
1762 return -EAGAIN;
1763
1764 return ret;
1765 }
1766
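/*
 * Update a guest stage-1 descriptor in place through the userspace
 * mapping of the corresponding memslot, marking the page dirty on
 * success.
 */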
int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new)
1768 {
1769 struct kvm_memory_slot *slot;
1770 unsigned long hva;
1771 u64 __user *ptep;
1772 bool writable;
1773 int offset;
1774 gfn_t gfn;
1775 int r;
1776
1777 lockdep_assert(srcu_read_lock_held(&kvm->srcu));
1778
1779 gfn = ipa >> PAGE_SHIFT;
1780 offset = offset_in_page(ipa);
1781 slot = gfn_to_memslot(kvm, gfn);
1782 hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
1783 if (kvm_is_error_hva(hva))
1784 return -EINVAL;
1785 if (!writable)
1786 return -EPERM;
1787
1788 ptep = (u64 __user *)hva + offset;
1789 if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS))
1790 r = __lse_swap_desc(ptep, old, new);
1791 else
1792 r = __llsc_swap_desc(ptep, old, new);
1793
1794 if (r < 0)
1795 return r;
1796
1797 mark_page_dirty_in_slot(kvm, slot, gfn);
1798 return 0;
1799 }
1800