1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2017 - Linaro Ltd 4 * Author: Jintack Lim <jintack.lim@linaro.org> 5 */ 6 7 #include <linux/kvm_host.h> 8 9 #include <asm/esr.h> 10 #include <asm/kvm_hyp.h> 11 #include <asm/kvm_mmu.h> 12 #include <asm/lsui.h> 13 14 static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw) 15 { 16 wr->fst = fst; 17 wr->ptw = s1ptw; 18 wr->s2 = s1ptw; 19 wr->failed = true; 20 } 21 22 #define S1_MMU_DISABLED (-127) 23 24 static int get_ia_size(struct s1_walk_info *wi) 25 { 26 return 64 - wi->txsz; 27 } 28 29 /* Return true if the IPA is out of the OA range */ 30 static bool check_output_size(u64 ipa, struct s1_walk_info *wi) 31 { 32 if (wi->pa52bit) 33 return wi->max_oa_bits < 52 && (ipa & GENMASK_ULL(51, wi->max_oa_bits)); 34 return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits)); 35 } 36 37 static bool has_52bit_pa(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, u64 tcr) 38 { 39 switch (BIT(wi->pgshift)) { 40 case SZ_64K: 41 default: /* IMPDEF: treat any other value as 64k */ 42 if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52)) 43 return false; 44 return ((wi->regime == TR_EL2 ? 45 FIELD_GET(TCR_EL2_PS_MASK, tcr) : 46 FIELD_GET(TCR_IPS_MASK, tcr)) == 0b0110); 47 case SZ_16K: 48 if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT)) 49 return false; 50 break; 51 case SZ_4K: 52 if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT)) 53 return false; 54 break; 55 } 56 57 return (tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS)); 58 } 59 60 static u64 desc_to_oa(struct s1_walk_info *wi, u64 desc) 61 { 62 u64 addr; 63 64 if (!wi->pa52bit) 65 return desc & GENMASK_ULL(47, wi->pgshift); 66 67 switch (BIT(wi->pgshift)) { 68 case SZ_4K: 69 case SZ_16K: 70 addr = desc & GENMASK_ULL(49, wi->pgshift); 71 addr |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, desc) << 50; 72 break; 73 case SZ_64K: 74 default: /* IMPDEF: treat any other value as 64k */ 75 addr = desc & GENMASK_ULL(47, wi->pgshift); 76 addr |= FIELD_GET(KVM_PTE_ADDR_51_48, desc) << 48; 77 break; 78 } 79 80 return addr; 81 } 82 83 /* Return the translation regime that applies to an AT instruction */ 84 static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op) 85 { 86 /* 87 * We only get here from guest EL2, so the translation 88 * regime AT applies to is solely defined by {E2H,TGE}. 89 */ 90 switch (op) { 91 case OP_AT_S1E2R: 92 case OP_AT_S1E2W: 93 case OP_AT_S1E2A: 94 return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2; 95 default: 96 return (vcpu_el2_e2h_is_set(vcpu) && 97 vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10; 98 } 99 } 100 101 static u64 effective_tcr2(struct kvm_vcpu *vcpu, enum trans_regime regime) 102 { 103 if (regime == TR_EL10) { 104 if (vcpu_has_nv(vcpu) && 105 !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En)) 106 return 0; 107 108 return vcpu_read_sys_reg(vcpu, TCR2_EL1); 109 } 110 111 return vcpu_read_sys_reg(vcpu, TCR2_EL2); 112 } 113 114 static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) 115 { 116 if (!kvm_has_s1pie(vcpu->kvm)) 117 return false; 118 119 /* Abuse TCR2_EL1_PIE and use it for EL2 as well */ 120 return effective_tcr2(vcpu, regime) & TCR2_EL1_PIE; 121 } 122 123 static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) 124 { 125 u64 val; 126 127 if (!kvm_has_s1poe(vcpu->kvm)) { 128 wi->poe = wi->e0poe = false; 129 return; 130 } 131 132 val = effective_tcr2(vcpu, wi->regime); 133 134 /* Abuse TCR2_EL1_* for EL2 */ 135 wi->poe = val & TCR2_EL1_POE; 136 wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE); 137 } 138 139 #define _has_tgran(__r, __sz) \ 140 ({ \ 141 u64 _s1, _mmfr0 = __r; \ 142 \ 143 _s1 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \ 144 TGRAN##__sz, _mmfr0); \ 145 \ 146 _s1 != ID_AA64MMFR0_EL1_TGRAN##__sz##_NI; \ 147 }) 148 149 static bool has_tgran(u64 mmfr0, unsigned int shift) 150 { 151 switch (shift) { 152 case 12: 153 return _has_tgran(mmfr0, 4); 154 case 14: 155 return _has_tgran(mmfr0, 16); 156 case 16: 157 return _has_tgran(mmfr0, 64); 158 default: 159 BUG(); 160 } 161 } 162 163 static unsigned int tcr_to_tg0_pgshift(u64 tcr) 164 { 165 u64 tg0 = tcr & TCR_TG0_MASK; 166 167 switch (tg0) { 168 case TCR_TG0_4K: 169 return 12; 170 case TCR_TG0_16K: 171 return 14; 172 case TCR_TG0_64K: 173 default: /* IMPDEF: treat any other value as 64k */ 174 return 16; 175 } 176 } 177 178 static unsigned int tcr_to_tg1_pgshift(u64 tcr) 179 { 180 u64 tg1 = tcr & TCR_TG1_MASK; 181 182 switch (tg1) { 183 case TCR_TG1_4K: 184 return 12; 185 case TCR_TG1_16K: 186 return 14; 187 case TCR_TG1_64K: 188 default: /* IMPDEF: treat any other value as 64k */ 189 return 16; 190 } 191 } 192 193 static unsigned int fallback_tgran_shift(u64 mmfr0) 194 { 195 if (has_tgran(mmfr0, PAGE_SHIFT)) 196 return PAGE_SHIFT; 197 else if (has_tgran(mmfr0, 12)) 198 return 12; 199 else if (has_tgran(mmfr0, 14)) 200 return 14; 201 else if (has_tgran(mmfr0, 16)) 202 return 16; 203 else /* Should be unreacheable */ 204 return PAGE_SHIFT; 205 } 206 207 static unsigned int tcr_tg_pgshift(struct kvm *kvm, u64 tcr, bool upper_range) 208 { 209 u64 mmfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64MMFR0_EL1); 210 unsigned int shift; 211 212 /* Someone was silly enough to encode TG0/TG1 differently */ 213 if (upper_range) 214 shift = tcr_to_tg1_pgshift(tcr); 215 else 216 shift = tcr_to_tg0_pgshift(tcr); 217 218 /* 219 * If TGx is programmed to an unimplemented value (not advertised in 220 * ID_AA64MMFR0_EL1), we should treat it as if an implemented value is 221 * written, as per the architecture. Choose an available one while 222 * prioritizing PAGE_SIZE. 223 */ 224 if (!has_tgran(mmfr0, shift)) 225 return fallback_tgran_shift(mmfr0); 226 227 return shift; 228 } 229 230 static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, 231 struct s1_walk_result *wr, u64 va) 232 { 233 u64 hcr, sctlr, tcr, ps, ia_bits, ttbr; 234 unsigned int stride, x; 235 bool va55, tbi, lva, upper_range; 236 237 va55 = va & BIT(55); 238 upper_range = va55 && wi->regime != TR_EL2; 239 240 if (vcpu_has_nv(vcpu)) { 241 hcr = __vcpu_sys_reg(vcpu, HCR_EL2); 242 wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC)); 243 } else { 244 WARN_ON_ONCE(wi->regime != TR_EL10); 245 wi->s2 = false; 246 hcr = 0; 247 } 248 249 switch (wi->regime) { 250 case TR_EL10: 251 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); 252 tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); 253 ttbr = (va55 ? 254 vcpu_read_sys_reg(vcpu, TTBR1_EL1) : 255 vcpu_read_sys_reg(vcpu, TTBR0_EL1)); 256 break; 257 case TR_EL2: 258 case TR_EL20: 259 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); 260 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); 261 ttbr = (va55 ? 262 vcpu_read_sys_reg(vcpu, TTBR1_EL2) : 263 vcpu_read_sys_reg(vcpu, TTBR0_EL2)); 264 break; 265 default: 266 BUG(); 267 } 268 269 if (upper_range) 270 wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); 271 else 272 wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); 273 274 wi->pgshift = tcr_tg_pgshift(vcpu->kvm, tcr, upper_range); 275 wi->pa52bit = has_52bit_pa(vcpu, wi, tcr); 276 277 ia_bits = get_ia_size(wi); 278 279 /* AArch64.S1StartLevel() */ 280 stride = wi->pgshift - 3; 281 wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); 282 283 if (wi->regime == TR_EL2 && va55) 284 goto addrsz; 285 286 tbi = (wi->regime == TR_EL2 ? 287 FIELD_GET(TCR_EL2_TBI, tcr) : 288 (va55 ? 289 FIELD_GET(TCR_TBI1, tcr) : 290 FIELD_GET(TCR_TBI0, tcr))); 291 292 if (!tbi && (u64)sign_extend64(va, 55) != va) 293 goto addrsz; 294 295 wi->sh = (wi->regime == TR_EL2 ? 296 FIELD_GET(TCR_EL2_SH0_MASK, tcr) : 297 (va55 ? 298 FIELD_GET(TCR_SH1_MASK, tcr) : 299 FIELD_GET(TCR_SH0_MASK, tcr))); 300 301 va = (u64)sign_extend64(va, 55); 302 303 /* Let's put the MMU disabled case aside immediately */ 304 switch (wi->regime) { 305 case TR_EL10: 306 /* 307 * If dealing with the EL1&0 translation regime, 3 things 308 * can disable the S1 translation: 309 * 310 * - HCR_EL2.DC = 1 311 * - HCR_EL2.{E2H,TGE} = {0,1} 312 * - SCTLR_EL1.M = 0 313 * 314 * The TGE part is interesting. If we have decided that this 315 * is EL1&0, then it means that either {E2H,TGE} == {1,0} or 316 * {0,x}, and we only need to test for TGE == 1. 317 */ 318 if (hcr & (HCR_DC | HCR_TGE)) { 319 wr->level = S1_MMU_DISABLED; 320 break; 321 } 322 fallthrough; 323 case TR_EL2: 324 case TR_EL20: 325 if (!(sctlr & SCTLR_ELx_M)) 326 wr->level = S1_MMU_DISABLED; 327 break; 328 } 329 330 if (wr->level == S1_MMU_DISABLED) { 331 if (va >= BIT(kvm_get_pa_bits(vcpu->kvm))) 332 goto addrsz; 333 334 wr->pa = va; 335 return 0; 336 } 337 338 wi->be = sctlr & SCTLR_ELx_EE; 339 340 wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP); 341 wi->hpd &= (wi->regime == TR_EL2 ? 342 FIELD_GET(TCR_EL2_HPD, tcr) : 343 (va55 ? 344 FIELD_GET(TCR_HPD1, tcr) : 345 FIELD_GET(TCR_HPD0, tcr))); 346 /* R_JHSVW */ 347 wi->hpd |= s1pie_enabled(vcpu, wi->regime); 348 349 /* Do we have POE? */ 350 compute_s1poe(vcpu, wi); 351 352 /* R_BVXDG */ 353 wi->hpd |= (wi->poe || wi->e0poe); 354 355 /* R_PLCGL, R_YXNYW */ 356 if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) { 357 if (wi->txsz > 39) 358 goto transfault; 359 } else { 360 if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47)) 361 goto transfault; 362 } 363 364 /* R_GTJBY, R_SXWGM */ 365 switch (BIT(wi->pgshift)) { 366 case SZ_4K: 367 case SZ_16K: 368 lva = wi->pa52bit; 369 break; 370 case SZ_64K: 371 lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52); 372 break; 373 } 374 375 if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16)) 376 goto transfault; 377 378 /* R_YYVYV, I_THCZK */ 379 if ((!va55 && va > GENMASK(ia_bits - 1, 0)) || 380 (va55 && va < GENMASK(63, ia_bits))) 381 goto transfault; 382 383 /* I_ZFSYQ */ 384 if (wi->regime != TR_EL2 && 385 (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK))) 386 goto transfault; 387 388 /* R_BNDVG and following statements */ 389 if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) && 390 wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0))) 391 goto transfault; 392 393 ps = (wi->regime == TR_EL2 ? 394 FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr)); 395 396 wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps, wi->pa52bit)); 397 398 /* Compute minimal alignment */ 399 x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift); 400 401 wi->baddr = ttbr & TTBRx_EL1_BADDR; 402 if (wi->pa52bit) { 403 /* 404 * Force the alignment on 64 bytes for top-level tables 405 * smaller than 8 entries, since TTBR.BADDR[5:2] are used to 406 * store bits [51:48] of the first level of lookup. 407 */ 408 x = max(x, 6); 409 410 wi->baddr |= FIELD_GET(GENMASK_ULL(5, 2), ttbr) << 48; 411 } 412 413 /* R_VPBBF */ 414 if (check_output_size(wi->baddr, wi)) 415 goto addrsz; 416 417 wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); 418 419 wi->ha = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, AF); 420 wi->ha &= (wi->regime == TR_EL2 ? 421 FIELD_GET(TCR_EL2_HA, tcr) : 422 FIELD_GET(TCR_HA, tcr)); 423 424 return 0; 425 426 addrsz: 427 /* 428 * Address Size Fault level 0 to indicate it comes from TTBR. 429 * yes, this is an oddity. 430 */ 431 fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false); 432 return -EFAULT; 433 434 transfault: 435 /* Translation Fault on start level */ 436 fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(wi->sl), false); 437 return -EFAULT; 438 } 439 440 static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc, 441 struct s1_walk_info *wi) 442 { 443 u64 val; 444 int r; 445 446 r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val)); 447 if (r) 448 return r; 449 450 if (wi->be) 451 *desc = be64_to_cpu((__force __be64)val); 452 else 453 *desc = le64_to_cpu((__force __le64)val); 454 455 return 0; 456 } 457 458 static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new, 459 struct s1_walk_info *wi) 460 { 461 if (wi->be) { 462 old = (__force u64)cpu_to_be64(old); 463 new = (__force u64)cpu_to_be64(new); 464 } else { 465 old = (__force u64)cpu_to_le64(old); 466 new = (__force u64)cpu_to_le64(new); 467 } 468 469 return __kvm_at_swap_desc(vcpu->kvm, pa, old, new); 470 } 471 472 static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, 473 struct s1_walk_result *wr, u64 va) 474 { 475 u64 va_top, va_bottom, baddr, desc, new_desc, ipa; 476 struct kvm_s2_trans s2_trans = {}; 477 int level, stride, ret; 478 479 level = wi->sl; 480 stride = wi->pgshift - 3; 481 baddr = wi->baddr; 482 483 va_top = get_ia_size(wi) - 1; 484 485 while (1) { 486 u64 index; 487 488 va_bottom = (3 - level) * stride + wi->pgshift; 489 index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3); 490 491 ipa = baddr | index; 492 493 if (wi->s2) { 494 ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans); 495 if (ret == -EAGAIN) 496 return ret; 497 498 if (ret) { 499 fail_s1_walk(wr, 500 (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level, 501 true); 502 return ret; 503 } 504 505 if (!kvm_s2_trans_readable(&s2_trans)) { 506 fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), 507 true); 508 509 return -EPERM; 510 } 511 512 ipa = kvm_s2_trans_output(&s2_trans); 513 } 514 515 if (wi->filter) { 516 ret = wi->filter->fn(&(struct s1_walk_context) 517 { 518 .wi = wi, 519 .table_ipa = baddr, 520 .level = level, 521 }, wi->filter->priv); 522 if (ret) 523 return ret; 524 } 525 526 ret = kvm_read_s1_desc(vcpu, ipa, &desc, wi); 527 if (ret) { 528 fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false); 529 return ret; 530 } 531 532 new_desc = desc; 533 534 /* Invalid descriptor */ 535 if (!(desc & BIT(0))) 536 goto transfault; 537 538 /* Block mapping, check validity down the line */ 539 if (!(desc & BIT(1))) 540 break; 541 542 /* Page mapping */ 543 if (level == 3) 544 break; 545 546 /* Table handling */ 547 if (!wi->hpd) { 548 wr->APTable |= FIELD_GET(S1_TABLE_AP, desc); 549 wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc); 550 wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc); 551 } 552 553 baddr = desc_to_oa(wi, desc); 554 555 /* Check for out-of-range OA */ 556 if (check_output_size(baddr, wi)) 557 goto addrsz; 558 559 /* Prepare for next round */ 560 va_top = va_bottom - 1; 561 level++; 562 } 563 564 /* Block mapping, check the validity of the level */ 565 if (!(desc & BIT(1))) { 566 bool valid_block = false; 567 bool lpa = kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52); 568 569 switch (BIT(wi->pgshift)) { 570 case SZ_4K: 571 valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0); 572 break; 573 case SZ_16K: 574 valid_block = level == 2 || (wi->pa52bit && level == 1); 575 break; 576 case SZ_64K: 577 valid_block = level == 2 || (lpa && level == 1); 578 break; 579 } 580 581 if (!valid_block) 582 goto transfault; 583 } 584 585 baddr = desc_to_oa(wi, desc); 586 if (check_output_size(baddr & GENMASK(52, va_bottom), wi)) 587 goto addrsz; 588 589 if (wi->ha) 590 new_desc |= PTE_AF; 591 592 if (new_desc != desc) { 593 if (wi->s2 && !kvm_s2_trans_writable(&s2_trans)) { 594 fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), true); 595 return -EPERM; 596 } 597 598 ret = kvm_swap_s1_desc(vcpu, ipa, desc, new_desc, wi); 599 if (ret == -EAGAIN) 600 return ret; 601 if (ret) { 602 fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false); 603 return ret; 604 } 605 606 desc = new_desc; 607 } 608 609 if (!(desc & PTE_AF)) { 610 fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false); 611 return -EACCES; 612 } 613 614 va_bottom += contiguous_bit_shift(desc, wi, level); 615 616 wr->failed = false; 617 wr->level = level; 618 wr->desc = desc; 619 wr->pa = baddr & GENMASK(52, va_bottom); 620 wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0); 621 622 wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG); 623 if (wr->nG) 624 wr->asid = get_asid_by_regime(vcpu, wi->regime); 625 626 return 0; 627 628 addrsz: 629 fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), false); 630 return -EINVAL; 631 transfault: 632 fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), false); 633 return -ENOENT; 634 } 635 636 struct mmu_config { 637 u64 ttbr0; 638 u64 ttbr1; 639 u64 tcr; 640 u64 mair; 641 u64 tcr2; 642 u64 pir; 643 u64 pire0; 644 u64 por_el0; 645 u64 por_el1; 646 u64 sctlr; 647 u64 vttbr; 648 u64 vtcr; 649 }; 650 651 static void __mmu_config_save(struct mmu_config *config) 652 { 653 config->ttbr0 = read_sysreg_el1(SYS_TTBR0); 654 config->ttbr1 = read_sysreg_el1(SYS_TTBR1); 655 config->tcr = read_sysreg_el1(SYS_TCR); 656 config->mair = read_sysreg_el1(SYS_MAIR); 657 if (cpus_have_final_cap(ARM64_HAS_TCR2)) { 658 config->tcr2 = read_sysreg_el1(SYS_TCR2); 659 if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { 660 config->pir = read_sysreg_el1(SYS_PIR); 661 config->pire0 = read_sysreg_el1(SYS_PIRE0); 662 } 663 if (system_supports_poe()) { 664 config->por_el1 = read_sysreg_el1(SYS_POR); 665 config->por_el0 = read_sysreg_s(SYS_POR_EL0); 666 } 667 } 668 config->sctlr = read_sysreg_el1(SYS_SCTLR); 669 config->vttbr = read_sysreg(vttbr_el2); 670 config->vtcr = read_sysreg(vtcr_el2); 671 } 672 673 static void __mmu_config_restore(struct mmu_config *config) 674 { 675 /* 676 * ARM errata 1165522 and 1530923 require TGE to be 1 before 677 * we update the guest state. 678 */ 679 asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); 680 681 write_sysreg_el1(config->ttbr0, SYS_TTBR0); 682 write_sysreg_el1(config->ttbr1, SYS_TTBR1); 683 write_sysreg_el1(config->tcr, SYS_TCR); 684 write_sysreg_el1(config->mair, SYS_MAIR); 685 if (cpus_have_final_cap(ARM64_HAS_TCR2)) { 686 write_sysreg_el1(config->tcr2, SYS_TCR2); 687 if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { 688 write_sysreg_el1(config->pir, SYS_PIR); 689 write_sysreg_el1(config->pire0, SYS_PIRE0); 690 } 691 if (system_supports_poe()) { 692 write_sysreg_el1(config->por_el1, SYS_POR); 693 write_sysreg_s(config->por_el0, SYS_POR_EL0); 694 } 695 } 696 write_sysreg_el1(config->sctlr, SYS_SCTLR); 697 write_sysreg(config->vttbr, vttbr_el2); 698 write_sysreg(config->vtcr, vtcr_el2); 699 } 700 701 static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) 702 { 703 u64 host_pan; 704 bool fail; 705 706 host_pan = read_sysreg_s(SYS_PSTATE_PAN); 707 write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN); 708 709 switch (op) { 710 case OP_AT_S1E1RP: 711 fail = __kvm_at(OP_AT_S1E1RP, vaddr); 712 break; 713 case OP_AT_S1E1WP: 714 fail = __kvm_at(OP_AT_S1E1WP, vaddr); 715 break; 716 } 717 718 write_sysreg_s(host_pan, SYS_PSTATE_PAN); 719 720 return fail; 721 } 722 723 #define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic) 724 #define MEMATTR_NC 0b0100 725 #define MEMATTR_Wt 0b1000 726 #define MEMATTR_Wb 0b1100 727 #define MEMATTR_WbRaWa 0b1111 728 729 #define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0) 730 731 static u8 s2_memattr_to_attr(u8 memattr) 732 { 733 memattr &= 0b1111; 734 735 switch (memattr) { 736 case 0b0000: 737 case 0b0001: 738 case 0b0010: 739 case 0b0011: 740 return memattr << 2; 741 case 0b0100: 742 return MEMATTR(Wb, Wb); 743 case 0b0101: 744 return MEMATTR(NC, NC); 745 case 0b0110: 746 return MEMATTR(Wt, NC); 747 case 0b0111: 748 return MEMATTR(Wb, NC); 749 case 0b1000: 750 /* Reserved, assume NC */ 751 return MEMATTR(NC, NC); 752 case 0b1001: 753 return MEMATTR(NC, Wt); 754 case 0b1010: 755 return MEMATTR(Wt, Wt); 756 case 0b1011: 757 return MEMATTR(Wb, Wt); 758 case 0b1100: 759 /* Reserved, assume NC */ 760 return MEMATTR(NC, NC); 761 case 0b1101: 762 return MEMATTR(NC, Wb); 763 case 0b1110: 764 return MEMATTR(Wt, Wb); 765 case 0b1111: 766 return MEMATTR(Wb, Wb); 767 default: 768 unreachable(); 769 } 770 } 771 772 static u8 combine_s1_s2_attr(u8 s1, u8 s2) 773 { 774 bool transient; 775 u8 final = 0; 776 777 /* Upgrade transient s1 to non-transient to simplify things */ 778 switch (s1) { 779 case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */ 780 transient = true; 781 s1 = MEMATTR_Wt | (s1 & GENMASK(1,0)); 782 break; 783 case 0b0101 ... 0b0111: /* Normal, Write-Back Transient */ 784 transient = true; 785 s1 = MEMATTR_Wb | (s1 & GENMASK(1,0)); 786 break; 787 default: 788 transient = false; 789 } 790 791 /* S2CombineS1AttrHints() */ 792 if ((s1 & GENMASK(3, 2)) == MEMATTR_NC || 793 (s2 & GENMASK(3, 2)) == MEMATTR_NC) 794 final = MEMATTR_NC; 795 else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt || 796 (s2 & GENMASK(3, 2)) == MEMATTR_Wt) 797 final = MEMATTR_Wt; 798 else 799 final = MEMATTR_Wb; 800 801 if (final != MEMATTR_NC) { 802 /* Inherit RaWa hints form S1 */ 803 if (transient) { 804 switch (s1 & GENMASK(3, 2)) { 805 case MEMATTR_Wt: 806 final = 0; 807 break; 808 case MEMATTR_Wb: 809 final = MEMATTR_NC; 810 break; 811 } 812 } 813 814 final |= s1 & GENMASK(1, 0); 815 } 816 817 return final; 818 } 819 820 #define ATTR_NSH 0b00 821 #define ATTR_RSV 0b01 822 #define ATTR_OSH 0b10 823 #define ATTR_ISH 0b11 824 825 static u8 compute_final_sh(u8 attr, u8 sh) 826 { 827 /* Any form of device, as well as NC has SH[1:0]=0b10 */ 828 if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC)) 829 return ATTR_OSH; 830 831 if (sh == ATTR_RSV) /* Reserved, mapped to NSH */ 832 sh = ATTR_NSH; 833 834 return sh; 835 } 836 837 static u8 compute_s1_sh(struct s1_walk_info *wi, struct s1_walk_result *wr, 838 u8 attr) 839 { 840 u8 sh; 841 842 /* 843 * non-52bit and LPA have their basic shareability described in the 844 * descriptor. LPA2 gets it from the corresponding field in TCR, 845 * conveniently recorded in the walk info. 846 */ 847 if (!wi->pa52bit || BIT(wi->pgshift) == SZ_64K) 848 sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_SH, wr->desc); 849 else 850 sh = wi->sh; 851 852 return compute_final_sh(attr, sh); 853 } 854 855 static u8 combine_sh(u8 s1_sh, u8 s2_sh) 856 { 857 if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH) 858 return ATTR_OSH; 859 if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH) 860 return ATTR_ISH; 861 862 return ATTR_NSH; 863 } 864 865 static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, 866 struct kvm_s2_trans *tr) 867 { 868 u8 s1_parattr, s2_memattr, final_attr, s2_sh; 869 u64 par; 870 871 /* If S2 has failed to translate, report the damage */ 872 if (tr->esr) { 873 par = SYS_PAR_EL1_RES1; 874 par |= SYS_PAR_EL1_F; 875 par |= SYS_PAR_EL1_S; 876 par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr); 877 return par; 878 } 879 880 s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par); 881 s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc); 882 883 if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) { 884 if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP)) 885 s2_memattr &= ~BIT(3); 886 887 /* Combination of R_VRJSW and R_RHWZM */ 888 switch (s2_memattr) { 889 case 0b0101: 890 if (MEMATTR_IS_DEVICE(s1_parattr)) 891 final_attr = s1_parattr; 892 else 893 final_attr = MEMATTR(NC, NC); 894 break; 895 case 0b0110: 896 case 0b1110: 897 final_attr = MEMATTR(WbRaWa, WbRaWa); 898 break; 899 case 0b0111: 900 case 0b1111: 901 /* Preserve S1 attribute */ 902 final_attr = s1_parattr; 903 break; 904 case 0b0100: 905 case 0b1100: 906 case 0b1101: 907 /* Reserved, do something non-silly */ 908 final_attr = s1_parattr; 909 break; 910 default: 911 /* 912 * MemAttr[2]=0, Device from S2. 913 * 914 * FWB does not influence the way that stage 1 915 * memory types and attributes are combined 916 * with stage 2 Device type and attributes. 917 */ 918 final_attr = min(s2_memattr_to_attr(s2_memattr), 919 s1_parattr); 920 } 921 } else { 922 /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */ 923 u8 s2_parattr = s2_memattr_to_attr(s2_memattr); 924 925 if (MEMATTR_IS_DEVICE(s1_parattr) || 926 MEMATTR_IS_DEVICE(s2_parattr)) { 927 final_attr = min(s1_parattr, s2_parattr); 928 } else { 929 /* At this stage, this is memory vs memory */ 930 final_attr = combine_s1_s2_attr(s1_parattr & 0xf, 931 s2_parattr & 0xf); 932 final_attr |= combine_s1_s2_attr(s1_parattr >> 4, 933 s2_parattr >> 4) << 4; 934 } 935 } 936 937 if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) && 938 !MEMATTR_IS_DEVICE(final_attr)) 939 final_attr = MEMATTR(NC, NC); 940 941 s2_sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_SH, tr->desc); 942 943 par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr); 944 par |= tr->output & GENMASK(47, 12); 945 par |= FIELD_PREP(SYS_PAR_EL1_SH, 946 combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par), 947 compute_final_sh(final_attr, s2_sh))); 948 949 return par; 950 } 951 952 static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, 953 struct s1_walk_result *wr) 954 { 955 u64 par; 956 957 if (wr->failed) { 958 par = SYS_PAR_EL1_RES1; 959 par |= SYS_PAR_EL1_F; 960 par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst); 961 par |= wr->ptw ? SYS_PAR_EL1_PTW : 0; 962 par |= wr->s2 ? SYS_PAR_EL1_S : 0; 963 } else if (wr->level == S1_MMU_DISABLED) { 964 /* MMU off or HCR_EL2.DC == 1 */ 965 par = SYS_PAR_EL1_NSE; 966 par |= wr->pa & SYS_PAR_EL1_PA; 967 968 if (wi->regime == TR_EL10 && vcpu_has_nv(vcpu) && 969 (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) { 970 par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 971 MEMATTR(WbRaWa, WbRaWa)); 972 par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH); 973 } else { 974 par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */ 975 par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH); 976 } 977 } else { 978 u64 mair, sctlr; 979 u8 sh; 980 981 par = SYS_PAR_EL1_NSE; 982 983 mair = (wi->regime == TR_EL10 ? 984 vcpu_read_sys_reg(vcpu, MAIR_EL1) : 985 vcpu_read_sys_reg(vcpu, MAIR_EL2)); 986 987 mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8; 988 mair &= 0xff; 989 990 sctlr = (wi->regime == TR_EL10 ? 991 vcpu_read_sys_reg(vcpu, SCTLR_EL1) : 992 vcpu_read_sys_reg(vcpu, SCTLR_EL2)); 993 994 /* Force NC for memory if SCTLR_ELx.C is clear */ 995 if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair)) 996 mair = MEMATTR(NC, NC); 997 998 par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair); 999 par |= wr->pa & SYS_PAR_EL1_PA; 1000 1001 sh = compute_s1_sh(wi, wr, mair); 1002 par |= FIELD_PREP(SYS_PAR_EL1_SH, sh); 1003 } 1004 1005 return par; 1006 } 1007 1008 static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) 1009 { 1010 u64 sctlr; 1011 1012 if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3)) 1013 return false; 1014 1015 if (s1pie_enabled(vcpu, regime)) 1016 return true; 1017 1018 if (regime == TR_EL10) 1019 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); 1020 else 1021 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); 1022 1023 return sctlr & SCTLR_EL1_EPAN; 1024 } 1025 1026 static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu, 1027 struct s1_walk_info *wi, 1028 struct s1_walk_result *wr) 1029 { 1030 bool wxn; 1031 1032 /* Non-hierarchical part of AArch64.S1DirectBasePermissions() */ 1033 if (wi->regime != TR_EL2) { 1034 switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) { 1035 case 0b00: 1036 wr->pr = wr->pw = true; 1037 wr->ur = wr->uw = false; 1038 break; 1039 case 0b01: 1040 wr->pr = wr->pw = wr->ur = wr->uw = true; 1041 break; 1042 case 0b10: 1043 wr->pr = true; 1044 wr->pw = wr->ur = wr->uw = false; 1045 break; 1046 case 0b11: 1047 wr->pr = wr->ur = true; 1048 wr->pw = wr->uw = false; 1049 break; 1050 } 1051 1052 /* We don't use px for anything yet, but hey... */ 1053 wr->px = !((wr->desc & PTE_PXN) || wr->uw); 1054 wr->ux = !(wr->desc & PTE_UXN); 1055 } else { 1056 wr->ur = wr->uw = wr->ux = false; 1057 1058 if (!(wr->desc & PTE_RDONLY)) { 1059 wr->pr = wr->pw = true; 1060 } else { 1061 wr->pr = true; 1062 wr->pw = false; 1063 } 1064 1065 /* XN maps to UXN */ 1066 wr->px = !(wr->desc & PTE_UXN); 1067 } 1068 1069 switch (wi->regime) { 1070 case TR_EL2: 1071 case TR_EL20: 1072 wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN); 1073 break; 1074 case TR_EL10: 1075 wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN); 1076 break; 1077 } 1078 1079 wr->pwxn = wr->uwxn = wxn; 1080 wr->pov = wi->poe; 1081 wr->uov = wi->e0poe; 1082 } 1083 1084 static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu, 1085 struct s1_walk_info *wi, 1086 struct s1_walk_result *wr) 1087 { 1088 /* Hierarchical part of AArch64.S1DirectBasePermissions() */ 1089 if (wi->regime != TR_EL2) { 1090 switch (wr->APTable) { 1091 case 0b00: 1092 break; 1093 case 0b01: 1094 wr->ur = wr->uw = false; 1095 break; 1096 case 0b10: 1097 wr->pw = wr->uw = false; 1098 break; 1099 case 0b11: 1100 wr->pw = wr->ur = wr->uw = false; 1101 break; 1102 } 1103 1104 wr->px &= !wr->PXNTable; 1105 wr->ux &= !wr->UXNTable; 1106 } else { 1107 if (wr->APTable & BIT(1)) 1108 wr->pw = false; 1109 1110 /* XN maps to UXN */ 1111 wr->px &= !wr->UXNTable; 1112 } 1113 } 1114 1115 #define perm_idx(v, r, i) ((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf) 1116 1117 #define set_priv_perms(wr, r, w, x) \ 1118 do { \ 1119 (wr)->pr = (r); \ 1120 (wr)->pw = (w); \ 1121 (wr)->px = (x); \ 1122 } while (0) 1123 1124 #define set_unpriv_perms(wr, r, w, x) \ 1125 do { \ 1126 (wr)->ur = (r); \ 1127 (wr)->uw = (w); \ 1128 (wr)->ux = (x); \ 1129 } while (0) 1130 1131 #define set_priv_wxn(wr, v) \ 1132 do { \ 1133 (wr)->pwxn = (v); \ 1134 } while (0) 1135 1136 #define set_unpriv_wxn(wr, v) \ 1137 do { \ 1138 (wr)->uwxn = (v); \ 1139 } while (0) 1140 1141 /* Similar to AArch64.S1IndirectBasePermissions(), without GCS */ 1142 #define set_perms(w, wr, ip) \ 1143 do { \ 1144 /* R_LLZDZ */ \ 1145 switch ((ip)) { \ 1146 case 0b0000: \ 1147 set_ ## w ## _perms((wr), false, false, false); \ 1148 break; \ 1149 case 0b0001: \ 1150 set_ ## w ## _perms((wr), true , false, false); \ 1151 break; \ 1152 case 0b0010: \ 1153 set_ ## w ## _perms((wr), false, false, true ); \ 1154 break; \ 1155 case 0b0011: \ 1156 set_ ## w ## _perms((wr), true , false, true ); \ 1157 break; \ 1158 case 0b0100: \ 1159 set_ ## w ## _perms((wr), false, false, false); \ 1160 break; \ 1161 case 0b0101: \ 1162 set_ ## w ## _perms((wr), true , true , false); \ 1163 break; \ 1164 case 0b0110: \ 1165 set_ ## w ## _perms((wr), true , true , true ); \ 1166 break; \ 1167 case 0b0111: \ 1168 set_ ## w ## _perms((wr), true , true , true ); \ 1169 break; \ 1170 case 0b1000: \ 1171 set_ ## w ## _perms((wr), true , false, false); \ 1172 break; \ 1173 case 0b1001: \ 1174 set_ ## w ## _perms((wr), true , false, false); \ 1175 break; \ 1176 case 0b1010: \ 1177 set_ ## w ## _perms((wr), true , false, true ); \ 1178 break; \ 1179 case 0b1011: \ 1180 set_ ## w ## _perms((wr), false, false, false); \ 1181 break; \ 1182 case 0b1100: \ 1183 set_ ## w ## _perms((wr), true , true , false); \ 1184 break; \ 1185 case 0b1101: \ 1186 set_ ## w ## _perms((wr), false, false, false); \ 1187 break; \ 1188 case 0b1110: \ 1189 set_ ## w ## _perms((wr), true , true , true ); \ 1190 break; \ 1191 case 0b1111: \ 1192 set_ ## w ## _perms((wr), false, false, false); \ 1193 break; \ 1194 } \ 1195 \ 1196 /* R_HJYGR */ \ 1197 set_ ## w ## _wxn((wr), ((ip) == 0b0110)); \ 1198 \ 1199 } while (0) 1200 1201 static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu, 1202 struct s1_walk_info *wi, 1203 struct s1_walk_result *wr) 1204 { 1205 u8 up, pp, idx; 1206 1207 idx = pte_pi_index(wr->desc); 1208 1209 switch (wi->regime) { 1210 case TR_EL10: 1211 pp = perm_idx(vcpu, PIR_EL1, idx); 1212 up = perm_idx(vcpu, PIRE0_EL1, idx); 1213 break; 1214 case TR_EL20: 1215 pp = perm_idx(vcpu, PIR_EL2, idx); 1216 up = perm_idx(vcpu, PIRE0_EL2, idx); 1217 break; 1218 case TR_EL2: 1219 pp = perm_idx(vcpu, PIR_EL2, idx); 1220 up = 0; 1221 break; 1222 } 1223 1224 set_perms(priv, wr, pp); 1225 1226 if (wi->regime != TR_EL2) 1227 set_perms(unpriv, wr, up); 1228 else 1229 set_unpriv_perms(wr, false, false, false); 1230 1231 wr->pov = wi->poe && !(pp & BIT(3)); 1232 wr->uov = wi->e0poe && !(up & BIT(3)); 1233 1234 /* R_VFPJF */ 1235 if (wr->px && wr->uw) { 1236 set_priv_perms(wr, false, false, false); 1237 set_unpriv_perms(wr, false, false, false); 1238 } 1239 } 1240 1241 static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu, 1242 struct s1_walk_info *wi, 1243 struct s1_walk_result *wr) 1244 { 1245 u8 idx, pov_perms, uov_perms; 1246 1247 idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc); 1248 1249 if (wr->pov) { 1250 switch (wi->regime) { 1251 case TR_EL10: 1252 pov_perms = perm_idx(vcpu, POR_EL1, idx); 1253 break; 1254 case TR_EL20: 1255 pov_perms = perm_idx(vcpu, POR_EL2, idx); 1256 break; 1257 case TR_EL2: 1258 pov_perms = perm_idx(vcpu, POR_EL2, idx); 1259 break; 1260 } 1261 1262 if (pov_perms & ~POE_RWX) 1263 pov_perms = POE_NONE; 1264 1265 /* R_QXXPC, S1PrivOverflow enabled */ 1266 if (wr->pwxn && (pov_perms & POE_X)) 1267 pov_perms &= ~POE_W; 1268 1269 wr->pr &= pov_perms & POE_R; 1270 wr->pw &= pov_perms & POE_W; 1271 wr->px &= pov_perms & POE_X; 1272 } 1273 1274 if (wr->uov) { 1275 switch (wi->regime) { 1276 case TR_EL10: 1277 uov_perms = perm_idx(vcpu, POR_EL0, idx); 1278 break; 1279 case TR_EL20: 1280 uov_perms = perm_idx(vcpu, POR_EL0, idx); 1281 break; 1282 case TR_EL2: 1283 uov_perms = 0; 1284 break; 1285 } 1286 1287 if (uov_perms & ~POE_RWX) 1288 uov_perms = POE_NONE; 1289 1290 /* R_NPBXC, S1UnprivOverlay enabled */ 1291 if (wr->uwxn && (uov_perms & POE_X)) 1292 uov_perms &= ~POE_W; 1293 1294 wr->ur &= uov_perms & POE_R; 1295 wr->uw &= uov_perms & POE_W; 1296 wr->ux &= uov_perms & POE_X; 1297 } 1298 } 1299 1300 static void compute_s1_permissions(struct kvm_vcpu *vcpu, 1301 struct s1_walk_info *wi, 1302 struct s1_walk_result *wr) 1303 { 1304 bool pan; 1305 1306 if (!s1pie_enabled(vcpu, wi->regime)) 1307 compute_s1_direct_permissions(vcpu, wi, wr); 1308 else 1309 compute_s1_indirect_permissions(vcpu, wi, wr); 1310 1311 if (!wi->hpd) 1312 compute_s1_hierarchical_permissions(vcpu, wi, wr); 1313 1314 compute_s1_overlay_permissions(vcpu, wi, wr); 1315 1316 /* R_QXXPC, S1PrivOverlay disabled */ 1317 if (!wr->pov) 1318 wr->px &= !(wr->pwxn && wr->pw); 1319 1320 /* R_NPBXC, S1UnprivOverlay disabled */ 1321 if (!wr->uov) 1322 wr->ux &= !(wr->uwxn && wr->uw); 1323 1324 pan = wi->pan && (wr->ur || wr->uw || 1325 (pan3_enabled(vcpu, wi->regime) && wr->ux)); 1326 wr->pw &= !pan; 1327 wr->pr &= !pan; 1328 } 1329 1330 static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par) 1331 { 1332 struct s1_walk_result wr = {}; 1333 struct s1_walk_info wi = {}; 1334 bool perm_fail = false; 1335 int ret, idx; 1336 1337 wi.regime = compute_translation_regime(vcpu, op); 1338 wi.as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); 1339 wi.pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) && 1340 (*vcpu_cpsr(vcpu) & PSR_PAN_BIT); 1341 1342 ret = setup_s1_walk(vcpu, &wi, &wr, vaddr); 1343 if (ret) 1344 goto compute_par; 1345 1346 if (wr.level == S1_MMU_DISABLED) 1347 goto compute_par; 1348 1349 idx = srcu_read_lock(&vcpu->kvm->srcu); 1350 1351 ret = walk_s1(vcpu, &wi, &wr, vaddr); 1352 1353 srcu_read_unlock(&vcpu->kvm->srcu, idx); 1354 1355 /* 1356 * Race to update a descriptor -- restart the walk. 1357 */ 1358 if (ret == -EAGAIN) 1359 return ret; 1360 if (ret) 1361 goto compute_par; 1362 1363 compute_s1_permissions(vcpu, &wi, &wr); 1364 1365 switch (op) { 1366 case OP_AT_S1E1RP: 1367 case OP_AT_S1E1R: 1368 case OP_AT_S1E2R: 1369 perm_fail = !wr.pr; 1370 break; 1371 case OP_AT_S1E1WP: 1372 case OP_AT_S1E1W: 1373 case OP_AT_S1E2W: 1374 perm_fail = !wr.pw; 1375 break; 1376 case OP_AT_S1E0R: 1377 perm_fail = !wr.ur; 1378 break; 1379 case OP_AT_S1E0W: 1380 perm_fail = !wr.uw; 1381 break; 1382 case OP_AT_S1E1A: 1383 case OP_AT_S1E2A: 1384 break; 1385 default: 1386 BUG(); 1387 } 1388 1389 if (perm_fail) 1390 fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false); 1391 1392 compute_par: 1393 *par = compute_par_s1(vcpu, &wi, &wr); 1394 return 0; 1395 } 1396 1397 /* 1398 * Return the PAR_EL1 value as the result of a valid translation. 1399 * 1400 * If the translation is unsuccessful, the value may only contain 1401 * PAR_EL1.F, and cannot be taken at face value. It isn't an 1402 * indication of the translation having failed, only that the fast 1403 * path did not succeed, *unless* it indicates a S1 permission or 1404 * access fault. 1405 */ 1406 static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) 1407 { 1408 struct mmu_config config; 1409 struct kvm_s2_mmu *mmu; 1410 bool fail, mmu_cs; 1411 u64 par; 1412 1413 par = SYS_PAR_EL1_F; 1414 1415 /* 1416 * We've trapped, so everything is live on the CPU. As we will 1417 * be switching contexts behind everybody's back, disable 1418 * interrupts while holding the mmu lock. 1419 */ 1420 guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock); 1421 1422 /* 1423 * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already 1424 * the right one (as we trapped from vEL2). If not, save the 1425 * full MMU context. 1426 * 1427 * We are also guaranteed to be in the correct context if 1428 * we're not in a nested VM. 1429 */ 1430 mmu_cs = (vcpu_has_nv(vcpu) && 1431 !(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))); 1432 if (!mmu_cs) 1433 goto skip_mmu_switch; 1434 1435 /* 1436 * Obtaining the S2 MMU for a L2 is horribly racy, and we may not 1437 * find it (recycled by another vcpu, for example). When this 1438 * happens, admit defeat immediately and use the SW (slow) path. 1439 */ 1440 mmu = lookup_s2_mmu(vcpu); 1441 if (!mmu) 1442 return par; 1443 1444 __mmu_config_save(&config); 1445 1446 write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0); 1447 write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1); 1448 write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR); 1449 write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR); 1450 if (kvm_has_tcr2(vcpu->kvm)) { 1451 write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2); 1452 if (kvm_has_s1pie(vcpu->kvm)) { 1453 write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR); 1454 write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0); 1455 } 1456 if (kvm_has_s1poe(vcpu->kvm)) { 1457 write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR); 1458 write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0); 1459 } 1460 } 1461 write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR); 1462 __load_stage2(mmu); 1463 1464 skip_mmu_switch: 1465 /* Temporarily switch back to guest context */ 1466 write_sysreg_hcr(vcpu->arch.hcr_el2); 1467 isb(); 1468 1469 switch (op) { 1470 case OP_AT_S1E1RP: 1471 case OP_AT_S1E1WP: 1472 fail = at_s1e1p_fast(vcpu, op, vaddr); 1473 break; 1474 case OP_AT_S1E1R: 1475 fail = __kvm_at(OP_AT_S1E1R, vaddr); 1476 break; 1477 case OP_AT_S1E1W: 1478 fail = __kvm_at(OP_AT_S1E1W, vaddr); 1479 break; 1480 case OP_AT_S1E0R: 1481 fail = __kvm_at(OP_AT_S1E0R, vaddr); 1482 break; 1483 case OP_AT_S1E0W: 1484 fail = __kvm_at(OP_AT_S1E0W, vaddr); 1485 break; 1486 case OP_AT_S1E1A: 1487 fail = __kvm_at(OP_AT_S1E1A, vaddr); 1488 break; 1489 default: 1490 WARN_ON_ONCE(1); 1491 fail = true; 1492 break; 1493 } 1494 1495 if (!fail) 1496 par = read_sysreg_par(); 1497 1498 write_sysreg_hcr(HCR_HOST_VHE_FLAGS); 1499 1500 if (mmu_cs) 1501 __mmu_config_restore(&config); 1502 1503 return par; 1504 } 1505 1506 static bool par_check_s1_perm_fault(u64 par) 1507 { 1508 u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par); 1509 1510 return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM && 1511 !(par & SYS_PAR_EL1_S)); 1512 } 1513 1514 static bool par_check_s1_access_fault(u64 par) 1515 { 1516 u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par); 1517 1518 return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS && 1519 !(par & SYS_PAR_EL1_S)); 1520 } 1521 1522 int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) 1523 { 1524 u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr); 1525 int ret; 1526 1527 /* 1528 * If PAR_EL1 reports that AT failed on a S1 permission or access 1529 * fault, we know for sure that the PTW was able to walk the S1 1530 * tables and there's nothing else to do. 1531 * 1532 * If AT failed for any other reason, then we must walk the guest S1 1533 * to emulate the instruction. 1534 */ 1535 if ((par & SYS_PAR_EL1_F) && 1536 !par_check_s1_perm_fault(par) && 1537 !par_check_s1_access_fault(par)) { 1538 ret = handle_at_slow(vcpu, op, vaddr, &par); 1539 if (ret) 1540 return ret; 1541 } 1542 1543 vcpu_write_sys_reg(vcpu, par, PAR_EL1); 1544 return 0; 1545 } 1546 1547 int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) 1548 { 1549 u64 par; 1550 int ret; 1551 1552 /* 1553 * We've trapped, so everything is live on the CPU. As we will be 1554 * switching context behind everybody's back, disable interrupts... 1555 */ 1556 scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) { 1557 u64 val, hcr; 1558 bool fail; 1559 1560 val = hcr = read_sysreg(hcr_el2); 1561 val &= ~HCR_TGE; 1562 val |= HCR_VM; 1563 1564 if (!vcpu_el2_e2h_is_set(vcpu)) 1565 val |= HCR_NV | HCR_NV1; 1566 1567 write_sysreg_hcr(val); 1568 isb(); 1569 1570 par = SYS_PAR_EL1_F; 1571 1572 switch (op) { 1573 case OP_AT_S1E2R: 1574 fail = __kvm_at(OP_AT_S1E1R, vaddr); 1575 break; 1576 case OP_AT_S1E2W: 1577 fail = __kvm_at(OP_AT_S1E1W, vaddr); 1578 break; 1579 case OP_AT_S1E2A: 1580 fail = __kvm_at(OP_AT_S1E1A, vaddr); 1581 break; 1582 default: 1583 WARN_ON_ONCE(1); 1584 fail = true; 1585 } 1586 1587 if (!fail) 1588 par = read_sysreg_par(); 1589 1590 write_sysreg_hcr(hcr); 1591 isb(); 1592 } 1593 1594 /* We failed the translation, let's replay it in slow motion */ 1595 if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) { 1596 ret = handle_at_slow(vcpu, op, vaddr, &par); 1597 if (ret) 1598 return ret; 1599 } 1600 1601 vcpu_write_sys_reg(vcpu, par, PAR_EL1); 1602 return 0; 1603 } 1604 1605 int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) 1606 { 1607 struct kvm_s2_trans out = {}; 1608 u64 ipa, par; 1609 bool write; 1610 int ret; 1611 1612 /* Do the stage-1 translation */ 1613 switch (op) { 1614 case OP_AT_S12E1R: 1615 op = OP_AT_S1E1R; 1616 write = false; 1617 break; 1618 case OP_AT_S12E1W: 1619 op = OP_AT_S1E1W; 1620 write = true; 1621 break; 1622 case OP_AT_S12E0R: 1623 op = OP_AT_S1E0R; 1624 write = false; 1625 break; 1626 case OP_AT_S12E0W: 1627 op = OP_AT_S1E0W; 1628 write = true; 1629 break; 1630 default: 1631 WARN_ON_ONCE(1); 1632 return 0; 1633 } 1634 1635 ret = __kvm_at_s1e01(vcpu, op, vaddr); 1636 if (ret) 1637 return ret; 1638 1639 par = vcpu_read_sys_reg(vcpu, PAR_EL1); 1640 if (par & SYS_PAR_EL1_F) 1641 return 0; 1642 1643 /* 1644 * If we only have a single stage of translation (EL2&0), exit 1645 * early. Same thing if {VM,DC}=={0,0}. 1646 */ 1647 if (compute_translation_regime(vcpu, op) == TR_EL20 || 1648 !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC))) 1649 return 0; 1650 1651 /* Do the stage-2 translation */ 1652 ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0)); 1653 out.esr = 0; 1654 scoped_guard(srcu, &vcpu->kvm->srcu) 1655 ret = kvm_walk_nested_s2(vcpu, ipa, &out); 1656 if (ret < 0) 1657 return ret; 1658 1659 /* Check the access permission */ 1660 if (!out.esr && 1661 ((!write && !out.readable) || (write && !out.writable))) 1662 out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3); 1663 1664 par = compute_par_s12(vcpu, par, &out); 1665 vcpu_write_sys_reg(vcpu, par, PAR_EL1); 1666 return 0; 1667 } 1668 1669 /* 1670 * Translate a VA for a given EL in a given translation regime, with 1671 * or without PAN. This requires wi->{regime, as_el0, pan} to be 1672 * set. The rest of the wi and wr should be 0-initialised. 1673 */ 1674 int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, 1675 struct s1_walk_result *wr, u64 va) 1676 { 1677 int ret; 1678 1679 ret = setup_s1_walk(vcpu, wi, wr, va); 1680 if (ret) 1681 return ret; 1682 1683 if (wr->level == S1_MMU_DISABLED) { 1684 wr->ur = wr->uw = wr->ux = true; 1685 wr->pr = wr->pw = wr->px = true; 1686 } else { 1687 ret = walk_s1(vcpu, wi, wr, va); 1688 if (ret) 1689 return ret; 1690 1691 compute_s1_permissions(vcpu, wi, wr); 1692 } 1693 1694 return 0; 1695 } 1696 1697 struct desc_match { 1698 u64 ipa; 1699 int level; 1700 }; 1701 1702 static int match_s1_desc(struct s1_walk_context *ctxt, void *priv) 1703 { 1704 struct desc_match *dm = priv; 1705 u64 ipa = dm->ipa; 1706 1707 /* Use S1 granule alignment */ 1708 ipa &= GENMASK(51, ctxt->wi->pgshift); 1709 1710 /* Not the IPA we're looking for? Continue. */ 1711 if (ipa != ctxt->table_ipa) 1712 return 0; 1713 1714 /* Note the level and interrupt the walk */ 1715 dm->level = ctxt->level; 1716 return -EINTR; 1717 } 1718 1719 int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level) 1720 { 1721 struct desc_match dm = { 1722 .ipa = ipa, 1723 }; 1724 struct s1_walk_info wi = { 1725 .filter = &(struct s1_walk_filter){ 1726 .fn = match_s1_desc, 1727 .priv = &dm, 1728 }, 1729 .as_el0 = false, 1730 .pan = false, 1731 }; 1732 struct s1_walk_result wr = {}; 1733 int ret; 1734 1735 if (is_hyp_ctxt(vcpu)) 1736 wi.regime = vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2; 1737 else 1738 wi.regime = TR_EL10; 1739 1740 ret = setup_s1_walk(vcpu, &wi, &wr, va); 1741 if (ret) 1742 return ret; 1743 1744 /* We really expect the S1 MMU to be on here... */ 1745 if (WARN_ON_ONCE(wr.level == S1_MMU_DISABLED)) { 1746 *level = 0; 1747 return 0; 1748 } 1749 1750 /* Walk the guest's PT, looking for a match along the way */ 1751 scoped_guard(srcu, &vcpu->kvm->srcu) 1752 ret = walk_s1(vcpu, &wi, &wr, va); 1753 switch (ret) { 1754 case -EINTR: 1755 /* We interrupted the walk on a match, return the level */ 1756 *level = dm.level; 1757 return 0; 1758 case 0: 1759 /* The walk completed, we failed to find the entry */ 1760 return -ENOENT; 1761 default: 1762 /* Any other error... */ 1763 return ret; 1764 } 1765 } 1766 1767 static int __lsui_swap_desc(u64 __user *ptep, u64 old, u64 new) 1768 { 1769 u64 tmp = old; 1770 int ret = 0; 1771 1772 /* 1773 * Wrap LSUI instructions with uaccess_ttbr0_enable()/disable(), 1774 * as PAN toggling is not required. 1775 */ 1776 uaccess_ttbr0_enable(); 1777 1778 asm volatile(__LSUI_PREAMBLE 1779 "1: cast %[old], %[new], %[addr]\n" 1780 "2:\n" 1781 _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret]) 1782 : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret) 1783 : [new] "r" (new) 1784 : "memory"); 1785 1786 uaccess_ttbr0_disable(); 1787 1788 if (ret) 1789 return ret; 1790 if (tmp != old) 1791 return -EAGAIN; 1792 1793 return ret; 1794 } 1795 1796 static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new) 1797 { 1798 u64 tmp = old; 1799 int ret = 0; 1800 1801 uaccess_enable_privileged(); 1802 1803 asm volatile(__LSE_PREAMBLE 1804 "1: cas %[old], %[new], %[addr]\n" 1805 "2:\n" 1806 _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret]) 1807 : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret) 1808 : [new] "r" (new) 1809 : "memory"); 1810 1811 uaccess_disable_privileged(); 1812 1813 if (ret) 1814 return ret; 1815 if (tmp != old) 1816 return -EAGAIN; 1817 1818 return ret; 1819 } 1820 1821 static int __llsc_swap_desc(u64 __user *ptep, u64 old, u64 new) 1822 { 1823 int ret = 1; 1824 u64 tmp; 1825 1826 uaccess_enable_privileged(); 1827 1828 asm volatile("prfm pstl1strm, %[addr]\n" 1829 "1: ldxr %[tmp], %[addr]\n" 1830 "sub %[tmp], %[tmp], %[old]\n" 1831 "cbnz %[tmp], 3f\n" 1832 "2: stlxr %w[ret], %[new], %[addr]\n" 1833 "3:\n" 1834 _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret]) 1835 _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret]) 1836 : [ret] "+r" (ret), [addr] "+Q" (*ptep), [tmp] "=&r" (tmp) 1837 : [old] "r" (old), [new] "r" (new) 1838 : "memory"); 1839 1840 uaccess_disable_privileged(); 1841 1842 /* STLXR didn't update the descriptor, or the compare failed */ 1843 if (ret == 1) 1844 return -EAGAIN; 1845 1846 return ret; 1847 } 1848 1849 int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new) 1850 { 1851 struct kvm_memory_slot *slot; 1852 unsigned long hva; 1853 u64 __user *ptep; 1854 bool writable; 1855 int offset; 1856 gfn_t gfn; 1857 int r; 1858 1859 lockdep_assert(srcu_read_lock_held(&kvm->srcu)); 1860 1861 gfn = ipa >> PAGE_SHIFT; 1862 offset = offset_in_page(ipa); 1863 slot = gfn_to_memslot(kvm, gfn); 1864 hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); 1865 if (kvm_is_error_hva(hva)) 1866 return -EINVAL; 1867 if (!writable) 1868 return -EPERM; 1869 1870 ptep = (void __user *)hva + offset; 1871 if (cpus_have_final_cap(ARM64_HAS_LSUI)) 1872 r = __lsui_swap_desc(ptep, old, new); 1873 else if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS)) 1874 r = __lse_swap_desc(ptep, old, new); 1875 else 1876 r = __llsc_swap_desc(ptep, old, new); 1877 1878 if (r < 0) 1879 return r; 1880 1881 mark_page_dirty_in_slot(kvm, slot, gfn); 1882 return 0; 1883 } 1884