// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 - Linaro Ltd
 * Author: Jintack Lim <jintack.lim@linaro.org>
 */

#include <linux/kvm_host.h>

#include <asm/esr.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>

static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw)
{
	wr->fst = fst;
	wr->ptw = s1ptw;
	wr->s2 = s1ptw;
	wr->failed = true;
}

#define S1_MMU_DISABLED		(-127)

static int get_ia_size(struct s1_walk_info *wi)
{
	return 64 - wi->txsz;
}

/* Return true if the IPA is out of the OA range */
static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
{
	if (wi->pa52bit)
		return wi->max_oa_bits < 52 && (ipa & GENMASK_ULL(51, wi->max_oa_bits));
	return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
}

static bool has_52bit_pa(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, u64 tcr)
{
	switch (BIT(wi->pgshift)) {
	case SZ_64K:
	default:	/* IMPDEF: treat any other value as 64k */
		if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52))
			return false;
		return ((wi->regime == TR_EL2 ?
			 FIELD_GET(TCR_EL2_PS_MASK, tcr) :
			 FIELD_GET(TCR_IPS_MASK, tcr)) == 0b0110);
	case SZ_16K:
		if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT))
			return false;
		break;
	case SZ_4K:
		if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT))
			return false;
		break;
	}

	return (tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS));
}

static u64 desc_to_oa(struct s1_walk_info *wi, u64 desc)
{
	u64 addr;

	if (!wi->pa52bit)
		return desc & GENMASK_ULL(47, wi->pgshift);

	switch (BIT(wi->pgshift)) {
	case SZ_4K:
	case SZ_16K:
		addr = desc & GENMASK_ULL(49, wi->pgshift);
		addr |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, desc) << 50;
		break;
	case SZ_64K:
	default:	/* IMPDEF: treat any other value as 64k */
		addr = desc & GENMASK_ULL(47, wi->pgshift);
		addr |= FIELD_GET(KVM_PTE_ADDR_51_48, desc) << 48;
		break;
	}

	return addr;
}

/* Return the translation regime that applies to an AT instruction */
static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
{
	/*
	 * We only get here from guest EL2, so the translation
	 * regime AT applies to is solely defined by {E2H,TGE}.
	 */
	switch (op) {
	case OP_AT_S1E2R:
	case OP_AT_S1E2W:
	case OP_AT_S1E2A:
		return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
	default:
		return (vcpu_el2_e2h_is_set(vcpu) &&
			vcpu_el2_tge_is_set(vcpu)) ?
			TR_EL20 : TR_EL10;
	}
}

static u64 effective_tcr2(struct kvm_vcpu *vcpu, enum trans_regime regime)
{
	if (regime == TR_EL10) {
		if (vcpu_has_nv(vcpu) &&
		    !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En))
			return 0;

		return vcpu_read_sys_reg(vcpu, TCR2_EL1);
	}

	return vcpu_read_sys_reg(vcpu, TCR2_EL2);
}

static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
{
	if (!kvm_has_s1pie(vcpu->kvm))
		return false;

	/* Abuse TCR2_EL1_PIE and use it for EL2 as well */
	return effective_tcr2(vcpu, regime) & TCR2_EL1_PIE;
}

static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi)
{
	u64 val;

	if (!kvm_has_s1poe(vcpu->kvm)) {
		wi->poe = wi->e0poe = false;
		return;
	}

	val = effective_tcr2(vcpu, wi->regime);

	/* Abuse TCR2_EL1_* for EL2 */
	wi->poe = val & TCR2_EL1_POE;
	wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE);
}

static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
			 struct s1_walk_result *wr, u64 va)
{
	u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr;
	unsigned int stride, x;
	bool va55, tbi, lva;

	va55 = va & BIT(55);

	if (vcpu_has_nv(vcpu)) {
		hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
		wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));
	} else {
		WARN_ON_ONCE(wi->regime != TR_EL10);
		wi->s2 = false;
		hcr = 0;
	}

	switch (wi->regime) {
	case TR_EL10:
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
		tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
		ttbr = (va55 ?
			vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
			vcpu_read_sys_reg(vcpu, TTBR0_EL1));
		break;
	case TR_EL2:
	case TR_EL20:
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
		tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
		ttbr = (va55 ?
			vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
			vcpu_read_sys_reg(vcpu, TTBR0_EL2));
		break;
	default:
		BUG();
	}

	/* Someone was silly enough to encode TG0/TG1 differently */
	if (va55 && wi->regime != TR_EL2) {
		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
		tg = FIELD_GET(TCR_TG1_MASK, tcr);

		switch (tg << TCR_TG1_SHIFT) {
		case TCR_TG1_4K:
			wi->pgshift = 12; break;
		case TCR_TG1_16K:
			wi->pgshift = 14; break;
		case TCR_TG1_64K:
		default:	/* IMPDEF: treat any other value as 64k */
			wi->pgshift = 16; break;
		}
	} else {
		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
		tg = FIELD_GET(TCR_TG0_MASK, tcr);

		switch (tg << TCR_TG0_SHIFT) {
		case TCR_TG0_4K:
			wi->pgshift = 12; break;
		case TCR_TG0_16K:
			wi->pgshift = 14; break;
		case TCR_TG0_64K:
		default:	/* IMPDEF: treat any other value as 64k */
			wi->pgshift = 16; break;
		}
	}

	wi->pa52bit = has_52bit_pa(vcpu, wi, tcr);

	ia_bits = get_ia_size(wi);

	/* AArch64.S1StartLevel() */
	stride = wi->pgshift - 3;
	wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);

	if (wi->regime == TR_EL2 && va55)
		goto addrsz;

	tbi = (wi->regime == TR_EL2 ?
	       FIELD_GET(TCR_EL2_TBI, tcr) :
	       (va55 ?
		FIELD_GET(TCR_TBI1, tcr) :
		FIELD_GET(TCR_TBI0, tcr)));

	if (!tbi && (u64)sign_extend64(va, 55) != va)
		goto addrsz;

	wi->sh = (wi->regime == TR_EL2 ?
		  FIELD_GET(TCR_EL2_SH0_MASK, tcr) :
		  (va55 ?
		   FIELD_GET(TCR_SH1_MASK, tcr) :
		   FIELD_GET(TCR_SH0_MASK, tcr)));

	va = (u64)sign_extend64(va, 55);

	/* Let's put the MMU disabled case aside immediately */
	switch (wi->regime) {
	case TR_EL10:
		/*
		 * If dealing with the EL1&0 translation regime, 3 things
		 * can disable the S1 translation:
		 *
		 * - HCR_EL2.DC = 1
		 * - HCR_EL2.{E2H,TGE} = {0,1}
		 * - SCTLR_EL1.M = 0
		 *
		 * The TGE part is interesting. If we have decided that this
		 * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
		 * {0,x}, and we only need to test for TGE == 1.
		 */
		if (hcr & (HCR_DC | HCR_TGE)) {
			wr->level = S1_MMU_DISABLED;
			break;
		}
		fallthrough;
	case TR_EL2:
	case TR_EL20:
		if (!(sctlr & SCTLR_ELx_M))
			wr->level = S1_MMU_DISABLED;
		break;
	}

	if (wr->level == S1_MMU_DISABLED) {
		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
			goto addrsz;

		wr->pa = va;
		return 0;
	}

	wi->be = sctlr & SCTLR_ELx_EE;

	wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
	wi->hpd &= (wi->regime == TR_EL2 ?
		    FIELD_GET(TCR_EL2_HPD, tcr) :
		    (va55 ?
		     FIELD_GET(TCR_HPD1, tcr) :
		     FIELD_GET(TCR_HPD0, tcr)));
	/* R_JHSVW */
	wi->hpd |= s1pie_enabled(vcpu, wi->regime);

	/* Do we have POE? */
	compute_s1poe(vcpu, wi);

	/* R_BVXDG */
	wi->hpd |= (wi->poe || wi->e0poe);

	/* R_PLCGL, R_YXNYW */
	if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
		if (wi->txsz > 39)
			goto transfault;
	} else {
		if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47))
			goto transfault;
	}

	/* R_GTJBY, R_SXWGM */
	switch (BIT(wi->pgshift)) {
	case SZ_4K:
	case SZ_16K:
		lva = wi->pa52bit;
		break;
	case SZ_64K:
		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
		break;
	}

	if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
		goto transfault;

	/* R_YYVYV, I_THCZK */
	if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
	    (va55 && va < GENMASK(63, ia_bits)))
		goto transfault;

	/* I_ZFSYQ */
	if (wi->regime != TR_EL2 &&
	    (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
		goto transfault;

	/* R_BNDVG and following statements */
	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
	    wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0)))
		goto transfault;

	ps = (wi->regime == TR_EL2 ?
	      FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));

	wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps, wi->pa52bit));

	/* Compute minimal alignment */
	x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);

	wi->baddr = ttbr & TTBRx_EL1_BADDR;
	if (wi->pa52bit) {
		/*
		 * Force the alignment on 64 bytes for top-level tables
		 * smaller than 8 entries, since TTBR.BADDR[5:2] are used to
		 * store bits [51:48] of the first level of lookup.
		 */
		x = max(x, 6);

		wi->baddr |= FIELD_GET(GENMASK_ULL(5, 2), ttbr) << 48;
	}

	/* R_VPBBF */
	if (check_output_size(wi->baddr, wi))
		goto addrsz;

	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);

	wi->ha = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, AF);
	wi->ha &= (wi->regime == TR_EL2 ?
		   FIELD_GET(TCR_EL2_HA, tcr) :
		   FIELD_GET(TCR_HA, tcr));

	return 0;

addrsz:
	/*
	 * Address Size Fault level 0 to indicate it comes from TTBR.
	 * yes, this is an oddity.
	 */
	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false);
	return -EFAULT;

transfault:
	/* Translation Fault on start level */
	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(wi->sl), false);
	return -EFAULT;
}

static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc,
			    struct s1_walk_info *wi)
{
	u64 val;
	int r;

	r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val));
	if (r)
		return r;

	if (wi->be)
		*desc = be64_to_cpu((__force __be64)val);
	else
		*desc = le64_to_cpu((__force __le64)val);

	return 0;
}

static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new,
			    struct s1_walk_info *wi)
{
	if (wi->be) {
		old = (__force u64)cpu_to_be64(old);
		new = (__force u64)cpu_to_be64(new);
	} else {
		old = (__force u64)cpu_to_le64(old);
		new = (__force u64)cpu_to_le64(new);
	}

	return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
}

static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
		   struct s1_walk_result *wr, u64 va)
{
	u64 va_top, va_bottom, baddr, desc, new_desc, ipa;
	int level, stride, ret;

	level = wi->sl;
	stride = wi->pgshift - 3;
	baddr = wi->baddr;

	va_top = get_ia_size(wi) - 1;

	while (1) {
		u64 index;

		va_bottom = (3 - level) * stride + wi->pgshift;
		index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);

		ipa = baddr | index;

		if (wi->s2) {
			struct kvm_s2_trans s2_trans = {};

			ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
			if (ret) {
				fail_s1_walk(wr,
					     (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
					     true);
				return ret;
			}

			if (!kvm_s2_trans_readable(&s2_trans)) {
				fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
					     true);

				return -EPERM;
			}

			ipa = kvm_s2_trans_output(&s2_trans);
		}

		if (wi->filter) {
			ret = wi->filter->fn(&(struct s1_walk_context)
					     {
						     .wi = wi,
						     .table_ipa = baddr,
						     .level = level,
					     }, wi->filter->priv);
			if (ret)
				return ret;
		}

		ret = kvm_read_s1_desc(vcpu, ipa, &desc, wi);
		if (ret) {
			fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false);
			return ret;
		}

		new_desc = desc;

		/* Invalid descriptor */
		if (!(desc & BIT(0)))
			goto transfault;

		/* Block mapping, check validity down the line */
		if (!(desc & BIT(1)))
			break;

		/* Page mapping */
		if (level == 3)
			break;

		/* Table handling */
		if (!wi->hpd) {
			wr->APTable |= FIELD_GET(S1_TABLE_AP, desc);
			wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
			wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
		}

		baddr = desc_to_oa(wi, desc);

		/* Check for out-of-range OA */
		if (check_output_size(baddr, wi))
			goto addrsz;

		/* Prepare for next round */
		va_top = va_bottom - 1;
		level++;
	}

	/* Block mapping, check the validity of the level */
	if (!(desc & BIT(1))) {
		bool valid_block = false;

		switch (BIT(wi->pgshift)) {
		case SZ_4K:
			valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0);
			break;
		case SZ_16K:
		case SZ_64K:
			valid_block = level == 2 || (wi->pa52bit && level == 1);
			break;
		}

		if (!valid_block)
			goto transfault;
	}

	baddr = desc_to_oa(wi, desc);
	if (check_output_size(baddr & GENMASK(52, va_bottom), wi))
		goto addrsz;

	if (wi->ha)
		new_desc |= PTE_AF;

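	/*
	 * The only update we may have made to the descriptor is setting
	 * the Access flag. If so, write it back atomically; a concurrent
	 * update by the guest fails the swap and the walk is restarted.
	 */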
	if (new_desc != desc) {
		ret = kvm_swap_s1_desc(vcpu, ipa, desc, new_desc, wi);
		if (ret)
			return ret;

		desc = new_desc;
	}

	if (!(desc & PTE_AF)) {
		fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false);
		return -EACCES;
	}

	va_bottom += contiguous_bit_shift(desc, wi, level);

	wr->failed = false;
	wr->level = level;
	wr->desc = desc;
	wr->pa = baddr & GENMASK(52, va_bottom);
	wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);

	wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG);
	if (wr->nG) {
		u64 asid_ttbr, tcr;

		switch (wi->regime) {
		case TR_EL10:
			tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
			asid_ttbr = ((tcr & TCR_A1) ?
				     vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
				     vcpu_read_sys_reg(vcpu, TTBR0_EL1));
			break;
		case TR_EL20:
			tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
			asid_ttbr = ((tcr & TCR_A1) ?
				     vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
				     vcpu_read_sys_reg(vcpu, TTBR0_EL2));
			break;
		default:
			BUG();
		}

		wr->asid = FIELD_GET(TTBR_ASID_MASK, asid_ttbr);
		if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) ||
		    !(tcr & TCR_ASID16))
			wr->asid &= GENMASK(7, 0);
	}

	return 0;

addrsz:
	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), false);
	return -EINVAL;
transfault:
	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), false);
	return -ENOENT;
}

struct mmu_config {
	u64 ttbr0;
	u64 ttbr1;
	u64 tcr;
	u64 mair;
	u64 tcr2;
	u64 pir;
	u64 pire0;
	u64 por_el0;
	u64 por_el1;
	u64 sctlr;
	u64 vttbr;
	u64 vtcr;
};

static void __mmu_config_save(struct mmu_config *config)
{
	config->ttbr0 = read_sysreg_el1(SYS_TTBR0);
	config->ttbr1 = read_sysreg_el1(SYS_TTBR1);
	config->tcr = read_sysreg_el1(SYS_TCR);
	config->mair = read_sysreg_el1(SYS_MAIR);
	if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
		config->tcr2 = read_sysreg_el1(SYS_TCR2);
		if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
			config->pir = read_sysreg_el1(SYS_PIR);
			config->pire0 = read_sysreg_el1(SYS_PIRE0);
		}
		if (system_supports_poe()) {
			config->por_el1 = read_sysreg_el1(SYS_POR);
			config->por_el0 = read_sysreg_s(SYS_POR_EL0);
		}
	}
	config->sctlr = read_sysreg_el1(SYS_SCTLR);
	config->vttbr = read_sysreg(vttbr_el2);
	config->vtcr = read_sysreg(vtcr_el2);
}

static void __mmu_config_restore(struct mmu_config *config)
{
	/*
	 * ARM errata 1165522 and 1530923 require TGE to be 1 before
	 * we update the guest state.
	 */
	asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));

	write_sysreg_el1(config->ttbr0, SYS_TTBR0);
	write_sysreg_el1(config->ttbr1, SYS_TTBR1);
	write_sysreg_el1(config->tcr, SYS_TCR);
	write_sysreg_el1(config->mair, SYS_MAIR);
	if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
		write_sysreg_el1(config->tcr2, SYS_TCR2);
		if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
			write_sysreg_el1(config->pir, SYS_PIR);
			write_sysreg_el1(config->pire0, SYS_PIRE0);
		}
		if (system_supports_poe()) {
			write_sysreg_el1(config->por_el1, SYS_POR);
			write_sysreg_s(config->por_el0, SYS_POR_EL0);
		}
	}
	write_sysreg_el1(config->sctlr, SYS_SCTLR);
	write_sysreg(config->vttbr, vttbr_el2);
	write_sysreg(config->vtcr, vtcr_el2);
}

static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	u64 host_pan;
	bool fail;

	host_pan = read_sysreg_s(SYS_PSTATE_PAN);
	write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN);

	switch (op) {
	case OP_AT_S1E1RP:
		fail = __kvm_at(OP_AT_S1E1RP, vaddr);
		break;
	case OP_AT_S1E1WP:
		fail = __kvm_at(OP_AT_S1E1WP, vaddr);
		break;
	}

	write_sysreg_s(host_pan, SYS_PSTATE_PAN);

	return fail;
}

#define MEMATTR(ic, oc)		(MEMATTR_##oc << 4 | MEMATTR_##ic)
#define MEMATTR_NC		0b0100
#define MEMATTR_Wt		0b1000
#define MEMATTR_Wb		0b1100
#define MEMATTR_WbRaWa		0b1111

#define MEMATTR_IS_DEVICE(m)	(((m) & GENMASK(7, 4)) == 0)

static u8 s2_memattr_to_attr(u8 memattr)
{
	memattr &= 0b1111;

	switch (memattr) {
	case 0b0000:
	case 0b0001:
	case 0b0010:
	case 0b0011:
		return memattr << 2;
	case 0b0100:
		return MEMATTR(Wb, Wb);
	case 0b0101:
		return MEMATTR(NC, NC);
	case 0b0110:
		return MEMATTR(Wt, NC);
	case 0b0111:
		return MEMATTR(Wb, NC);
	case 0b1000:
		/* Reserved, assume NC */
		return MEMATTR(NC, NC);
	case 0b1001:
		return MEMATTR(NC, Wt);
	case 0b1010:
		return MEMATTR(Wt, Wt);
	case 0b1011:
		return MEMATTR(Wb, Wt);
	case 0b1100:
		/* Reserved, assume NC */
		return MEMATTR(NC, NC);
	case 0b1101:
		return MEMATTR(NC, Wb);
	case 0b1110:
		return MEMATTR(Wt, Wb);
	case 0b1111:
		return MEMATTR(Wb, Wb);
	default:
		unreachable();
	}
}

static u8 combine_s1_s2_attr(u8 s1, u8 s2)
{
	bool transient;
	u8 final = 0;

	/* Upgrade transient s1 to non-transient to simplify things */
	switch (s1) {
	case 0b0001 ... 0b0011:	/* Normal, Write-Through Transient */
		transient = true;
		s1 = MEMATTR_Wt | (s1 & GENMASK(1, 0));
		break;
	case 0b0101 ... 0b0111:	/* Normal, Write-Back Transient */
		transient = true;
		s1 = MEMATTR_Wb | (s1 & GENMASK(1, 0));
		break;
	default:
		transient = false;
	}

	/* S2CombineS1AttrHints() */
	if ((s1 & GENMASK(3, 2)) == MEMATTR_NC ||
	    (s2 & GENMASK(3, 2)) == MEMATTR_NC)
		final = MEMATTR_NC;
	else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt ||
		 (s2 & GENMASK(3, 2)) == MEMATTR_Wt)
		final = MEMATTR_Wt;
	else
		final = MEMATTR_Wb;

	if (final != MEMATTR_NC) {
		/* Inherit RaWa hints from S1 */
		if (transient) {
			switch (s1 & GENMASK(3, 2)) {
			case MEMATTR_Wt:
				final = 0;
				break;
			case MEMATTR_Wb:
				final = MEMATTR_NC;
				break;
			}
		}

		final |= s1 & GENMASK(1, 0);
	}

	return final;
}

#define ATTR_NSH	0b00
#define ATTR_RSV	0b01
#define ATTR_OSH	0b10
#define ATTR_ISH	0b11

static u8 compute_final_sh(u8 attr, u8 sh)
{
	/* Any form of device, as well as NC has SH[1:0]=0b10 */
	if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC))
		return ATTR_OSH;

	if (sh == ATTR_RSV)	/* Reserved, mapped to NSH */
		sh = ATTR_NSH;

	return sh;
}

static u8 compute_s1_sh(struct s1_walk_info *wi, struct s1_walk_result *wr,
			u8 attr)
{
	u8 sh;

	/*
	 * non-52bit and LPA have their basic shareability described in the
	 * descriptor. LPA2 gets it from the corresponding field in TCR,
	 * conveniently recorded in the walk info.
	 */
	if (!wi->pa52bit || BIT(wi->pgshift) == SZ_64K)
		sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_SH, wr->desc);
	else
		sh = wi->sh;

	return compute_final_sh(attr, sh);
}

static u8 combine_sh(u8 s1_sh, u8 s2_sh)
{
	if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
		return ATTR_OSH;
	if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH)
		return ATTR_ISH;

	return ATTR_NSH;
}

static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
			   struct kvm_s2_trans *tr)
{
	u8 s1_parattr, s2_memattr, final_attr, s2_sh;
	u64 par;

	/* If S2 has failed to translate, report the damage */
	if (tr->esr) {
		par = SYS_PAR_EL1_RES1;
		par |= SYS_PAR_EL1_F;
		par |= SYS_PAR_EL1_S;
		par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr);
		return par;
	}

	s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par);
	s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc);

	if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) {
		if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP))
			s2_memattr &= ~BIT(3);

		/* Combination of R_VRJSW and R_RHWZM */
		switch (s2_memattr) {
		case 0b0101:
			if (MEMATTR_IS_DEVICE(s1_parattr))
				final_attr = s1_parattr;
			else
				final_attr = MEMATTR(NC, NC);
			break;
		case 0b0110:
		case 0b1110:
			final_attr = MEMATTR(WbRaWa, WbRaWa);
			break;
		case 0b0111:
		case 0b1111:
			/* Preserve S1 attribute */
			final_attr = s1_parattr;
			break;
		case 0b0100:
		case 0b1100:
		case 0b1101:
			/* Reserved, do something non-silly */
			final_attr = s1_parattr;
			break;
		default:
			/*
			 * MemAttr[2]=0, Device from S2.
			 *
			 * FWB does not influence the way that stage 1
			 * memory types and attributes are combined
			 * with stage 2 Device type and attributes.
			 */
			final_attr = min(s2_memattr_to_attr(s2_memattr),
					 s1_parattr);
		}
	} else {
		/* Combination of R_HMNDG, R_TNHFM and R_GQFSF */
		u8 s2_parattr = s2_memattr_to_attr(s2_memattr);

		if (MEMATTR_IS_DEVICE(s1_parattr) ||
		    MEMATTR_IS_DEVICE(s2_parattr)) {
			final_attr = min(s1_parattr, s2_parattr);
		} else {
			/* At this stage, this is memory vs memory */
			final_attr = combine_s1_s2_attr(s1_parattr & 0xf,
							s2_parattr & 0xf);
			final_attr |= combine_s1_s2_attr(s1_parattr >> 4,
							 s2_parattr >> 4) << 4;
		}
	}

	if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) &&
	    !MEMATTR_IS_DEVICE(final_attr))
		final_attr = MEMATTR(NC, NC);

	s2_sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_SH, tr->desc);

	par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
	par |= tr->output & GENMASK(47, 12);
	par |= FIELD_PREP(SYS_PAR_EL1_SH,
			  combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
				     compute_final_sh(final_attr, s2_sh)));

	return par;
}

static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
			  struct s1_walk_result *wr)
{
	u64 par;

	if (wr->failed) {
		par = SYS_PAR_EL1_RES1;
		par |= SYS_PAR_EL1_F;
		par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
		par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
		par |= wr->s2 ? SYS_PAR_EL1_S : 0;
	} else if (wr->level == S1_MMU_DISABLED) {
		/* MMU off or HCR_EL2.DC == 1 */
		par = SYS_PAR_EL1_NSE;
		par |= wr->pa & SYS_PAR_EL1_PA;

		if (wi->regime == TR_EL10 && vcpu_has_nv(vcpu) &&
		    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
			par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
					  MEMATTR(WbRaWa, WbRaWa));
			par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH);
		} else {
			par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
			par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH);
		}
	} else {
		u64 mair, sctlr;
		u8 sh;

		par = SYS_PAR_EL1_NSE;

		mair = (wi->regime == TR_EL10 ?
			vcpu_read_sys_reg(vcpu, MAIR_EL1) :
			vcpu_read_sys_reg(vcpu, MAIR_EL2));

		mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
		mair &= 0xff;

		sctlr = (wi->regime == TR_EL10 ?
			 vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
			 vcpu_read_sys_reg(vcpu, SCTLR_EL2));

		/* Force NC for memory if SCTLR_ELx.C is clear */
		if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
			mair = MEMATTR(NC, NC);

		par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
		par |= wr->pa & SYS_PAR_EL1_PA;

		sh = compute_s1_sh(wi, wr, mair);
		par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
	}

	return par;
}

static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
{
	u64 sctlr;

	if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
		return false;

	if (s1pie_enabled(vcpu, regime))
		return true;

	if (regime == TR_EL10)
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
	else
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);

	return sctlr & SCTLR_EL1_EPAN;
}

static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu,
					  struct s1_walk_info *wi,
					  struct s1_walk_result *wr)
{
	bool wxn;

	/* Non-hierarchical part of AArch64.S1DirectBasePermissions() */
	if (wi->regime != TR_EL2) {
		switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) {
		case 0b00:
			wr->pr = wr->pw = true;
			wr->ur = wr->uw = false;
			break;
		case 0b01:
			wr->pr = wr->pw = wr->ur = wr->uw = true;
			break;
		case 0b10:
			wr->pr = true;
			wr->pw = wr->ur = wr->uw = false;
			break;
		case 0b11:
			wr->pr = wr->ur = true;
			wr->pw = wr->uw = false;
			break;
		}

		/* We don't use px for anything yet, but hey... */
		wr->px = !((wr->desc & PTE_PXN) || wr->uw);
		wr->ux = !(wr->desc & PTE_UXN);
	} else {
		wr->ur = wr->uw = wr->ux = false;

		if (!(wr->desc & PTE_RDONLY)) {
			wr->pr = wr->pw = true;
		} else {
			wr->pr = true;
			wr->pw = false;
		}

		/* XN maps to UXN */
		wr->px = !(wr->desc & PTE_UXN);
	}

	switch (wi->regime) {
	case TR_EL2:
	case TR_EL20:
		wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN);
		break;
	case TR_EL10:
		wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN);
		break;
	}

	wr->pwxn = wr->uwxn = wxn;
	wr->pov = wi->poe;
	wr->uov = wi->e0poe;
}

static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu,
						struct s1_walk_info *wi,
						struct s1_walk_result *wr)
{
	/* Hierarchical part of AArch64.S1DirectBasePermissions() */
	if (wi->regime != TR_EL2) {
		switch (wr->APTable) {
		case 0b00:
			break;
		case 0b01:
			wr->ur = wr->uw = false;
			break;
		case 0b10:
			wr->pw = wr->uw = false;
			break;
		case 0b11:
			wr->pw = wr->ur = wr->uw = false;
			break;
		}

		wr->px &= !wr->PXNTable;
		wr->ux &= !wr->UXNTable;
	} else {
		if (wr->APTable & BIT(1))
			wr->pw = false;

		/* XN maps to UXN */
		wr->px &= !wr->UXNTable;
	}
}

#define perm_idx(v, r, i)	((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf)

#define set_priv_perms(wr, r, w, x)	\
	do {				\
		(wr)->pr = (r);		\
		(wr)->pw = (w);		\
		(wr)->px = (x);		\
	} while (0)

#define set_unpriv_perms(wr, r, w, x)	\
	do {				\
		(wr)->ur = (r);		\
		(wr)->uw = (w);		\
		(wr)->ux = (x);		\
	} while (0)

#define set_priv_wxn(wr, v)		\
	do {				\
		(wr)->pwxn = (v);	\
	} while (0)

#define set_unpriv_wxn(wr, v)		\
	do {				\
		(wr)->uwxn = (v);	\
	} while (0)

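/*
 * Decode a 4-bit permission index (looked up in PIR_ELx/PIRE0_ELx via
 * perm_idx()) into the {R,W,X} bits of either the privileged or the
 * unprivileged side of the walk result: 'w' must be 'priv' or 'unpriv'
 * so that token pasting selects the helpers above.
 */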
/* Similar to AArch64.S1IndirectBasePermissions(), without GCS */
#define set_perms(w, wr, ip)						\
	do {								\
		/* R_LLZDZ */						\
		switch ((ip)) {						\
		case 0b0000:						\
			set_ ## w ## _perms((wr), false, false, false);	\
			break;						\
		case 0b0001:						\
			set_ ## w ## _perms((wr), true , false, false);	\
			break;						\
		case 0b0010:						\
			set_ ## w ## _perms((wr), false, false, true );	\
			break;						\
		case 0b0011:						\
			set_ ## w ## _perms((wr), true , false, true );	\
			break;						\
		case 0b0100:						\
			set_ ## w ## _perms((wr), false, false, false);	\
			break;						\
		case 0b0101:						\
			set_ ## w ## _perms((wr), true , true , false);	\
			break;						\
		case 0b0110:						\
			set_ ## w ## _perms((wr), true , true , true );	\
			break;						\
		case 0b0111:						\
			set_ ## w ## _perms((wr), true , true , true );	\
			break;						\
		case 0b1000:						\
			set_ ## w ## _perms((wr), true , false, false);	\
			break;						\
		case 0b1001:						\
			set_ ## w ## _perms((wr), true , false, false);	\
			break;						\
		case 0b1010:						\
			set_ ## w ## _perms((wr), true , false, true );	\
			break;						\
		case 0b1011:						\
			set_ ## w ## _perms((wr), false, false, false);	\
			break;						\
		case 0b1100:						\
			set_ ## w ## _perms((wr), true , true , false);	\
			break;						\
		case 0b1101:						\
			set_ ## w ## _perms((wr), false, false, false);	\
			break;						\
		case 0b1110:						\
			set_ ## w ## _perms((wr), true , true , true );	\
			break;						\
		case 0b1111:						\
			set_ ## w ## _perms((wr), false, false, false);	\
			break;						\
		}							\
									\
		/* R_HJYGR */						\
		set_ ## w ## _wxn((wr), ((ip) == 0b0110));		\
									\
	} while (0)

static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu,
					    struct s1_walk_info *wi,
					    struct s1_walk_result *wr)
{
	u8 up, pp, idx;

	idx = pte_pi_index(wr->desc);

	switch (wi->regime) {
	case TR_EL10:
		pp = perm_idx(vcpu, PIR_EL1, idx);
		up = perm_idx(vcpu, PIRE0_EL1, idx);
		break;
	case TR_EL20:
		pp = perm_idx(vcpu, PIR_EL2, idx);
		up = perm_idx(vcpu, PIRE0_EL2, idx);
		break;
	case TR_EL2:
		pp = perm_idx(vcpu, PIR_EL2, idx);
		up = 0;
		break;
	}

	set_perms(priv, wr, pp);

	if (wi->regime != TR_EL2)
		set_perms(unpriv, wr, up);
	else
		set_unpriv_perms(wr, false, false, false);

	wr->pov = wi->poe && !(pp & BIT(3));
	wr->uov = wi->e0poe && !(up & BIT(3));

	/* R_VFPJF */
	if (wr->px && wr->uw) {
		set_priv_perms(wr, false, false, false);
		set_unpriv_perms(wr, false, false, false);
	}
}

static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu,
					   struct s1_walk_info *wi,
					   struct s1_walk_result *wr)
{
	u8 idx, pov_perms, uov_perms;

	idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc);

	if (wr->pov) {
		switch (wi->regime) {
		case TR_EL10:
			pov_perms = perm_idx(vcpu, POR_EL1, idx);
			break;
		case TR_EL20:
			pov_perms = perm_idx(vcpu, POR_EL2, idx);
			break;
		case TR_EL2:
			pov_perms = perm_idx(vcpu, POR_EL2, idx);
			break;
		}

		if (pov_perms & ~POE_RWX)
			pov_perms = POE_NONE;

		/* R_QXXPC, S1PrivOverlay enabled */
		if (wr->pwxn && (pov_perms & POE_X))
			pov_perms &= ~POE_W;

		wr->pr &= pov_perms & POE_R;
		wr->pw &= pov_perms & POE_W;
		wr->px &= pov_perms & POE_X;
	}

	if (wr->uov) {
		switch (wi->regime) {
		case TR_EL10:
			uov_perms = perm_idx(vcpu, POR_EL0, idx);
			break;
		case TR_EL20:
			uov_perms = perm_idx(vcpu, POR_EL0, idx);
			break;
		case TR_EL2:
			uov_perms = 0;
			break;
		}

		if (uov_perms & ~POE_RWX)
			uov_perms = POE_NONE;

		/* R_NPBXC, S1UnprivOverlay enabled */
		if (wr->uwxn && (uov_perms & POE_X))
			uov_perms &= ~POE_W;

		wr->ur &= uov_perms & POE_R;
		wr->uw &= uov_perms & POE_W;
		wr->ux &= uov_perms & POE_X;
	}
}

static void compute_s1_permissions(struct kvm_vcpu *vcpu,
				   struct s1_walk_info *wi,
				   struct s1_walk_result *wr)
{
	bool pan;

	if (!s1pie_enabled(vcpu, wi->regime))
		compute_s1_direct_permissions(vcpu, wi, wr);
	else
		compute_s1_indirect_permissions(vcpu, wi, wr);

	if (!wi->hpd)
		compute_s1_hierarchical_permissions(vcpu, wi, wr);

	compute_s1_overlay_permissions(vcpu, wi, wr);

	/* R_QXXPC, S1PrivOverlay disabled */
	if (!wr->pov)
		wr->px &= !(wr->pwxn && wr->pw);

	/* R_NPBXC, S1UnprivOverlay disabled */
	if (!wr->uov)
		wr->ux &= !(wr->uwxn && wr->uw);

	pan = wi->pan && (wr->ur || wr->uw ||
			  (pan3_enabled(vcpu, wi->regime) && wr->ux));
	wr->pw &= !pan;
	wr->pr &= !pan;
}

static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par)
{
	struct s1_walk_result wr = {};
	struct s1_walk_info wi = {};
	bool perm_fail = false;
	int ret, idx;

	wi.regime = compute_translation_regime(vcpu, op);
	wi.as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
	wi.pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) &&
		 (*vcpu_cpsr(vcpu) & PSR_PAN_BIT);

	ret = setup_s1_walk(vcpu, &wi, &wr, vaddr);
	if (ret)
		goto compute_par;

	if (wr.level == S1_MMU_DISABLED)
		goto compute_par;

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	ret = walk_s1(vcpu, &wi, &wr, vaddr);

	srcu_read_unlock(&vcpu->kvm->srcu, idx);

	/*
	 * Race to update a descriptor -- restart the walk.
	 */
	if (ret == -EAGAIN)
		return ret;
	if (ret)
		goto compute_par;

	compute_s1_permissions(vcpu, &wi, &wr);

	switch (op) {
	case OP_AT_S1E1RP:
	case OP_AT_S1E1R:
	case OP_AT_S1E2R:
		perm_fail = !wr.pr;
		break;
	case OP_AT_S1E1WP:
	case OP_AT_S1E1W:
	case OP_AT_S1E2W:
		perm_fail = !wr.pw;
		break;
	case OP_AT_S1E0R:
		perm_fail = !wr.ur;
		break;
	case OP_AT_S1E0W:
		perm_fail = !wr.uw;
		break;
	case OP_AT_S1E1A:
	case OP_AT_S1E2A:
		break;
	default:
		BUG();
	}

	if (perm_fail)
		fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false);

compute_par:
	*par = compute_par_s1(vcpu, &wi, &wr);
	return 0;
}

/*
 * Return the PAR_EL1 value as the result of a valid translation.
 *
 * If the translation is unsuccessful, the value may only contain
 * PAR_EL1.F, and cannot be taken at face value. It isn't an
 * indication of the translation having failed, only that the fast
 * path did not succeed, *unless* it indicates a S1 permission or
 * access fault.
 */
static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	struct mmu_config config;
	struct kvm_s2_mmu *mmu;
	bool fail, mmu_cs;
	u64 par;

	par = SYS_PAR_EL1_F;

	/*
	 * We've trapped, so everything is live on the CPU. As we will
	 * be switching contexts behind everybody's back, disable
	 * interrupts while holding the mmu lock.
	 */
	guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock);

	/*
	 * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already
	 * the right one (as we trapped from vEL2). If not, save the
	 * full MMU context.
	 *
	 * We are also guaranteed to be in the correct context if
	 * we're not in a nested VM.
	 */
	mmu_cs = (vcpu_has_nv(vcpu) &&
		  !(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)));
	if (!mmu_cs)
		goto skip_mmu_switch;

	/*
	 * Obtaining the S2 MMU for an L2 is horribly racy, and we may not
	 * find it (recycled by another vcpu, for example). When this
	 * happens, admit defeat immediately and use the SW (slow) path.
	 */
	mmu = lookup_s2_mmu(vcpu);
	if (!mmu)
		return par;

	__mmu_config_save(&config);

	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0);
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1);
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR);
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR);
	if (kvm_has_tcr2(vcpu->kvm)) {
		write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2);
		if (kvm_has_s1pie(vcpu->kvm)) {
			write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR);
			write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0);
		}
		if (kvm_has_s1poe(vcpu->kvm)) {
			write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR);
			write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0);
		}
	}
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR);
	__load_stage2(mmu, mmu->arch);

skip_mmu_switch:
	/* Temporarily switch back to guest context */
	write_sysreg_hcr(vcpu->arch.hcr_el2);
	isb();

	switch (op) {
	case OP_AT_S1E1RP:
	case OP_AT_S1E1WP:
		fail = at_s1e1p_fast(vcpu, op, vaddr);
		break;
	case OP_AT_S1E1R:
		fail = __kvm_at(OP_AT_S1E1R, vaddr);
		break;
	case OP_AT_S1E1W:
		fail = __kvm_at(OP_AT_S1E1W, vaddr);
		break;
	case OP_AT_S1E0R:
		fail = __kvm_at(OP_AT_S1E0R, vaddr);
		break;
	case OP_AT_S1E0W:
		fail = __kvm_at(OP_AT_S1E0W, vaddr);
		break;
	case OP_AT_S1E1A:
		fail = __kvm_at(OP_AT_S1E1A, vaddr);
		break;
	default:
		WARN_ON_ONCE(1);
		fail = true;
		break;
	}

	if (!fail)
		par = read_sysreg_par();

	write_sysreg_hcr(HCR_HOST_VHE_FLAGS);

	if (mmu_cs)
		__mmu_config_restore(&config);

	return par;
}

static bool par_check_s1_perm_fault(u64 par)
{
	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);

	return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
		!(par & SYS_PAR_EL1_S));
}

static bool par_check_s1_access_fault(u64 par)
{
	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);

	return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS &&
		!(par & SYS_PAR_EL1_S));
}

int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
	int ret;

	/*
	 * If PAR_EL1 reports that AT failed on a S1 permission or access
	 * fault, we know for sure that the PTW was able to walk the S1
	 * tables and there's nothing else to do.
	 *
	 * If AT failed for any other reason, then we must walk the guest S1
	 * to emulate the instruction.
	 */
	if ((par & SYS_PAR_EL1_F) &&
	    !par_check_s1_perm_fault(par) &&
	    !par_check_s1_access_fault(par)) {
		ret = handle_at_slow(vcpu, op, vaddr, &par);
		if (ret)
			return ret;
	}

	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
	return 0;
}

int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	u64 par;
	int ret;

	/*
	 * We've trapped, so everything is live on the CPU. As we will be
	 * switching context behind everybody's back, disable interrupts...
	 */
	scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) {
		u64 val, hcr;
		bool fail;

		val = hcr = read_sysreg(hcr_el2);
		val &= ~HCR_TGE;
		val |= HCR_VM;

		if (!vcpu_el2_e2h_is_set(vcpu))
			val |= HCR_NV | HCR_NV1;

		write_sysreg_hcr(val);
		isb();

		par = SYS_PAR_EL1_F;

		switch (op) {
		case OP_AT_S1E2R:
			fail = __kvm_at(OP_AT_S1E1R, vaddr);
			break;
		case OP_AT_S1E2W:
			fail = __kvm_at(OP_AT_S1E1W, vaddr);
			break;
		case OP_AT_S1E2A:
			fail = __kvm_at(OP_AT_S1E1A, vaddr);
			break;
		default:
			WARN_ON_ONCE(1);
			fail = true;
		}

		isb();

		if (!fail)
			par = read_sysreg_par();

		write_sysreg_hcr(hcr);
		isb();
	}

	/* We failed the translation, let's replay it in slow motion */
	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) {
		ret = handle_at_slow(vcpu, op, vaddr, &par);
		if (ret)
			return ret;
	}

	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
	return 0;
}

int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	struct kvm_s2_trans out = {};
	u64 ipa, par;
	bool write;
	int ret;

	/* Do the stage-1 translation */
	switch (op) {
	case OP_AT_S12E1R:
		op = OP_AT_S1E1R;
		write = false;
		break;
	case OP_AT_S12E1W:
		op = OP_AT_S1E1W;
		write = true;
		break;
	case OP_AT_S12E0R:
		op = OP_AT_S1E0R;
		write = false;
		break;
	case OP_AT_S12E0W:
		op = OP_AT_S1E0W;
		write = true;
		break;
	default:
		WARN_ON_ONCE(1);
		return 0;
	}

	__kvm_at_s1e01(vcpu, op, vaddr);
	par = vcpu_read_sys_reg(vcpu, PAR_EL1);
	if (par & SYS_PAR_EL1_F)
		return 0;

	/*
	 * If we only have a single stage of translation (EL2&0), exit
	 * early. Same thing if {VM,DC}=={0,0}.
	 */
	if (compute_translation_regime(vcpu, op) == TR_EL20 ||
	    !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
		return 0;

	/* Do the stage-2 translation */
	ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
	out.esr = 0;
	ret = kvm_walk_nested_s2(vcpu, ipa, &out);
	if (ret < 0)
		return ret;

	/* Check the access permission */
	if (!out.esr &&
	    ((!write && !out.readable) || (write && !out.writable)))
		out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);

	par = compute_par_s12(vcpu, par, &out);
	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
	return 0;
}

/*
 * Translate a VA for a given EL in a given translation regime, with
 * or without PAN. This requires wi->{regime, as_el0, pan} to be
 * set. The rest of the wi and wr should be 0-initialised.
 */
int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
		       struct s1_walk_result *wr, u64 va)
{
	int ret;

	ret = setup_s1_walk(vcpu, wi, wr, va);
	if (ret)
		return ret;

	if (wr->level == S1_MMU_DISABLED) {
		wr->ur = wr->uw = wr->ux = true;
		wr->pr = wr->pw = wr->px = true;
	} else {
		ret = walk_s1(vcpu, wi, wr, va);
		if (ret)
			return ret;

		compute_s1_permissions(vcpu, wi, wr);
	}

	return 0;
}

struct desc_match {
	u64 ipa;
	int level;
};

static int match_s1_desc(struct s1_walk_context *ctxt, void *priv)
{
	struct desc_match *dm = priv;
	u64 ipa = dm->ipa;

	/* Use S1 granule alignment */
	ipa &= GENMASK(51, ctxt->wi->pgshift);

	/* Not the IPA we're looking for? Continue. */
	if (ipa != ctxt->table_ipa)
		return 0;

	/* Note the level and interrupt the walk */
	dm->level = ctxt->level;
	return -EINTR;
}

int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level)
{
	struct desc_match dm = {
		.ipa = ipa,
	};
	struct s1_walk_info wi = {
		.filter = &(struct s1_walk_filter){
			.fn = match_s1_desc,
			.priv = &dm,
		},
		.as_el0 = false,
		.pan = false,
	};
	struct s1_walk_result wr = {};
	int ret;

	if (is_hyp_ctxt(vcpu))
		wi.regime = vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
	else
		wi.regime = TR_EL10;

	ret = setup_s1_walk(vcpu, &wi, &wr, va);
	if (ret)
		return ret;

	/* We really expect the S1 MMU to be on here... */
	if (WARN_ON_ONCE(wr.level == S1_MMU_DISABLED)) {
		*level = 0;
		return 0;
	}

	/* Walk the guest's PT, looking for a match along the way */
	ret = walk_s1(vcpu, &wi, &wr, va);
	switch (ret) {
	case -EINTR:
		/* We interrupted the walk on a match, return the level */
		*level = dm.level;
		return 0;
	case 0:
		/* The walk completed, we failed to find the entry */
		return -ENOENT;
	default:
		/* Any other error... */
		return ret;
	}
}

#ifdef CONFIG_ARM64_LSE_ATOMICS
static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new)
{
	u64 tmp = old;
	int ret = 0;

	uaccess_enable_privileged();

	asm volatile(__LSE_PREAMBLE
		     "1:	cas	%[old], %[new], %[addr]\n"
		     "2:\n"
		     _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret])
		     : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret)
		     : [new] "r" (new)
		     : "memory");

	uaccess_disable_privileged();

	if (ret)
		return ret;
	if (tmp != old)
		return -EAGAIN;

	return ret;
}
#else
static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new)
{
	return -EINVAL;
}
#endif

static int __llsc_swap_desc(u64 __user *ptep, u64 old, u64 new)
{
	int ret = 1;
	u64 tmp;

	uaccess_enable_privileged();

	asm volatile("prfm	pstl1strm, %[addr]\n"
		     "1:	ldxr	%[tmp], %[addr]\n"
		     "	sub	%[tmp], %[tmp], %[old]\n"
		     "	cbnz	%[tmp], 3f\n"
		     "2:	stlxr	%w[ret], %[new], %[addr]\n"
		     "3:\n"
		     _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret])
		     _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret])
		     : [ret] "+r" (ret), [addr] "+Q" (*ptep), [tmp] "=&r" (tmp)
		     : [old] "r" (old), [new] "r" (new)
		     : "memory");

	uaccess_disable_privileged();

	/* STLXR didn't update the descriptor, or the compare failed */
	if (ret == 1)
		return -EAGAIN;

	return ret;
}

int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new)
{
	struct kvm_memory_slot *slot;
	unsigned long hva;
	u64 __user *ptep;
	bool writable;
	int offset;
	gfn_t gfn;
	int r;

	lockdep_assert(srcu_read_lock_held(&kvm->srcu));

	gfn = ipa >> PAGE_SHIFT;
	offset = offset_in_page(ipa);
	slot = gfn_to_memslot(kvm, gfn);
	hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
	if (kvm_is_error_hva(hva))
		return -EINVAL;
	if (!writable)
		return -EPERM;

	/* 'offset' is a byte offset into the page, so add it before casting */
	ptep = (u64 __user *)(hva + offset);
	if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS))
		r = __lse_swap_desc(ptep, old, new);
	else
		r = __llsc_swap_desc(ptep, old, new);

	if (r < 0)
		return r;

	mark_page_dirty_in_slot(kvm, slot, gfn);
	return 0;
}