// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 - Linaro Ltd
 * Author: Jintack Lim <jintack.lim@linaro.org>
 */

#include <linux/kvm_host.h>

#include <asm/esr.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>

static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw)
{
	wr->fst = fst;
	wr->ptw = s1ptw;
	wr->s2 = s1ptw;
	wr->failed = true;
}

#define S1_MMU_DISABLED	(-127)

static int get_ia_size(struct s1_walk_info *wi)
{
	return 64 - wi->txsz;
}

/* Return true if the IPA is out of the OA range */
static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
{
	if (wi->pa52bit)
		return wi->max_oa_bits < 52 && (ipa & GENMASK_ULL(51, wi->max_oa_bits));
	return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
}

static bool has_52bit_pa(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, u64 tcr)
{
	switch (BIT(wi->pgshift)) {
	case SZ_64K:
	default:	/* IMPDEF: treat any other value as 64k */
		if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52))
			return false;
		return ((wi->regime == TR_EL2 ?
			 FIELD_GET(TCR_EL2_PS_MASK, tcr) :
			 FIELD_GET(TCR_IPS_MASK, tcr)) == 0b0110);
	case SZ_16K:
		if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT))
			return false;
		break;
	case SZ_4K:
		if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT))
			return false;
		break;
	}

	return (tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS));
}

static u64 desc_to_oa(struct s1_walk_info *wi, u64 desc)
{
	u64 addr;

	if (!wi->pa52bit)
		return desc & GENMASK_ULL(47, wi->pgshift);

	switch (BIT(wi->pgshift)) {
	case SZ_4K:
	case SZ_16K:
		addr = desc & GENMASK_ULL(49, wi->pgshift);
		addr |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, desc) << 50;
		break;
	case SZ_64K:
	default:	/* IMPDEF: treat any other value as 64k */
		addr = desc & GENMASK_ULL(47, wi->pgshift);
		addr |= FIELD_GET(KVM_PTE_ADDR_51_48, desc) << 48;
		break;
	}

	return addr;
}

/* Return the translation regime that applies to an AT instruction */
static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
{
	/*
	 * We only get here from guest EL2, so the translation
	 * regime AT applies to is solely defined by {E2H,TGE}.
	 */
	switch (op) {
	case OP_AT_S1E2R:
	case OP_AT_S1E2W:
	case OP_AT_S1E2A:
		return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
	default:
		return (vcpu_el2_e2h_is_set(vcpu) &&
			vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
	}
}

static u64 effective_tcr2(struct kvm_vcpu *vcpu, enum trans_regime regime)
{
	if (regime == TR_EL10) {
		if (vcpu_has_nv(vcpu) &&
		    !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En))
			return 0;

		return vcpu_read_sys_reg(vcpu, TCR2_EL1);
	}

	return vcpu_read_sys_reg(vcpu, TCR2_EL2);
}

static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
{
	if (!kvm_has_s1pie(vcpu->kvm))
		return false;

	/* Abuse TCR2_EL1_PIE and use it for EL2 as well */
	return effective_tcr2(vcpu, regime) & TCR2_EL1_PIE;
}

static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi)
{
	u64 val;

	if (!kvm_has_s1poe(vcpu->kvm)) {
		wi->poe = wi->e0poe = false;
		return;
	}

	val = effective_tcr2(vcpu, wi->regime);

	/* Abuse TCR2_EL1_* for EL2 */
	wi->poe = val & TCR2_EL1_POE;
	wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE);
}

static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
			 struct s1_walk_result *wr, u64 va)
{
	u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr;
	unsigned int stride, x;
	bool va55, tbi, lva;

	va55 = va & BIT(55);

	if (vcpu_has_nv(vcpu)) {
		hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
		wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));
	} else {
		WARN_ON_ONCE(wi->regime != TR_EL10);
		wi->s2 = false;
		hcr = 0;
	}

	switch (wi->regime) {
	case TR_EL10:
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
		tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
		ttbr = (va55 ?
			vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
			vcpu_read_sys_reg(vcpu, TTBR0_EL1));
		break;
	case TR_EL2:
	case TR_EL20:
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
		tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
		ttbr = (va55 ?
			vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
			vcpu_read_sys_reg(vcpu, TTBR0_EL2));
		break;
	default:
		BUG();
	}

	/* Someone was silly enough to encode TG0/TG1 differently */
	if (va55 && wi->regime != TR_EL2) {
		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
		tg = FIELD_GET(TCR_TG1_MASK, tcr);

		switch (tg << TCR_TG1_SHIFT) {
		case TCR_TG1_4K:
			wi->pgshift = 12; break;
		case TCR_TG1_16K:
			wi->pgshift = 14; break;
		case TCR_TG1_64K:
		default:	/* IMPDEF: treat any other value as 64k */
			wi->pgshift = 16; break;
		}
	} else {
		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
		tg = FIELD_GET(TCR_TG0_MASK, tcr);

		switch (tg << TCR_TG0_SHIFT) {
		case TCR_TG0_4K:
			wi->pgshift = 12; break;
		case TCR_TG0_16K:
			wi->pgshift = 14; break;
		case TCR_TG0_64K:
		default:	/* IMPDEF: treat any other value as 64k */
			wi->pgshift = 16; break;
		}
	}

	wi->pa52bit = has_52bit_pa(vcpu, wi, tcr);

	ia_bits = get_ia_size(wi);

	/* AArch64.S1StartLevel() */
	stride = wi->pgshift - 3;
	wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);

	if (wi->regime == TR_EL2 && va55)
		goto addrsz;

	tbi = (wi->regime == TR_EL2 ?
	       FIELD_GET(TCR_EL2_TBI, tcr) :
	       (va55 ?
		FIELD_GET(TCR_TBI1, tcr) :
		FIELD_GET(TCR_TBI0, tcr)));

	if (!tbi && (u64)sign_extend64(va, 55) != va)
		goto addrsz;

	wi->sh = (wi->regime == TR_EL2 ?
		  FIELD_GET(TCR_EL2_SH0_MASK, tcr) :
		  (va55 ?
		   FIELD_GET(TCR_SH1_MASK, tcr) :
		   FIELD_GET(TCR_SH0_MASK, tcr)));
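
	/* Having dealt with TBI, work on the canonical (sign-extended) VA from now on */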
	va = (u64)sign_extend64(va, 55);

	/* Let's put the MMU disabled case aside immediately */
	switch (wi->regime) {
	case TR_EL10:
		/*
		 * If dealing with the EL1&0 translation regime, 3 things
		 * can disable the S1 translation:
		 *
		 * - HCR_EL2.DC = 1
		 * - HCR_EL2.{E2H,TGE} = {0,1}
		 * - SCTLR_EL1.M = 0
		 *
		 * The TGE part is interesting. If we have decided that this
		 * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
		 * {0,x}, and we only need to test for TGE == 1.
		 */
		if (hcr & (HCR_DC | HCR_TGE)) {
			wr->level = S1_MMU_DISABLED;
			break;
		}
		fallthrough;
	case TR_EL2:
	case TR_EL20:
		if (!(sctlr & SCTLR_ELx_M))
			wr->level = S1_MMU_DISABLED;
		break;
	}

	if (wr->level == S1_MMU_DISABLED) {
		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
			goto addrsz;

		wr->pa = va;
		return 0;
	}

	wi->be = sctlr & SCTLR_ELx_EE;

	wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
	wi->hpd &= (wi->regime == TR_EL2 ?
		    FIELD_GET(TCR_EL2_HPD, tcr) :
		    (va55 ?
		     FIELD_GET(TCR_HPD1, tcr) :
		     FIELD_GET(TCR_HPD0, tcr)));
	/* R_JHSVW */
	wi->hpd |= s1pie_enabled(vcpu, wi->regime);

	/* Do we have POE? */
	compute_s1poe(vcpu, wi);

	/* R_BVXDG */
	wi->hpd |= (wi->poe || wi->e0poe);

	/* R_PLCGL, R_YXNYW */
	if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
		if (wi->txsz > 39)
			goto transfault;
	} else {
		if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47))
			goto transfault;
	}

	/* R_GTJBY, R_SXWGM */
	switch (BIT(wi->pgshift)) {
	case SZ_4K:
	case SZ_16K:
		lva = wi->pa52bit;
		break;
	case SZ_64K:
		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
		break;
	}

	if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
		goto transfault;

	/* R_YYVYV, I_THCZK */
	if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
	    (va55 && va < GENMASK(63, ia_bits)))
		goto transfault;

	/* I_ZFSYQ */
	if (wi->regime != TR_EL2 &&
	    (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
		goto transfault;

	/* R_BNDVG and following statements */
	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
	    wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0)))
		goto transfault;

	ps = (wi->regime == TR_EL2 ?
	      FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));

	wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps, wi->pa52bit));

	/* Compute minimal alignment */
	x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);

	wi->baddr = ttbr & TTBRx_EL1_BADDR;
	if (wi->pa52bit) {
		/*
		 * Force the alignment on 64 bytes for top-level tables
		 * smaller than 8 entries, since TTBR.BADDR[5:2] are used to
		 * store bits [51:48] of the first level of lookup.
		 */
		x = max(x, 6);

		wi->baddr |= FIELD_GET(GENMASK_ULL(5, 2), ttbr) << 48;
	}

	/* R_VPBBF */
	if (check_output_size(wi->baddr, wi))
		goto addrsz;

	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);

	wi->ha = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, AF);
	wi->ha &= (wi->regime == TR_EL2 ?
		   FIELD_GET(TCR_EL2_HA, tcr) :
		   FIELD_GET(TCR_HA, tcr));

	return 0;

addrsz:
	/*
	 * Address Size Fault level 0 to indicate it comes from TTBR.
	 * yes, this is an oddity.
	 */
	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false);
	return -EFAULT;

transfault:
	/* Translation Fault on start level */
	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(wi->sl), false);
	return -EFAULT;
}

static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc,
			    struct s1_walk_info *wi)
{
	u64 val;
	int r;

	r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val));
	if (r)
		return r;

	if (wi->be)
		*desc = be64_to_cpu((__force __be64)val);
	else
		*desc = le64_to_cpu((__force __le64)val);

	return 0;
}

static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new,
			    struct s1_walk_info *wi)
{
	if (wi->be) {
		old = (__force u64)cpu_to_be64(old);
		new = (__force u64)cpu_to_be64(new);
	} else {
		old = (__force u64)cpu_to_le64(old);
		new = (__force u64)cpu_to_le64(new);
	}

	return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
}

static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
		   struct s1_walk_result *wr, u64 va)
{
	u64 va_top, va_bottom, baddr, desc, new_desc, ipa;
	struct kvm_s2_trans s2_trans = {};
	int level, stride, ret;

	level = wi->sl;
	stride = wi->pgshift - 3;
	baddr = wi->baddr;

	va_top = get_ia_size(wi) - 1;

	while (1) {
		u64 index;

		va_bottom = (3 - level) * stride + wi->pgshift;
		index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);

		ipa = baddr | index;

		if (wi->s2) {
			ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
			if (ret) {
				fail_s1_walk(wr,
					     (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
					     true);
				return ret;
			}

			if (!kvm_s2_trans_readable(&s2_trans)) {
				fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
					     true);

				return -EPERM;
			}

			ipa = kvm_s2_trans_output(&s2_trans);
		}

		if (wi->filter) {
			ret = wi->filter->fn(&(struct s1_walk_context)
					     {
						     .wi = wi,
						     .table_ipa = baddr,
						     .level = level,
					     }, wi->filter->priv);
			if (ret)
				return ret;
		}

		ret = kvm_read_s1_desc(vcpu, ipa, &desc, wi);
		if (ret) {
			fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false);
			return ret;
		}

		new_desc = desc;

		/* Invalid descriptor */
		if (!(desc & BIT(0)))
			goto transfault;

		/* Block mapping, check validity down the line */
		if (!(desc & BIT(1)))
			break;

		/* Page mapping */
		if (level == 3)
			break;

		/* Table handling */
		if (!wi->hpd) {
			wr->APTable |= FIELD_GET(S1_TABLE_AP, desc);
			wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
			wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
		}

		baddr = desc_to_oa(wi, desc);

		/* Check for out-of-range OA */
		if (check_output_size(baddr, wi))
			goto addrsz;

		/* Prepare for next round */
		va_top = va_bottom - 1;
		level++;
	}

	/* Block mapping, check the validity of the level */
	if (!(desc & BIT(1))) {
		bool valid_block = false;

		switch (BIT(wi->pgshift)) {
		case SZ_4K:
			valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0);
			break;
		case SZ_16K:
		case SZ_64K:
			valid_block = level == 2 || (wi->pa52bit && level == 1);
			break;
		}

		if (!valid_block)
			goto transfault;
	}

	baddr = desc_to_oa(wi, desc);
	if (check_output_size(baddr & GENMASK(52, va_bottom), wi))
		goto addrsz;

	if (wi->ha)
		new_desc |= PTE_AF;
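
	/* If the walk set the Access flag, write the updated descriptor back atomically */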
	if (new_desc != desc) {
		if (wi->s2 && !kvm_s2_trans_writable(&s2_trans)) {
			fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), true);
			return -EPERM;
		}

		ret = kvm_swap_s1_desc(vcpu, ipa, desc, new_desc, wi);
		if (ret)
			return ret;

		desc = new_desc;
	}

	if (!(desc & PTE_AF)) {
		fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false);
		return -EACCES;
	}

	va_bottom += contiguous_bit_shift(desc, wi, level);

	wr->failed = false;
	wr->level = level;
	wr->desc = desc;
	wr->pa = baddr & GENMASK(52, va_bottom);
	wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);

	wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG);
	if (wr->nG) {
		u64 asid_ttbr, tcr;

		switch (wi->regime) {
		case TR_EL10:
			tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
			asid_ttbr = ((tcr & TCR_A1) ?
				     vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
				     vcpu_read_sys_reg(vcpu, TTBR0_EL1));
			break;
		case TR_EL20:
			tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
			asid_ttbr = ((tcr & TCR_A1) ?
				     vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
				     vcpu_read_sys_reg(vcpu, TTBR0_EL2));
			break;
		default:
			BUG();
		}

		wr->asid = FIELD_GET(TTBR_ASID_MASK, asid_ttbr);
		if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) ||
		    !(tcr & TCR_ASID16))
			wr->asid &= GENMASK(7, 0);
	}

	return 0;

addrsz:
	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), false);
	return -EINVAL;
transfault:
	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), false);
	return -ENOENT;
}

struct mmu_config {
	u64 ttbr0;
	u64 ttbr1;
	u64 tcr;
	u64 mair;
	u64 tcr2;
	u64 pir;
	u64 pire0;
	u64 por_el0;
	u64 por_el1;
	u64 sctlr;
	u64 vttbr;
	u64 vtcr;
};

static void __mmu_config_save(struct mmu_config *config)
{
	config->ttbr0 = read_sysreg_el1(SYS_TTBR0);
	config->ttbr1 = read_sysreg_el1(SYS_TTBR1);
	config->tcr = read_sysreg_el1(SYS_TCR);
	config->mair = read_sysreg_el1(SYS_MAIR);
	if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
		config->tcr2 = read_sysreg_el1(SYS_TCR2);
		if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
			config->pir = read_sysreg_el1(SYS_PIR);
			config->pire0 = read_sysreg_el1(SYS_PIRE0);
		}
		if (system_supports_poe()) {
			config->por_el1 = read_sysreg_el1(SYS_POR);
			config->por_el0 = read_sysreg_s(SYS_POR_EL0);
		}
	}
	config->sctlr = read_sysreg_el1(SYS_SCTLR);
	config->vttbr = read_sysreg(vttbr_el2);
	config->vtcr = read_sysreg(vtcr_el2);
}

static void __mmu_config_restore(struct mmu_config *config)
{
	/*
	 * ARM errata 1165522 and 1530923 require TGE to be 1 before
	 * we update the guest state.
	 */
	asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));

	write_sysreg_el1(config->ttbr0, SYS_TTBR0);
	write_sysreg_el1(config->ttbr1, SYS_TTBR1);
	write_sysreg_el1(config->tcr, SYS_TCR);
	write_sysreg_el1(config->mair, SYS_MAIR);
	if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
		write_sysreg_el1(config->tcr2, SYS_TCR2);
		if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
			write_sysreg_el1(config->pir, SYS_PIR);
			write_sysreg_el1(config->pire0, SYS_PIRE0);
		}
		if (system_supports_poe()) {
			write_sysreg_el1(config->por_el1, SYS_POR);
			write_sysreg_s(config->por_el0, SYS_POR_EL0);
		}
	}
	write_sysreg_el1(config->sctlr, SYS_SCTLR);
	write_sysreg(config->vttbr, vttbr_el2);
	write_sysreg(config->vtcr, vtcr_el2);
}

static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	u64 host_pan;
	bool fail;

	host_pan = read_sysreg_s(SYS_PSTATE_PAN);
	write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN);

	switch (op) {
	case OP_AT_S1E1RP:
		fail = __kvm_at(OP_AT_S1E1RP, vaddr);
		break;
	case OP_AT_S1E1WP:
		fail = __kvm_at(OP_AT_S1E1WP, vaddr);
		break;
	}

	write_sysreg_s(host_pan, SYS_PSTATE_PAN);

	return fail;
}

#define MEMATTR(ic, oc)		(MEMATTR_##oc << 4 | MEMATTR_##ic)
#define MEMATTR_NC		0b0100
#define MEMATTR_Wt		0b1000
#define MEMATTR_Wb		0b1100
#define MEMATTR_WbRaWa		0b1111

#define MEMATTR_IS_DEVICE(m)	(((m) & GENMASK(7, 4)) == 0)

static u8 s2_memattr_to_attr(u8 memattr)
{
	memattr &= 0b1111;

	switch (memattr) {
	case 0b0000:
	case 0b0001:
	case 0b0010:
	case 0b0011:
		return memattr << 2;
	case 0b0100:
		return MEMATTR(Wb, Wb);
	case 0b0101:
		return MEMATTR(NC, NC);
	case 0b0110:
		return MEMATTR(Wt, NC);
	case 0b0111:
		return MEMATTR(Wb, NC);
	case 0b1000:
		/* Reserved, assume NC */
		return MEMATTR(NC, NC);
	case 0b1001:
		return MEMATTR(NC, Wt);
	case 0b1010:
		return MEMATTR(Wt, Wt);
	case 0b1011:
		return MEMATTR(Wb, Wt);
	case 0b1100:
		/* Reserved, assume NC */
		return MEMATTR(NC, NC);
	case 0b1101:
		return MEMATTR(NC, Wb);
	case 0b1110:
		return MEMATTR(Wt, Wb);
	case 0b1111:
		return MEMATTR(Wb, Wb);
	default:
		unreachable();
	}
}

static u8 combine_s1_s2_attr(u8 s1, u8 s2)
{
	bool transient;
	u8 final = 0;

	/* Upgrade transient s1 to non-transient to simplify things */
	switch (s1) {
	case 0b0001 ... 0b0011:	/* Normal, Write-Through Transient */
		transient = true;
		s1 = MEMATTR_Wt | (s1 & GENMASK(1,0));
		break;
	case 0b0101 ... 0b0111:	/* Normal, Write-Back Transient */
		transient = true;
		s1 = MEMATTR_Wb | (s1 & GENMASK(1,0));
		break;
	default:
		transient = false;
	}

	/* S2CombineS1AttrHints() */
	if ((s1 & GENMASK(3, 2)) == MEMATTR_NC ||
	    (s2 & GENMASK(3, 2)) == MEMATTR_NC)
		final = MEMATTR_NC;
	else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt ||
		 (s2 & GENMASK(3, 2)) == MEMATTR_Wt)
		final = MEMATTR_Wt;
	else
		final = MEMATTR_Wb;

	if (final != MEMATTR_NC) {
		/* Inherit RaWa hints from S1 */
		if (transient) {
			switch (s1 & GENMASK(3, 2)) {
			case MEMATTR_Wt:
				final = 0;
				break;
			case MEMATTR_Wb:
				final = MEMATTR_NC;
				break;
			}
		}

		final |= s1 & GENMASK(1, 0);
	}

	return final;
}

#define ATTR_NSH	0b00
#define ATTR_RSV	0b01
#define ATTR_OSH	0b10
#define ATTR_ISH	0b11

static u8 compute_final_sh(u8 attr, u8 sh)
{
	/* Any form of device, as well as NC has SH[1:0]=0b10 */
	if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC))
		return ATTR_OSH;

	if (sh == ATTR_RSV)	/* Reserved, mapped to NSH */
		sh = ATTR_NSH;

	return sh;
}

static u8 compute_s1_sh(struct s1_walk_info *wi, struct s1_walk_result *wr,
			u8 attr)
{
	u8 sh;

	/*
	 * non-52bit and LPA have their basic shareability described in the
	 * descriptor. LPA2 gets it from the corresponding field in TCR,
	 * conveniently recorded in the walk info.
	 */
	if (!wi->pa52bit || BIT(wi->pgshift) == SZ_64K)
		sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_SH, wr->desc);
	else
		sh = wi->sh;

	return compute_final_sh(attr, sh);
}

static u8 combine_sh(u8 s1_sh, u8 s2_sh)
{
	if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
		return ATTR_OSH;
	if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH)
		return ATTR_ISH;

	return ATTR_NSH;
}

static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
			   struct kvm_s2_trans *tr)
{
	u8 s1_parattr, s2_memattr, final_attr, s2_sh;
	u64 par;

	/* If S2 has failed to translate, report the damage */
	if (tr->esr) {
		par = SYS_PAR_EL1_RES1;
		par |= SYS_PAR_EL1_F;
		par |= SYS_PAR_EL1_S;
		par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr);
		return par;
	}

	s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par);
	s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc);

	if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) {
		if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP))
			s2_memattr &= ~BIT(3);

		/* Combination of R_VRJSW and R_RHWZM */
		switch (s2_memattr) {
		case 0b0101:
			if (MEMATTR_IS_DEVICE(s1_parattr))
				final_attr = s1_parattr;
			else
				final_attr = MEMATTR(NC, NC);
			break;
		case 0b0110:
		case 0b1110:
			final_attr = MEMATTR(WbRaWa, WbRaWa);
			break;
		case 0b0111:
		case 0b1111:
			/* Preserve S1 attribute */
			final_attr = s1_parattr;
			break;
		case 0b0100:
		case 0b1100:
		case 0b1101:
			/* Reserved, do something non-silly */
			final_attr = s1_parattr;
			break;
		default:
			/*
			 * MemAttr[2]=0, Device from S2.
			 *
			 * FWB does not influence the way that stage 1
			 * memory types and attributes are combined
			 * with stage 2 Device type and attributes.
			 */
			final_attr = min(s2_memattr_to_attr(s2_memattr),
					 s1_parattr);
		}
	} else {
		/* Combination of R_HMNDG, R_TNHFM and R_GQFSF */
		u8 s2_parattr = s2_memattr_to_attr(s2_memattr);

		if (MEMATTR_IS_DEVICE(s1_parattr) ||
		    MEMATTR_IS_DEVICE(s2_parattr)) {
			final_attr = min(s1_parattr, s2_parattr);
		} else {
			/* At this stage, this is memory vs memory */
			final_attr = combine_s1_s2_attr(s1_parattr & 0xf,
							s2_parattr & 0xf);
			final_attr |= combine_s1_s2_attr(s1_parattr >> 4,
							 s2_parattr >> 4) << 4;
		}
	}

	if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) &&
	    !MEMATTR_IS_DEVICE(final_attr))
		final_attr = MEMATTR(NC, NC);

	s2_sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_SH, tr->desc);

	par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
	par |= tr->output & GENMASK(47, 12);
	par |= FIELD_PREP(SYS_PAR_EL1_SH,
			  combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
				     compute_final_sh(final_attr, s2_sh)));

	return par;
}

static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
			  struct s1_walk_result *wr)
{
	u64 par;

	if (wr->failed) {
		par = SYS_PAR_EL1_RES1;
		par |= SYS_PAR_EL1_F;
		par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
		par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
		par |= wr->s2 ? SYS_PAR_EL1_S : 0;
	} else if (wr->level == S1_MMU_DISABLED) {
		/* MMU off or HCR_EL2.DC == 1 */
		par = SYS_PAR_EL1_NSE;
		par |= wr->pa & SYS_PAR_EL1_PA;

		if (wi->regime == TR_EL10 && vcpu_has_nv(vcpu) &&
		    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
			par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
					  MEMATTR(WbRaWa, WbRaWa));
			par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH);
		} else {
			par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
			par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH);
		}
	} else {
		u64 mair, sctlr;
		u8 sh;

		par = SYS_PAR_EL1_NSE;

		mair = (wi->regime == TR_EL10 ?
			vcpu_read_sys_reg(vcpu, MAIR_EL1) :
			vcpu_read_sys_reg(vcpu, MAIR_EL2));

		mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
		mair &= 0xff;

		sctlr = (wi->regime == TR_EL10 ?
			 vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
			 vcpu_read_sys_reg(vcpu, SCTLR_EL2));

		/* Force NC for memory if SCTLR_ELx.C is clear */
		if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
			mair = MEMATTR(NC, NC);

		par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
		par |= wr->pa & SYS_PAR_EL1_PA;

		sh = compute_s1_sh(wi, wr, mair);
		par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
	}

	return par;
}

static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
{
	u64 sctlr;

	if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
		return false;

	if (s1pie_enabled(vcpu, regime))
		return true;

	if (regime == TR_EL10)
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
	else
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);

	return sctlr & SCTLR_EL1_EPAN;
}

static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu,
					  struct s1_walk_info *wi,
					  struct s1_walk_result *wr)
{
	bool wxn;

	/* Non-hierarchical part of AArch64.S1DirectBasePermissions() */
	if (wi->regime != TR_EL2) {
		switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) {
		case 0b00:
			wr->pr = wr->pw = true;
			wr->ur = wr->uw = false;
			break;
		case 0b01:
			wr->pr = wr->pw = wr->ur = wr->uw = true;
			break;
		case 0b10:
			wr->pr = true;
			wr->pw = wr->ur = wr->uw = false;
			break;
		case 0b11:
			wr->pr = wr->ur = true;
			wr->pw = wr->uw = false;
			break;
		}

		/* We don't use px for anything yet, but hey... */
		wr->px = !((wr->desc & PTE_PXN) || wr->uw);
		wr->ux = !(wr->desc & PTE_UXN);
	} else {
		wr->ur = wr->uw = wr->ux = false;

		if (!(wr->desc & PTE_RDONLY)) {
			wr->pr = wr->pw = true;
		} else {
			wr->pr = true;
			wr->pw = false;
		}

		/* XN maps to UXN */
		wr->px = !(wr->desc & PTE_UXN);
	}

	switch (wi->regime) {
	case TR_EL2:
	case TR_EL20:
		wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN);
		break;
	case TR_EL10:
		wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN);
		break;
	}

	wr->pwxn = wr->uwxn = wxn;
	wr->pov = wi->poe;
	wr->uov = wi->e0poe;
}

static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu,
						struct s1_walk_info *wi,
						struct s1_walk_result *wr)
{
	/* Hierarchical part of AArch64.S1DirectBasePermissions() */
	if (wi->regime != TR_EL2) {
		switch (wr->APTable) {
		case 0b00:
			break;
		case 0b01:
			wr->ur = wr->uw = false;
			break;
		case 0b10:
			wr->pw = wr->uw = false;
			break;
		case 0b11:
			wr->pw = wr->ur = wr->uw = false;
			break;
		}

		wr->px &= !wr->PXNTable;
		wr->ux &= !wr->UXNTable;
	} else {
		if (wr->APTable & BIT(1))
			wr->pw = false;

		/* XN maps to UXN */
		wr->px &= !wr->UXNTable;
	}
}

#define perm_idx(v, r, i)	((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf)

#define set_priv_perms(wr, r, w, x)	\
	do {				\
		(wr)->pr = (r);		\
		(wr)->pw = (w);		\
		(wr)->px = (x);		\
	} while (0)

#define set_unpriv_perms(wr, r, w, x)	\
	do {				\
		(wr)->ur = (r);		\
		(wr)->uw = (w);		\
		(wr)->ux = (x);		\
	} while (0)

#define set_priv_wxn(wr, v)		\
	do {				\
		(wr)->pwxn = (v);	\
	} while (0)

#define set_unpriv_wxn(wr, v)		\
	do {				\
		(wr)->uwxn = (v);	\
	} while (0)

/* Similar to AArch64.S1IndirectBasePermissions(), without GCS */
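/* 'w' selects the priv/unpriv setters above; 'ip' is the 4-bit permission index */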
#define set_perms(w, wr, ip)						\
	do {								\
		/* R_LLZDZ */						\
		switch ((ip)) {						\
		case 0b0000:						\
			set_ ## w ## _perms((wr), false, false, false);	\
			break;						\
		case 0b0001:						\
			set_ ## w ## _perms((wr), true , false, false);	\
			break;						\
		case 0b0010:						\
			set_ ## w ## _perms((wr), false, false, true );	\
			break;						\
		case 0b0011:						\
			set_ ## w ## _perms((wr), true , false, true );	\
			break;						\
		case 0b0100:						\
			set_ ## w ## _perms((wr), false, false, false);	\
			break;						\
		case 0b0101:						\
			set_ ## w ## _perms((wr), true , true , false);	\
			break;						\
		case 0b0110:						\
			set_ ## w ## _perms((wr), true , true , true );	\
			break;						\
		case 0b0111:						\
			set_ ## w ## _perms((wr), true , true , true );	\
			break;						\
		case 0b1000:						\
			set_ ## w ## _perms((wr), true , false, false);	\
			break;						\
		case 0b1001:						\
			set_ ## w ## _perms((wr), true , false, false);	\
			break;						\
		case 0b1010:						\
			set_ ## w ## _perms((wr), true , false, true );	\
			break;						\
		case 0b1011:						\
			set_ ## w ## _perms((wr), false, false, false);	\
			break;						\
		case 0b1100:						\
			set_ ## w ## _perms((wr), true , true , false);	\
			break;						\
		case 0b1101:						\
			set_ ## w ## _perms((wr), false, false, false);	\
			break;						\
		case 0b1110:						\
			set_ ## w ## _perms((wr), true , true , true );	\
			break;						\
		case 0b1111:						\
			set_ ## w ## _perms((wr), false, false, false);	\
			break;						\
		}							\
									\
		/* R_HJYGR */						\
		set_ ## w ## _wxn((wr), ((ip) == 0b0110));		\
									\
	} while (0)

static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu,
					    struct s1_walk_info *wi,
					    struct s1_walk_result *wr)
{
	u8 up, pp, idx;

	idx = pte_pi_index(wr->desc);

	switch (wi->regime) {
	case TR_EL10:
		pp = perm_idx(vcpu, PIR_EL1, idx);
		up = perm_idx(vcpu, PIRE0_EL1, idx);
		break;
	case TR_EL20:
		pp = perm_idx(vcpu, PIR_EL2, idx);
		up = perm_idx(vcpu, PIRE0_EL2, idx);
		break;
	case TR_EL2:
		pp = perm_idx(vcpu, PIR_EL2, idx);
		up = 0;
		break;
	}

	set_perms(priv, wr, pp);

	if (wi->regime != TR_EL2)
		set_perms(unpriv, wr, up);
	else
		set_unpriv_perms(wr, false, false, false);

	wr->pov = wi->poe && !(pp & BIT(3));
	wr->uov = wi->e0poe && !(up & BIT(3));

	/* R_VFPJF */
	if (wr->px && wr->uw) {
		set_priv_perms(wr, false, false, false);
		set_unpriv_perms(wr, false, false, false);
	}
}

static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu,
					   struct s1_walk_info *wi,
					   struct s1_walk_result *wr)
{
	u8 idx, pov_perms, uov_perms;

	idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc);

	if (wr->pov) {
		switch (wi->regime) {
		case TR_EL10:
			pov_perms = perm_idx(vcpu, POR_EL1, idx);
			break;
		case TR_EL20:
			pov_perms = perm_idx(vcpu, POR_EL2, idx);
			break;
		case TR_EL2:
			pov_perms = perm_idx(vcpu, POR_EL2, idx);
			break;
		}

		if (pov_perms & ~POE_RWX)
			pov_perms = POE_NONE;

		/* R_QXXPC, S1PrivOverlay enabled */
		if (wr->pwxn && (pov_perms & POE_X))
			pov_perms &= ~POE_W;

		wr->pr &= pov_perms & POE_R;
		wr->pw &= pov_perms & POE_W;
		wr->px &= pov_perms & POE_X;
	}

	if (wr->uov) {
		switch (wi->regime) {
		case TR_EL10:
			uov_perms = perm_idx(vcpu, POR_EL0, idx);
			break;
		case TR_EL20:
			uov_perms = perm_idx(vcpu, POR_EL0, idx);
			break;
		case TR_EL2:
			uov_perms = 0;
			break;
		}

		if (uov_perms & ~POE_RWX)
			uov_perms = POE_NONE;

		/* R_NPBXC, S1UnprivOverlay enabled */
		if (wr->uwxn && (uov_perms & POE_X))
			uov_perms &= ~POE_W;

		wr->ur &= uov_perms & POE_R;
		wr->uw &= uov_perms & POE_W;
		wr->ux &= uov_perms & POE_X;
	}
}

static void compute_s1_permissions(struct kvm_vcpu *vcpu,
				   struct s1_walk_info *wi,
				   struct s1_walk_result *wr)
{
	bool pan;

	if (!s1pie_enabled(vcpu, wi->regime))
		compute_s1_direct_permissions(vcpu, wi, wr);
	else
		compute_s1_indirect_permissions(vcpu, wi, wr);

	if (!wi->hpd)
		compute_s1_hierarchical_permissions(vcpu, wi, wr);

	compute_s1_overlay_permissions(vcpu, wi, wr);

	/* R_QXXPC, S1PrivOverlay disabled */
	if (!wr->pov)
		wr->px &= !(wr->pwxn && wr->pw);

	/* R_NPBXC, S1UnprivOverlay disabled */
	if (!wr->uov)
		wr->ux &= !(wr->uwxn && wr->uw);

	pan = wi->pan && (wr->ur || wr->uw ||
			  (pan3_enabled(vcpu, wi->regime) && wr->ux));
	wr->pw &= !pan;
	wr->pr &= !pan;
}

static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par)
{
	struct s1_walk_result wr = {};
	struct s1_walk_info wi = {};
	bool perm_fail = false;
	int ret, idx;

	wi.regime = compute_translation_regime(vcpu, op);
	wi.as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
	wi.pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) &&
		 (*vcpu_cpsr(vcpu) & PSR_PAN_BIT);

	ret = setup_s1_walk(vcpu, &wi, &wr, vaddr);
	if (ret)
		goto compute_par;

	if (wr.level == S1_MMU_DISABLED)
		goto compute_par;

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	ret = walk_s1(vcpu, &wi, &wr, vaddr);

	srcu_read_unlock(&vcpu->kvm->srcu, idx);

	/*
	 * Race to update a descriptor -- restart the walk.
	 */
	if (ret == -EAGAIN)
		return ret;
	if (ret)
		goto compute_par;

	compute_s1_permissions(vcpu, &wi, &wr);

	switch (op) {
	case OP_AT_S1E1RP:
	case OP_AT_S1E1R:
	case OP_AT_S1E2R:
		perm_fail = !wr.pr;
		break;
	case OP_AT_S1E1WP:
	case OP_AT_S1E1W:
	case OP_AT_S1E2W:
		perm_fail = !wr.pw;
		break;
	case OP_AT_S1E0R:
		perm_fail = !wr.ur;
		break;
	case OP_AT_S1E0W:
		perm_fail = !wr.uw;
		break;
	case OP_AT_S1E1A:
	case OP_AT_S1E2A:
		break;
	default:
		BUG();
	}

	if (perm_fail)
		fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false);

compute_par:
	*par = compute_par_s1(vcpu, &wi, &wr);
	return 0;
}

/*
 * Return the PAR_EL1 value as the result of a valid translation.
 *
 * If the translation is unsuccessful, the value may only contain
 * PAR_EL1.F, and cannot be taken at face value. It isn't an
 * indication of the translation having failed, only that the fast
 * path did not succeed, *unless* it indicates a S1 permission or
 * access fault.
 */
static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	struct mmu_config config;
	struct kvm_s2_mmu *mmu;
	bool fail, mmu_cs;
	u64 par;

	par = SYS_PAR_EL1_F;

	/*
	 * We've trapped, so everything is live on the CPU. As we will
	 * be switching contexts behind everybody's back, disable
	 * interrupts while holding the mmu lock.
	 */
	guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock);

	/*
	 * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already
	 * the right one (as we trapped from vEL2). If not, save the
	 * full MMU context.
	 *
	 * We are also guaranteed to be in the correct context if
	 * we're not in a nested VM.
	 */
	mmu_cs = (vcpu_has_nv(vcpu) &&
		  !(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)));
	if (!mmu_cs)
		goto skip_mmu_switch;

	/*
	 * Obtaining the S2 MMU for a L2 is horribly racy, and we may not
	 * find it (recycled by another vcpu, for example). When this
	 * happens, admit defeat immediately and use the SW (slow) path.
	 */
	mmu = lookup_s2_mmu(vcpu);
	if (!mmu)
		return par;

	__mmu_config_save(&config);

	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0);
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1);
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR);
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR);
	if (kvm_has_tcr2(vcpu->kvm)) {
		write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2);
		if (kvm_has_s1pie(vcpu->kvm)) {
			write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR);
			write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0);
		}
		if (kvm_has_s1poe(vcpu->kvm)) {
			write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR);
			write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0);
		}
	}
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR);
	__load_stage2(mmu, mmu->arch);

skip_mmu_switch:
	/* Temporarily switch back to guest context */
	write_sysreg_hcr(vcpu->arch.hcr_el2);
	isb();

	switch (op) {
	case OP_AT_S1E1RP:
	case OP_AT_S1E1WP:
		fail = at_s1e1p_fast(vcpu, op, vaddr);
		break;
	case OP_AT_S1E1R:
		fail = __kvm_at(OP_AT_S1E1R, vaddr);
		break;
	case OP_AT_S1E1W:
		fail = __kvm_at(OP_AT_S1E1W, vaddr);
		break;
	case OP_AT_S1E0R:
		fail = __kvm_at(OP_AT_S1E0R, vaddr);
		break;
	case OP_AT_S1E0W:
		fail = __kvm_at(OP_AT_S1E0W, vaddr);
		break;
	case OP_AT_S1E1A:
		fail = __kvm_at(OP_AT_S1E1A, vaddr);
		break;
	default:
		WARN_ON_ONCE(1);
		fail = true;
		break;
	}

	if (!fail)
		par = read_sysreg_par();

	write_sysreg_hcr(HCR_HOST_VHE_FLAGS);

	if (mmu_cs)
		__mmu_config_restore(&config);

	return par;
}

static bool par_check_s1_perm_fault(u64 par)
{
	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);

	return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
		!(par & SYS_PAR_EL1_S));
}

static bool par_check_s1_access_fault(u64 par)
{
	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);

	return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS &&
		!(par & SYS_PAR_EL1_S));
}

int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
	int ret;

	/*
	 * If PAR_EL1 reports that AT failed on a S1 permission or access
	 * fault, we know for sure that the PTW was able to walk the S1
	 * tables and there's nothing else to do.
	 *
	 * If AT failed for any other reason, then we must walk the guest S1
	 * to emulate the instruction.
	 */
	if ((par & SYS_PAR_EL1_F) &&
	    !par_check_s1_perm_fault(par) &&
	    !par_check_s1_access_fault(par)) {
		ret = handle_at_slow(vcpu, op, vaddr, &par);
		if (ret)
			return ret;
	}

	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
	return 0;
}

int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	u64 par;
	int ret;

	/*
	 * We've trapped, so everything is live on the CPU. As we will be
	 * switching context behind everybody's back, disable interrupts...
	 */
	scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) {
		u64 val, hcr;
		bool fail;

		val = hcr = read_sysreg(hcr_el2);
		val &= ~HCR_TGE;
		val |= HCR_VM;

		if (!vcpu_el2_e2h_is_set(vcpu))
			val |= HCR_NV | HCR_NV1;

		write_sysreg_hcr(val);
		isb();

		par = SYS_PAR_EL1_F;

		switch (op) {
		case OP_AT_S1E2R:
			fail = __kvm_at(OP_AT_S1E1R, vaddr);
			break;
		case OP_AT_S1E2W:
			fail = __kvm_at(OP_AT_S1E1W, vaddr);
			break;
		case OP_AT_S1E2A:
			fail = __kvm_at(OP_AT_S1E1A, vaddr);
			break;
		default:
			WARN_ON_ONCE(1);
			fail = true;
		}

		isb();

		if (!fail)
			par = read_sysreg_par();

		write_sysreg_hcr(hcr);
		isb();
	}

	/* We failed the translation, let's replay it in slow motion */
	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) {
		ret = handle_at_slow(vcpu, op, vaddr, &par);
		if (ret)
			return ret;
	}

	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
	return 0;
}

int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	struct kvm_s2_trans out = {};
	u64 ipa, par;
	bool write;
	int ret;

	/* Do the stage-1 translation */
	switch (op) {
	case OP_AT_S12E1R:
		op = OP_AT_S1E1R;
		write = false;
		break;
	case OP_AT_S12E1W:
		op = OP_AT_S1E1W;
		write = true;
		break;
	case OP_AT_S12E0R:
		op = OP_AT_S1E0R;
		write = false;
		break;
	case OP_AT_S12E0W:
		op = OP_AT_S1E0W;
		write = true;
		break;
	default:
		WARN_ON_ONCE(1);
		return 0;
	}

	__kvm_at_s1e01(vcpu, op, vaddr);
	par = vcpu_read_sys_reg(vcpu, PAR_EL1);
	if (par & SYS_PAR_EL1_F)
		return 0;

	/*
	 * If we only have a single stage of translation (EL2&0), exit
	 * early. Same thing if {VM,DC}=={0,0}.
	 */
	if (compute_translation_regime(vcpu, op) == TR_EL20 ||
	    !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
		return 0;

	/* Do the stage-2 translation */
	ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
	out.esr = 0;
	ret = kvm_walk_nested_s2(vcpu, ipa, &out);
	if (ret < 0)
		return ret;

	/* Check the access permission */
	if (!out.esr &&
	    ((!write && !out.readable) || (write && !out.writable)))
		out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);

	par = compute_par_s12(vcpu, par, &out);
	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
	return 0;
}

/*
 * Translate a VA for a given EL in a given translation regime, with
 * or without PAN. This requires wi->{regime, as_el0, pan} to be
 * set. The rest of the wi and wr should be 0-initialised.
 */
int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
		       struct s1_walk_result *wr, u64 va)
{
	int ret;

	ret = setup_s1_walk(vcpu, wi, wr, va);
	if (ret)
		return ret;

	if (wr->level == S1_MMU_DISABLED) {
		wr->ur = wr->uw = wr->ux = true;
		wr->pr = wr->pw = wr->px = true;
	} else {
		ret = walk_s1(vcpu, wi, wr, va);
		if (ret)
			return ret;

		compute_s1_permissions(vcpu, wi, wr);
	}

	return 0;
}

struct desc_match {
	u64 ipa;
	int level;
};

static int match_s1_desc(struct s1_walk_context *ctxt, void *priv)
{
	struct desc_match *dm = priv;
	u64 ipa = dm->ipa;

	/* Use S1 granule alignment */
	ipa &= GENMASK(51, ctxt->wi->pgshift);

	/* Not the IPA we're looking for? Continue. */
	if (ipa != ctxt->table_ipa)
		return 0;

	/* Note the level and interrupt the walk */
	dm->level = ctxt->level;
	return -EINTR;
}

int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level)
{
	struct desc_match dm = {
		.ipa = ipa,
	};
	struct s1_walk_info wi = {
		.filter = &(struct s1_walk_filter){
			.fn = match_s1_desc,
			.priv = &dm,
		},
		.as_el0 = false,
		.pan = false,
	};
	struct s1_walk_result wr = {};
	int ret;

	if (is_hyp_ctxt(vcpu))
		wi.regime = vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
	else
		wi.regime = TR_EL10;

	ret = setup_s1_walk(vcpu, &wi, &wr, va);
	if (ret)
		return ret;

	/* We really expect the S1 MMU to be on here... */
	if (WARN_ON_ONCE(wr.level == S1_MMU_DISABLED)) {
		*level = 0;
		return 0;
	}

	/* Walk the guest's PT, looking for a match along the way */
	ret = walk_s1(vcpu, &wi, &wr, va);
	switch (ret) {
	case -EINTR:
		/* We interrupted the walk on a match, return the level */
		*level = dm.level;
		return 0;
	case 0:
		/* The walk completed, we failed to find the entry */
		return -ENOENT;
	default:
		/* Any other error... */
		return ret;
	}
}

#ifdef CONFIG_ARM64_LSE_ATOMICS
static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new)
{
	u64 tmp = old;
	int ret = 0;

	uaccess_enable_privileged();

	asm volatile(__LSE_PREAMBLE
		     "1: cas %[old], %[new], %[addr]\n"
		     "2:\n"
		     _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret])
		     : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret)
		     : [new] "r" (new)
		     : "memory");

	uaccess_disable_privileged();

	if (ret)
		return ret;
	if (tmp != old)
		return -EAGAIN;

	return ret;
}
#else
static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new)
{
	return -EINVAL;
}
#endif

static int __llsc_swap_desc(u64 __user *ptep, u64 old, u64 new)
{
	int ret = 1;
	u64 tmp;

	uaccess_enable_privileged();

	asm volatile("prfm pstl1strm, %[addr]\n"
		     "1: ldxr %[tmp], %[addr]\n"
		     "sub %[tmp], %[tmp], %[old]\n"
		     "cbnz %[tmp], 3f\n"
		     "2: stlxr %w[ret], %[new], %[addr]\n"
		     "3:\n"
		     _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret])
		     _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret])
		     : [ret] "+r" (ret), [addr] "+Q" (*ptep), [tmp] "=&r" (tmp)
		     : [old] "r" (old), [new] "r" (new)
		     : "memory");

	uaccess_disable_privileged();

	/* STLXR didn't update the descriptor, or the compare failed */
	if (ret == 1)
		return -EAGAIN;

	return ret;
}

int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new)
{
	struct kvm_memory_slot *slot;
	unsigned long hva;
	u64 __user *ptep;
	bool writable;
	int offset;
	gfn_t gfn;
	int r;

	lockdep_assert(srcu_read_lock_held(&kvm->srcu));

	gfn = ipa >> PAGE_SHIFT;
	offset = offset_in_page(ipa);
	slot = gfn_to_memslot(kvm, gfn);
	hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
	if (kvm_is_error_hva(hva))
		return -EINVAL;
	if (!writable)
		return -EPERM;

	/* Add the byte offset within the page before converting to a pointer */
	ptep = (u64 __user *)(hva + offset);
	if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS))
		r = __lse_swap_desc(ptep, old, new);
	else
		r = __llsc_swap_desc(ptep, old, new);

	if (r < 0)
		return r;

	mark_page_dirty_in_slot(kvm, slot, gfn);
	return 0;
}