// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 - Linaro Ltd
 * Author: Jintack Lim <jintack.lim@linaro.org>
 */

#include <linux/kvm_host.h>

#include <asm/esr.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <asm/lsui.h>

static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw)
{
	wr->fst = fst;
	wr->ptw = s1ptw;
	wr->s2 = s1ptw;
	wr->failed = true;
}

#define S1_MMU_DISABLED		(-127)

static int get_ia_size(struct s1_walk_info *wi)
{
	return 64 - wi->txsz;
}

/* Return true if the IPA is out of the OA range */
static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
{
	if (wi->pa52bit)
		return wi->max_oa_bits < 52 && (ipa & GENMASK_ULL(51, wi->max_oa_bits));
	return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
}

static bool has_52bit_pa(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, u64 tcr)
{
	switch (BIT(wi->pgshift)) {
	case SZ_64K:
	default:	/* IMPDEF: treat any other value as 64k */
		if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52))
			return false;
		return ((wi->regime == TR_EL2 ?
			 FIELD_GET(TCR_EL2_PS_MASK, tcr) :
			 FIELD_GET(TCR_IPS_MASK, tcr)) == 0b0110);
	case SZ_16K:
		if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT))
			return false;
		break;
	case SZ_4K:
		if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT))
			return false;
		break;
	}

	return (tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS));
}

static u64 desc_to_oa(struct s1_walk_info *wi, u64 desc)
{
	u64 addr;

	if (!wi->pa52bit)
		return desc & GENMASK_ULL(47, wi->pgshift);

	switch (BIT(wi->pgshift)) {
	case SZ_4K:
	case SZ_16K:
		addr = desc & GENMASK_ULL(49, wi->pgshift);
		addr |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, desc) << 50;
		break;
	case SZ_64K:
	default:	/* IMPDEF: treat any other value as 64k */
		addr = desc & GENMASK_ULL(47, wi->pgshift);
		addr |= FIELD_GET(KVM_PTE_ADDR_51_48, desc) << 48;
		break;
	}

	return addr;
}

/* Return the translation regime that applies to an AT instruction */
static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
{
	/*
	 * We only get here from guest EL2, so the translation
	 * regime AT applies to is solely defined by {E2H,TGE}.
	 */
	switch (op) {
	case OP_AT_S1E2R:
	case OP_AT_S1E2W:
	case OP_AT_S1E2A:
		return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
	default:
		return (vcpu_el2_e2h_is_set(vcpu) &&
			vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
	}
}
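/*
 * Regime selection at a glance, as implemented above:
 *
 *	AT S1E2*:	E2H == 0		-> EL2
 *			E2H == 1		-> EL2&0
 *	AT S1E{0,1}*:	{E2H,TGE} == {1,1}	-> EL2&0
 *			otherwise		-> EL1&0
 */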
static u64 effective_tcr2(struct kvm_vcpu *vcpu, enum trans_regime regime)
{
	if (regime == TR_EL10) {
		if (vcpu_has_nv(vcpu) &&
		    !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En))
			return 0;

		return vcpu_read_sys_reg(vcpu, TCR2_EL1);
	}

	return vcpu_read_sys_reg(vcpu, TCR2_EL2);
}

static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
{
	if (!kvm_has_s1pie(vcpu->kvm))
		return false;

	/* Abuse TCR2_EL1_PIE and use it for EL2 as well */
	return effective_tcr2(vcpu, regime) & TCR2_EL1_PIE;
}

static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi)
{
	u64 val;

	if (!kvm_has_s1poe(vcpu->kvm)) {
		wi->poe = wi->e0poe = false;
		return;
	}

	val = effective_tcr2(vcpu, wi->regime);

	/* Abuse TCR2_EL1_* for EL2 */
	wi->poe = val & TCR2_EL1_POE;
	wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE);
}

static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
			 struct s1_walk_result *wr, u64 va)
{
	u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr;
	unsigned int stride, x;
	bool va55, tbi, lva;

	va55 = va & BIT(55);

	if (vcpu_has_nv(vcpu)) {
		hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
		wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));
	} else {
		WARN_ON_ONCE(wi->regime != TR_EL10);
		wi->s2 = false;
		hcr = 0;
	}

	switch (wi->regime) {
	case TR_EL10:
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
		tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
		ttbr = (va55 ?
			vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
			vcpu_read_sys_reg(vcpu, TTBR0_EL1));
		break;
	case TR_EL2:
	case TR_EL20:
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
		tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
		ttbr = (va55 ?
			vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
			vcpu_read_sys_reg(vcpu, TTBR0_EL2));
		break;
	default:
		BUG();
	}

	/* Someone was silly enough to encode TG0/TG1 differently */
	if (va55 && wi->regime != TR_EL2) {
		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
		tg = FIELD_GET(TCR_TG1_MASK, tcr);

		switch (tg << TCR_TG1_SHIFT) {
		case TCR_TG1_4K:
			wi->pgshift = 12; break;
		case TCR_TG1_16K:
			wi->pgshift = 14; break;
		case TCR_TG1_64K:
		default:	/* IMPDEF: treat any other value as 64k */
			wi->pgshift = 16; break;
		}
	} else {
		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
		tg = FIELD_GET(TCR_TG0_MASK, tcr);

		switch (tg << TCR_TG0_SHIFT) {
		case TCR_TG0_4K:
			wi->pgshift = 12; break;
		case TCR_TG0_16K:
			wi->pgshift = 14; break;
		case TCR_TG0_64K:
		default:	/* IMPDEF: treat any other value as 64k */
			wi->pgshift = 16; break;
		}
	}

	wi->pa52bit = has_52bit_pa(vcpu, wi, tcr);

	ia_bits = get_ia_size(wi);

	/* AArch64.S1StartLevel() */
	stride = wi->pgshift - 3;
	wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
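	/*
	 * Worked example for the start level computation above, with a
	 * 4k granule (pgshift = 12, stride = 9): T0SZ = 16 gives
	 * ia_bits = 48, so sl = 3 - ((48 - 1 - 12) / 9) = 0 (a level 0
	 * start), while T0SZ = 25 (a 39bit VA) gives sl = 3 - 2 = 1.
	 */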
	if (wi->regime == TR_EL2 && va55)
		goto addrsz;

	tbi = (wi->regime == TR_EL2 ?
	       FIELD_GET(TCR_EL2_TBI, tcr) :
	       (va55 ?
		FIELD_GET(TCR_TBI1, tcr) :
		FIELD_GET(TCR_TBI0, tcr)));

	if (!tbi && (u64)sign_extend64(va, 55) != va)
		goto addrsz;

	wi->sh = (wi->regime == TR_EL2 ?
		  FIELD_GET(TCR_EL2_SH0_MASK, tcr) :
		  (va55 ?
		   FIELD_GET(TCR_SH1_MASK, tcr) :
		   FIELD_GET(TCR_SH0_MASK, tcr)));

	va = (u64)sign_extend64(va, 55);

	/* Let's put the MMU disabled case aside immediately */
	switch (wi->regime) {
	case TR_EL10:
		/*
		 * If dealing with the EL1&0 translation regime, 3 things
		 * can disable the S1 translation:
		 *
		 * - HCR_EL2.DC = 1
		 * - HCR_EL2.{E2H,TGE} = {0,1}
		 * - SCTLR_EL1.M = 0
		 *
		 * The TGE part is interesting. If we have decided that this
		 * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
		 * {0,x}, and we only need to test for TGE == 1.
		 */
		if (hcr & (HCR_DC | HCR_TGE)) {
			wr->level = S1_MMU_DISABLED;
			break;
		}
		fallthrough;
	case TR_EL2:
	case TR_EL20:
		if (!(sctlr & SCTLR_ELx_M))
			wr->level = S1_MMU_DISABLED;
		break;
	}

	if (wr->level == S1_MMU_DISABLED) {
		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
			goto addrsz;

		wr->pa = va;
		return 0;
	}

	wi->be = sctlr & SCTLR_ELx_EE;

	wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
	wi->hpd &= (wi->regime == TR_EL2 ?
		    FIELD_GET(TCR_EL2_HPD, tcr) :
		    (va55 ?
		     FIELD_GET(TCR_HPD1, tcr) :
		     FIELD_GET(TCR_HPD0, tcr)));
	/* R_JHSVW */
	wi->hpd |= s1pie_enabled(vcpu, wi->regime);

	/* Do we have POE? */
	compute_s1poe(vcpu, wi);

	/* R_BVXDG */
	wi->hpd |= (wi->poe || wi->e0poe);

	/* R_PLCGL, R_YXNYW */
	if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
		if (wi->txsz > 39)
			goto transfault;
	} else {
		if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47))
			goto transfault;
	}

	/* R_GTJBY, R_SXWGM */
	switch (BIT(wi->pgshift)) {
	case SZ_4K:
	case SZ_16K:
		lva = wi->pa52bit;
		break;
	case SZ_64K:
		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
		break;
	}

	if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
		goto transfault;

	/* R_YYVYV, I_THCZK */
	if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
	    (va55 && va < GENMASK(63, ia_bits)))
		goto transfault;

	/* I_ZFSYQ */
	if (wi->regime != TR_EL2 &&
	    (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
		goto transfault;

	/* R_BNDVG and following statements */
	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
	    wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0)))
		goto transfault;

	ps = (wi->regime == TR_EL2 ?
	      FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));

	wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps, wi->pa52bit));

	/* Compute minimal alignment */
	x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);

	wi->baddr = ttbr & TTBRx_EL1_BADDR;
	if (wi->pa52bit) {
		/*
		 * Force the alignment to 64 bytes for top-level tables
		 * smaller than 8 entries, since TTBR.BADDR[5:2] are used to
		 * store bits [51:48] of the first level of lookup.
		 */
		x = max(x, 6);

		wi->baddr |= FIELD_GET(GENMASK_ULL(5, 2), ttbr) << 48;
	}

	/* R_VPBBF */
	if (check_output_size(wi->baddr, wi))
		goto addrsz;

	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);

	wi->ha = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, AF);
	wi->ha &= (wi->regime == TR_EL2 ?
		   FIELD_GET(TCR_EL2_HA, tcr) :
		   FIELD_GET(TCR_HA, tcr));

	return 0;

addrsz:
	/*
	 * Address Size Fault level 0 to indicate it comes from TTBR.
	 * Yes, this is an oddity.
	 */
	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false);
	return -EFAULT;

transfault:
	/* Translation Fault on start level */
	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(wi->sl), false);
	return -EFAULT;
}
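/*
 * Guest descriptors are stored with the guest's endianness, per
 * SCTLR_ELx.EE (captured in wi->be by setup_s1_walk()): convert on
 * read, and back again when writing an updated descriptor.
 */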
static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc,
			    struct s1_walk_info *wi)
{
	u64 val;
	int r;

	r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val));
	if (r)
		return r;

	if (wi->be)
		*desc = be64_to_cpu((__force __be64)val);
	else
		*desc = le64_to_cpu((__force __le64)val);

	return 0;
}

static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new,
			    struct s1_walk_info *wi)
{
	if (wi->be) {
		old = (__force u64)cpu_to_be64(old);
		new = (__force u64)cpu_to_be64(new);
	} else {
		old = (__force u64)cpu_to_le64(old);
		new = (__force u64)cpu_to_le64(new);
	}

	return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
}

static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
		   struct s1_walk_result *wr, u64 va)
{
	u64 va_top, va_bottom, baddr, desc, new_desc, ipa;
	struct kvm_s2_trans s2_trans = {};
	int level, stride, ret;

	level = wi->sl;
	stride = wi->pgshift - 3;
	baddr = wi->baddr;

	va_top = get_ia_size(wi) - 1;

	while (1) {
		u64 index;

		va_bottom = (3 - level) * stride + wi->pgshift;
		index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);

		ipa = baddr | index;

		if (wi->s2) {
			ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
			if (ret) {
				fail_s1_walk(wr,
					     (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
					     true);
				return ret;
			}

			if (!kvm_s2_trans_readable(&s2_trans)) {
				fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
					     true);

				return -EPERM;
			}

			ipa = kvm_s2_trans_output(&s2_trans);
		}

		if (wi->filter) {
			ret = wi->filter->fn(&(struct s1_walk_context)
					     {
						     .wi = wi,
						     .table_ipa = baddr,
						     .level = level,
					     }, wi->filter->priv);
			if (ret)
				return ret;
		}

		ret = kvm_read_s1_desc(vcpu, ipa, &desc, wi);
		if (ret) {
			fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false);
			return ret;
		}

		new_desc = desc;

		/* Invalid descriptor */
		if (!(desc & BIT(0)))
			goto transfault;

		/* Block mapping, check validity down the line */
		if (!(desc & BIT(1)))
			break;

		/* Page mapping */
		if (level == 3)
			break;

		/* Table handling */
		if (!wi->hpd) {
			wr->APTable |= FIELD_GET(S1_TABLE_AP, desc);
			wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
			wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
		}

		baddr = desc_to_oa(wi, desc);

		/* Check for out-of-range OA */
		if (check_output_size(baddr, wi))
			goto addrsz;

		/* Prepare for next round */
		va_top = va_bottom - 1;
		level++;
	}

	/* Block mapping, check the validity of the level */
	if (!(desc & BIT(1))) {
		bool valid_block = false;

		switch (BIT(wi->pgshift)) {
		case SZ_4K:
			valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0);
			break;
		case SZ_16K:
		case SZ_64K:
			valid_block = level == 2 || (wi->pa52bit && level == 1);
			break;
		}

		if (!valid_block)
			goto transfault;
	}

	baddr = desc_to_oa(wi, desc);
	if (check_output_size(baddr & GENMASK(52, va_bottom), wi))
		goto addrsz;

	if (wi->ha)
		new_desc |= PTE_AF;
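	/*
	 * Emulate the hardware Access Flag update: an AF set above (when
	 * TCR_ELx.HA is in effect) is written back into the guest
	 * descriptor using an atomic compare-and-swap, which also
	 * requires the tables to be writable at stage 2.
	 */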
	if (new_desc != desc) {
		if (wi->s2 && !kvm_s2_trans_writable(&s2_trans)) {
			fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), true);
			return -EPERM;
		}

		ret = kvm_swap_s1_desc(vcpu, ipa, desc, new_desc, wi);
		if (ret)
			return ret;

		desc = new_desc;
	}

	if (!(desc & PTE_AF)) {
		fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false);
		return -EACCES;
	}

	va_bottom += contiguous_bit_shift(desc, wi, level);

	wr->failed = false;
	wr->level = level;
	wr->desc = desc;
	wr->pa = baddr & GENMASK(52, va_bottom);
	wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);

	wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG);
	if (wr->nG)
		wr->asid = get_asid_by_regime(vcpu, wi->regime);

	return 0;

addrsz:
	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), false);
	return -EINVAL;
transfault:
	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), false);
	return -ENOENT;
}

struct mmu_config {
	u64	ttbr0;
	u64	ttbr1;
	u64	tcr;
	u64	mair;
	u64	tcr2;
	u64	pir;
	u64	pire0;
	u64	por_el0;
	u64	por_el1;
	u64	sctlr;
	u64	vttbr;
	u64	vtcr;
};

static void __mmu_config_save(struct mmu_config *config)
{
	config->ttbr0 = read_sysreg_el1(SYS_TTBR0);
	config->ttbr1 = read_sysreg_el1(SYS_TTBR1);
	config->tcr = read_sysreg_el1(SYS_TCR);
	config->mair = read_sysreg_el1(SYS_MAIR);
	if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
		config->tcr2 = read_sysreg_el1(SYS_TCR2);
		if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
			config->pir = read_sysreg_el1(SYS_PIR);
			config->pire0 = read_sysreg_el1(SYS_PIRE0);
		}
		if (system_supports_poe()) {
			config->por_el1 = read_sysreg_el1(SYS_POR);
			config->por_el0 = read_sysreg_s(SYS_POR_EL0);
		}
	}
	config->sctlr = read_sysreg_el1(SYS_SCTLR);
	config->vttbr = read_sysreg(vttbr_el2);
	config->vtcr = read_sysreg(vtcr_el2);
}

static void __mmu_config_restore(struct mmu_config *config)
{
	/*
	 * ARM errata 1165522 and 1530923 require TGE to be 1 before
	 * we update the guest state.
	 */
	asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));

	write_sysreg_el1(config->ttbr0, SYS_TTBR0);
	write_sysreg_el1(config->ttbr1, SYS_TTBR1);
	write_sysreg_el1(config->tcr, SYS_TCR);
	write_sysreg_el1(config->mair, SYS_MAIR);
	if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
		write_sysreg_el1(config->tcr2, SYS_TCR2);
		if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
			write_sysreg_el1(config->pir, SYS_PIR);
			write_sysreg_el1(config->pire0, SYS_PIRE0);
		}
		if (system_supports_poe()) {
			write_sysreg_el1(config->por_el1, SYS_POR);
			write_sysreg_s(config->por_el0, SYS_POR_EL0);
		}
	}
	write_sysreg_el1(config->sctlr, SYS_SCTLR);
	write_sysreg(config->vttbr, vttbr_el2);
	write_sysreg(config->vtcr, vtcr_el2);
}
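/*
 * AT S1E1RP/S1E1WP check permissions against PSTATE.PAN, so briefly
 * mirror the guest's PAN bit into the hardware PSTATE around the AT,
 * restoring the host's value afterwards.
 */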
static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	u64 host_pan;
	bool fail;

	host_pan = read_sysreg_s(SYS_PSTATE_PAN);
	write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN);

	switch (op) {
	case OP_AT_S1E1RP:
		fail = __kvm_at(OP_AT_S1E1RP, vaddr);
		break;
	case OP_AT_S1E1WP:
		fail = __kvm_at(OP_AT_S1E1WP, vaddr);
		break;
	}

	write_sysreg_s(host_pan, SYS_PSTATE_PAN);

	return fail;
}

#define MEMATTR(ic, oc)		(MEMATTR_##oc << 4 | MEMATTR_##ic)
#define MEMATTR_NC		0b0100
#define MEMATTR_Wt		0b1000
#define MEMATTR_Wb		0b1100
#define MEMATTR_WbRaWa		0b1111

#define MEMATTR_IS_DEVICE(m)	(((m) & GENMASK(7, 4)) == 0)

static u8 s2_memattr_to_attr(u8 memattr)
{
	memattr &= 0b1111;

	switch (memattr) {
	case 0b0000:
	case 0b0001:
	case 0b0010:
	case 0b0011:
		return memattr << 2;
	case 0b0100:
		return MEMATTR(Wb, Wb);
	case 0b0101:
		return MEMATTR(NC, NC);
	case 0b0110:
		return MEMATTR(Wt, NC);
	case 0b0111:
		return MEMATTR(Wb, NC);
	case 0b1000:
		/* Reserved, assume NC */
		return MEMATTR(NC, NC);
	case 0b1001:
		return MEMATTR(NC, Wt);
	case 0b1010:
		return MEMATTR(Wt, Wt);
	case 0b1011:
		return MEMATTR(Wb, Wt);
	case 0b1100:
		/* Reserved, assume NC */
		return MEMATTR(NC, NC);
	case 0b1101:
		return MEMATTR(NC, Wb);
	case 0b1110:
		return MEMATTR(Wt, Wb);
	case 0b1111:
		return MEMATTR(Wb, Wb);
	default:
		unreachable();
	}
}
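/*
 * S1/S2 attribute combining below follows a "weakest wins" rule:
 * Non-cacheable beats Write-Through, which beats Write-Back, evaluated
 * independently for the inner and outer halves of the attribute byte.
 */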
static u8 combine_s1_s2_attr(u8 s1, u8 s2)
{
	bool transient;
	u8 final = 0;

	/* Upgrade transient s1 to non-transient to simplify things */
	switch (s1) {
	case 0b0001 ... 0b0011:	/* Normal, Write-Through Transient */
		transient = true;
		s1 = MEMATTR_Wt | (s1 & GENMASK(1, 0));
		break;
	case 0b0101 ... 0b0111:	/* Normal, Write-Back Transient */
		transient = true;
		s1 = MEMATTR_Wb | (s1 & GENMASK(1, 0));
		break;
	default:
		transient = false;
	}

	/* S2CombineS1AttrHints() */
	if ((s1 & GENMASK(3, 2)) == MEMATTR_NC ||
	    (s2 & GENMASK(3, 2)) == MEMATTR_NC)
		final = MEMATTR_NC;
	else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt ||
		 (s2 & GENMASK(3, 2)) == MEMATTR_Wt)
		final = MEMATTR_Wt;
	else
		final = MEMATTR_Wb;

	if (final != MEMATTR_NC) {
		/* Inherit RaWa hints from S1 */
		if (transient) {
			switch (s1 & GENMASK(3, 2)) {
			case MEMATTR_Wt:
				final = 0;
				break;
			case MEMATTR_Wb:
				final = MEMATTR_NC;
				break;
			}
		}

		final |= s1 & GENMASK(1, 0);
	}

	return final;
}

#define ATTR_NSH	0b00
#define ATTR_RSV	0b01
#define ATTR_OSH	0b10
#define ATTR_ISH	0b11

static u8 compute_final_sh(u8 attr, u8 sh)
{
	/* Any form of device, as well as NC, has SH[1:0]=0b10 */
	if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC))
		return ATTR_OSH;

	if (sh == ATTR_RSV)		/* Reserved, mapped to NSH */
		sh = ATTR_NSH;

	return sh;
}

static u8 compute_s1_sh(struct s1_walk_info *wi, struct s1_walk_result *wr,
			u8 attr)
{
	u8 sh;

	/*
	 * non-52bit and LPA have their basic shareability described in the
	 * descriptor. LPA2 gets it from the corresponding field in TCR,
	 * conveniently recorded in the walk info.
	 */
	if (!wi->pa52bit || BIT(wi->pgshift) == SZ_64K)
		sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_SH, wr->desc);
	else
		sh = wi->sh;

	return compute_final_sh(attr, sh);
}

static u8 combine_sh(u8 s1_sh, u8 s2_sh)
{
	if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
		return ATTR_OSH;
	if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH)
		return ATTR_ISH;

	return ATTR_NSH;
}

static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
			   struct kvm_s2_trans *tr)
{
	u8 s1_parattr, s2_memattr, final_attr, s2_sh;
	u64 par;

	/* If S2 has failed to translate, report the damage */
	if (tr->esr) {
		par = SYS_PAR_EL1_RES1;
		par |= SYS_PAR_EL1_F;
		par |= SYS_PAR_EL1_S;
		par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr);
		return par;
	}

	s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par);
	s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc);

	if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) {
		if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP))
			s2_memattr &= ~BIT(3);

		/* Combination of R_VRJSW and R_RHWZM */
		switch (s2_memattr) {
		case 0b0101:
			if (MEMATTR_IS_DEVICE(s1_parattr))
				final_attr = s1_parattr;
			else
				final_attr = MEMATTR(NC, NC);
			break;
		case 0b0110:
		case 0b1110:
			final_attr = MEMATTR(WbRaWa, WbRaWa);
			break;
		case 0b0111:
		case 0b1111:
			/* Preserve S1 attribute */
			final_attr = s1_parattr;
			break;
		case 0b0100:
		case 0b1100:
		case 0b1101:
			/* Reserved, do something non-silly */
			final_attr = s1_parattr;
			break;
		default:
			/*
			 * MemAttr[2]=0, Device from S2.
			 *
			 * FWB does not influence the way that stage 1
			 * memory types and attributes are combined
			 * with stage 2 Device type and attributes.
			 */
			final_attr = min(s2_memattr_to_attr(s2_memattr),
					 s1_parattr);
		}
	} else {
		/* Combination of R_HMNDG, R_TNHFM and R_GQFSF */
		u8 s2_parattr = s2_memattr_to_attr(s2_memattr);

		if (MEMATTR_IS_DEVICE(s1_parattr) ||
		    MEMATTR_IS_DEVICE(s2_parattr)) {
			final_attr = min(s1_parattr, s2_parattr);
		} else {
			/* At this stage, this is memory vs memory */
			final_attr = combine_s1_s2_attr(s1_parattr & 0xf,
							s2_parattr & 0xf);
			final_attr |= combine_s1_s2_attr(s1_parattr >> 4,
							 s2_parattr >> 4) << 4;
		}
	}

	if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) &&
	    !MEMATTR_IS_DEVICE(final_attr))
		final_attr = MEMATTR(NC, NC);

	s2_sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_SH, tr->desc);

	par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
	par |= tr->output & GENMASK(47, 12);
	par |= FIELD_PREP(SYS_PAR_EL1_SH,
			  combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
				     compute_final_sh(final_attr, s2_sh)));

	return par;
}
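/*
 * PAR_EL1 as built below: F (bit 0) flags a failed translation, with
 * FST holding the fault code and PTW/S qualifying it. On success, PA
 * holds the output address, ATTR[63:56] the memory attributes and
 * SH[8:7] the shareability.
 */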
static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
			  struct s1_walk_result *wr)
{
	u64 par;

	if (wr->failed) {
		par = SYS_PAR_EL1_RES1;
		par |= SYS_PAR_EL1_F;
		par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
		par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
		par |= wr->s2 ? SYS_PAR_EL1_S : 0;
	} else if (wr->level == S1_MMU_DISABLED) {
		/* MMU off or HCR_EL2.DC == 1 */
		par = SYS_PAR_EL1_NSE;
		par |= wr->pa & SYS_PAR_EL1_PA;

		if (wi->regime == TR_EL10 && vcpu_has_nv(vcpu) &&
		    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
			par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
					  MEMATTR(WbRaWa, WbRaWa));
			par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH);
		} else {
			par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
			par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH);
		}
	} else {
		u64 mair, sctlr;
		u8 sh;

		par = SYS_PAR_EL1_NSE;

		mair = (wi->regime == TR_EL10 ?
			vcpu_read_sys_reg(vcpu, MAIR_EL1) :
			vcpu_read_sys_reg(vcpu, MAIR_EL2));

		mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
		mair &= 0xff;

		sctlr = (wi->regime == TR_EL10 ?
			 vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
			 vcpu_read_sys_reg(vcpu, SCTLR_EL2));

		/* Force NC for memory if SCTLR_ELx.C is clear */
		if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
			mair = MEMATTR(NC, NC);

		par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
		par |= wr->pa & SYS_PAR_EL1_PA;

		sh = compute_s1_sh(wi, wr, mair);
		par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
	}

	return par;
}
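/*
 * FEAT_PAN3 (SCTLR_ELx.EPAN) extends PAN to also fault privileged
 * accesses to pages that are executable at EL0. When S1PIE is enabled,
 * the extended check applies irrespective of EPAN, hence the early
 * return below.
 */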
static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
{
	u64 sctlr;

	if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
		return false;

	if (s1pie_enabled(vcpu, regime))
		return true;

	if (regime == TR_EL10)
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
	else
		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);

	return sctlr & SCTLR_EL1_EPAN;
}

static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu,
					  struct s1_walk_info *wi,
					  struct s1_walk_result *wr)
{
	bool wxn;

	/* Non-hierarchical part of AArch64.S1DirectBasePermissions() */
	if (wi->regime != TR_EL2) {
		switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) {
		case 0b00:
			wr->pr = wr->pw = true;
			wr->ur = wr->uw = false;
			break;
		case 0b01:
			wr->pr = wr->pw = wr->ur = wr->uw = true;
			break;
		case 0b10:
			wr->pr = true;
			wr->pw = wr->ur = wr->uw = false;
			break;
		case 0b11:
			wr->pr = wr->ur = true;
			wr->pw = wr->uw = false;
			break;
		}

		/* We don't use px for anything yet, but hey... */
		wr->px = !((wr->desc & PTE_PXN) || wr->uw);
		wr->ux = !(wr->desc & PTE_UXN);
	} else {
		wr->ur = wr->uw = wr->ux = false;

		if (!(wr->desc & PTE_RDONLY)) {
			wr->pr = wr->pw = true;
		} else {
			wr->pr = true;
			wr->pw = false;
		}

		/* XN maps to UXN */
		wr->px = !(wr->desc & PTE_UXN);
	}

	switch (wi->regime) {
	case TR_EL2:
	case TR_EL20:
		wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN);
		break;
	case TR_EL10:
		wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN);
		break;
	}

	wr->pwxn = wr->uwxn = wxn;
	wr->pov = wi->poe;
	wr->uov = wi->e0poe;
}

static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu,
						struct s1_walk_info *wi,
						struct s1_walk_result *wr)
{
	/* Hierarchical part of AArch64.S1DirectBasePermissions() */
	if (wi->regime != TR_EL2) {
		switch (wr->APTable) {
		case 0b00:
			break;
		case 0b01:
			wr->ur = wr->uw = false;
			break;
		case 0b10:
			wr->pw = wr->uw = false;
			break;
		case 0b11:
			wr->pw = wr->ur = wr->uw = false;
			break;
		}

		wr->px &= !wr->PXNTable;
		wr->ux &= !wr->UXNTable;
	} else {
		if (wr->APTable & BIT(1))
			wr->pw = false;

		/* XN maps to UXN */
		wr->px &= !wr->UXNTable;
	}
}
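/*
 * With S1PIE, the descriptor no longer carries AP/XN bits: its 4bit
 * permission index selects a nibble in PIR_ELx (privileged) resp.
 * PIRE0_ELx (unprivileged), decoded into R/W/X by set_perms() below.
 */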
#define perm_idx(v, r, i)	((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf)

#define set_priv_perms(wr, r, w, x)	\
	do {				\
		(wr)->pr = (r);		\
		(wr)->pw = (w);		\
		(wr)->px = (x);		\
	} while (0)

#define set_unpriv_perms(wr, r, w, x)	\
	do {				\
		(wr)->ur = (r);		\
		(wr)->uw = (w);		\
		(wr)->ux = (x);		\
	} while (0)

#define set_priv_wxn(wr, v)		\
	do {				\
		(wr)->pwxn = (v);	\
	} while (0)

#define set_unpriv_wxn(wr, v)		\
	do {				\
		(wr)->uwxn = (v);	\
	} while (0)

/* Similar to AArch64.S1IndirectBasePermissions(), without GCS */
#define set_perms(w, wr, ip)						\
	do {								\
		/* R_LLZDZ */						\
		switch ((ip)) {						\
		case 0b0000:						\
			set_ ## w ## _perms((wr), false, false, false);	\
			break;						\
		case 0b0001:						\
			set_ ## w ## _perms((wr), true , false, false);	\
			break;						\
		case 0b0010:						\
			set_ ## w ## _perms((wr), false, false, true );	\
			break;						\
		case 0b0011:						\
			set_ ## w ## _perms((wr), true , false, true );	\
			break;						\
		case 0b0100:						\
			set_ ## w ## _perms((wr), false, false, false);	\
			break;						\
		case 0b0101:						\
			set_ ## w ## _perms((wr), true , true , false);	\
			break;						\
		case 0b0110:						\
			set_ ## w ## _perms((wr), true , true , true );	\
			break;						\
		case 0b0111:						\
			set_ ## w ## _perms((wr), true , true , true );	\
			break;						\
		case 0b1000:						\
			set_ ## w ## _perms((wr), true , false, false);	\
			break;						\
		case 0b1001:						\
			set_ ## w ## _perms((wr), true , false, false);	\
			break;						\
		case 0b1010:						\
			set_ ## w ## _perms((wr), true , false, true );	\
			break;						\
		case 0b1011:						\
			set_ ## w ## _perms((wr), false, false, false);	\
			break;						\
		case 0b1100:						\
			set_ ## w ## _perms((wr), true , true , false);	\
			break;						\
		case 0b1101:						\
			set_ ## w ## _perms((wr), false, false, false);	\
			break;						\
		case 0b1110:						\
			set_ ## w ## _perms((wr), true , true , true );	\
			break;						\
		case 0b1111:						\
			set_ ## w ## _perms((wr), false, false, false);	\
			break;						\
		}							\
									\
		/* R_HJYGR */						\
		set_ ## w ## _wxn((wr), ((ip) == 0b0110));		\
									\
	} while (0)

static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu,
					    struct s1_walk_info *wi,
					    struct s1_walk_result *wr)
{
	u8 up, pp, idx;

	idx = pte_pi_index(wr->desc);

	switch (wi->regime) {
	case TR_EL10:
		pp = perm_idx(vcpu, PIR_EL1, idx);
		up = perm_idx(vcpu, PIRE0_EL1, idx);
		break;
	case TR_EL20:
		pp = perm_idx(vcpu, PIR_EL2, idx);
		up = perm_idx(vcpu, PIRE0_EL2, idx);
		break;
	case TR_EL2:
		pp = perm_idx(vcpu, PIR_EL2, idx);
		up = 0;
		break;
	}

	set_perms(priv, wr, pp);

	if (wi->regime != TR_EL2)
		set_perms(unpriv, wr, up);
	else
		set_unpriv_perms(wr, false, false, false);

	wr->pov = wi->poe && !(pp & BIT(3));
	wr->uov = wi->e0poe && !(up & BIT(3));

	/* R_VFPJF */
	if (wr->px && wr->uw) {
		set_priv_perms(wr, false, false, false);
		set_unpriv_perms(wr, false, false, false);
	}
}
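/*
 * Permission overlays (FEAT_S1POE): the descriptor's overlay index
 * selects a nibble in POR_ELx, which can only further restrict the
 * base permissions; anything beyond R/W/X collapses to no access.
 */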
static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu,
					   struct s1_walk_info *wi,
					   struct s1_walk_result *wr)
{
	u8 idx, pov_perms, uov_perms;

	idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc);

	if (wr->pov) {
		switch (wi->regime) {
		case TR_EL10:
			pov_perms = perm_idx(vcpu, POR_EL1, idx);
			break;
		case TR_EL20:
			pov_perms = perm_idx(vcpu, POR_EL2, idx);
			break;
		case TR_EL2:
			pov_perms = perm_idx(vcpu, POR_EL2, idx);
			break;
		}

		if (pov_perms & ~POE_RWX)
			pov_perms = POE_NONE;

		/* R_QXXPC, S1PrivOverlay enabled */
		if (wr->pwxn && (pov_perms & POE_X))
			pov_perms &= ~POE_W;

		wr->pr &= pov_perms & POE_R;
		wr->pw &= pov_perms & POE_W;
		wr->px &= pov_perms & POE_X;
	}

	if (wr->uov) {
		switch (wi->regime) {
		case TR_EL10:
			uov_perms = perm_idx(vcpu, POR_EL0, idx);
			break;
		case TR_EL20:
			uov_perms = perm_idx(vcpu, POR_EL0, idx);
			break;
		case TR_EL2:
			uov_perms = 0;
			break;
		}

		if (uov_perms & ~POE_RWX)
			uov_perms = POE_NONE;

		/* R_NPBXC, S1UnprivOverlay enabled */
		if (wr->uwxn && (uov_perms & POE_X))
			uov_perms &= ~POE_W;

		wr->ur &= uov_perms & POE_R;
		wr->uw &= uov_perms & POE_W;
		wr->ux &= uov_perms & POE_X;
	}
}

static void compute_s1_permissions(struct kvm_vcpu *vcpu,
				   struct s1_walk_info *wi,
				   struct s1_walk_result *wr)
{
	bool pan;

	if (!s1pie_enabled(vcpu, wi->regime))
		compute_s1_direct_permissions(vcpu, wi, wr);
	else
		compute_s1_indirect_permissions(vcpu, wi, wr);

	if (!wi->hpd)
		compute_s1_hierarchical_permissions(vcpu, wi, wr);

	compute_s1_overlay_permissions(vcpu, wi, wr);

	/* R_QXXPC, S1PrivOverlay disabled */
	if (!wr->pov)
		wr->px &= !(wr->pwxn && wr->pw);

	/* R_NPBXC, S1UnprivOverlay disabled */
	if (!wr->uov)
		wr->ux &= !(wr->uwxn && wr->uw);

	pan = wi->pan && (wr->ur || wr->uw ||
			  (pan3_enabled(vcpu, wi->regime) && wr->ux));
	wr->pw &= !pan;
	wr->pr &= !pan;
}

static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par)
{
	struct s1_walk_result wr = {};
	struct s1_walk_info wi = {};
	bool perm_fail = false;
	int ret, idx;

	wi.regime = compute_translation_regime(vcpu, op);
	wi.as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
	wi.pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) &&
		 (*vcpu_cpsr(vcpu) & PSR_PAN_BIT);

	ret = setup_s1_walk(vcpu, &wi, &wr, vaddr);
	if (ret)
		goto compute_par;

	if (wr.level == S1_MMU_DISABLED)
		goto compute_par;

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	ret = walk_s1(vcpu, &wi, &wr, vaddr);

	srcu_read_unlock(&vcpu->kvm->srcu, idx);

	/*
	 * Race to update a descriptor -- restart the walk.
	 */
	if (ret == -EAGAIN)
		return ret;
	if (ret)
		goto compute_par;

	compute_s1_permissions(vcpu, &wi, &wr);

	switch (op) {
	case OP_AT_S1E1RP:
	case OP_AT_S1E1R:
	case OP_AT_S1E2R:
		perm_fail = !wr.pr;
		break;
	case OP_AT_S1E1WP:
	case OP_AT_S1E1W:
	case OP_AT_S1E2W:
		perm_fail = !wr.pw;
		break;
	case OP_AT_S1E0R:
		perm_fail = !wr.ur;
		break;
	case OP_AT_S1E0W:
		perm_fail = !wr.uw;
		break;
	case OP_AT_S1E1A:
	case OP_AT_S1E2A:
		break;
	default:
		BUG();
	}

	if (perm_fail)
		fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false);

compute_par:
	*par = compute_par_s1(vcpu, &wi, &wr);
	return 0;
}
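/*
 * AT emulation strategy: try the hardware fast path first (issue the
 * AT instruction with the guest's translation context loaded on the
 * CPU), and only fall back to the software walker above when the
 * fast-path result cannot be trusted.
 */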
/*
 * Return the PAR_EL1 value as the result of a valid translation.
 *
 * If the translation is unsuccessful, the value may only contain
 * PAR_EL1.F, and cannot be taken at face value. It isn't an
 * indication of the translation having failed, only that the fast
 * path did not succeed, *unless* it indicates a S1 permission or
 * access fault.
 */
static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	struct mmu_config config;
	struct kvm_s2_mmu *mmu;
	bool fail, mmu_cs;
	u64 par;

	par = SYS_PAR_EL1_F;

	/*
	 * We've trapped, so everything is live on the CPU. As we will
	 * be switching contexts behind everybody's back, disable
	 * interrupts while holding the mmu lock.
	 */
	guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock);

	/*
	 * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already
	 * the right one (as we trapped from vEL2). If not, save the
	 * full MMU context.
	 *
	 * We are also guaranteed to be in the correct context if
	 * we're not in a nested VM.
	 */
	mmu_cs = (vcpu_has_nv(vcpu) &&
		  !(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)));
	if (!mmu_cs)
		goto skip_mmu_switch;

	/*
	 * Obtaining the S2 MMU for a L2 is horribly racy, and we may not
	 * find it (recycled by another vcpu, for example). When this
	 * happens, admit defeat immediately and use the SW (slow) path.
	 */
	mmu = lookup_s2_mmu(vcpu);
	if (!mmu)
		return par;

	__mmu_config_save(&config);

	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0);
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1);
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR);
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR);
	if (kvm_has_tcr2(vcpu->kvm)) {
		write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2);
		if (kvm_has_s1pie(vcpu->kvm)) {
			write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR);
			write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0);
		}
		if (kvm_has_s1poe(vcpu->kvm)) {
			write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR);
			write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0);
		}
	}
	write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR);
	__load_stage2(mmu, mmu->arch);

skip_mmu_switch:
	/* Temporarily switch back to guest context */
	write_sysreg_hcr(vcpu->arch.hcr_el2);
	isb();

	switch (op) {
	case OP_AT_S1E1RP:
	case OP_AT_S1E1WP:
		fail = at_s1e1p_fast(vcpu, op, vaddr);
		break;
	case OP_AT_S1E1R:
		fail = __kvm_at(OP_AT_S1E1R, vaddr);
		break;
	case OP_AT_S1E1W:
		fail = __kvm_at(OP_AT_S1E1W, vaddr);
		break;
	case OP_AT_S1E0R:
		fail = __kvm_at(OP_AT_S1E0R, vaddr);
		break;
	case OP_AT_S1E0W:
		fail = __kvm_at(OP_AT_S1E0W, vaddr);
		break;
	case OP_AT_S1E1A:
		fail = __kvm_at(OP_AT_S1E1A, vaddr);
		break;
	default:
		WARN_ON_ONCE(1);
		fail = true;
		break;
	}

	if (!fail)
		par = read_sysreg_par();

	write_sysreg_hcr(HCR_HOST_VHE_FLAGS);

	if (mmu_cs)
		__mmu_config_restore(&config);

	return par;
}

static bool par_check_s1_perm_fault(u64 par)
{
	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);

	return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
		!(par & SYS_PAR_EL1_S));
}

static bool par_check_s1_access_fault(u64 par)
{
	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);

	return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS &&
		!(par & SYS_PAR_EL1_S));
}

int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
	int ret;

	/*
	 * If PAR_EL1 reports that AT failed on a S1 permission or access
	 * fault, we know for sure that the PTW was able to walk the S1
	 * tables and there's nothing else to do.
	 *
	 * If AT failed for any other reason, then we must walk the guest S1
	 * to emulate the instruction.
	 */
	if ((par & SYS_PAR_EL1_F) &&
	    !par_check_s1_perm_fault(par) &&
	    !par_check_s1_access_fault(par)) {
		ret = handle_at_slow(vcpu, op, vaddr, &par);
		if (ret)
			return ret;
	}

	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
	return 0;
}
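/*
 * AT S1E2* is replayed below as an S1E1* access: while the vcpu runs,
 * the guest's EL2 translation registers are loaded in the hardware EL1
 * ones, and setting NV/NV1 (for a guest hypervisor running without
 * E2H) makes the EL1&0 regime apply EL2-style walk semantics.
 */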
int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	u64 par;
	int ret;

	/*
	 * We've trapped, so everything is live on the CPU. As we will be
	 * switching context behind everybody's back, disable interrupts...
	 */
	scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) {
		u64 val, hcr;
		bool fail;

		val = hcr = read_sysreg(hcr_el2);
		val &= ~HCR_TGE;
		val |= HCR_VM;

		if (!vcpu_el2_e2h_is_set(vcpu))
			val |= HCR_NV | HCR_NV1;

		write_sysreg_hcr(val);
		isb();

		par = SYS_PAR_EL1_F;

		switch (op) {
		case OP_AT_S1E2R:
			fail = __kvm_at(OP_AT_S1E1R, vaddr);
			break;
		case OP_AT_S1E2W:
			fail = __kvm_at(OP_AT_S1E1W, vaddr);
			break;
		case OP_AT_S1E2A:
			fail = __kvm_at(OP_AT_S1E1A, vaddr);
			break;
		default:
			WARN_ON_ONCE(1);
			fail = true;
		}

		if (!fail)
			par = read_sysreg_par();

		write_sysreg_hcr(hcr);
		isb();
	}

	/* We failed the translation, let's replay it in slow motion */
	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) {
		ret = handle_at_slow(vcpu, op, vaddr, &par);
		if (ret)
			return ret;
	}

	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
	return 0;
}
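/*
 * AT S12E* composes both stages: reuse the S1E* emulation to obtain
 * the stage-1 result, and if stage 2 is in use, walk the shadow S2
 * with the resulting IPA and merge both outcomes into PAR_EL1.
 */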
int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	struct kvm_s2_trans out = {};
	u64 ipa, par;
	bool write;
	int ret;

	/* Do the stage-1 translation */
	switch (op) {
	case OP_AT_S12E1R:
		op = OP_AT_S1E1R;
		write = false;
		break;
	case OP_AT_S12E1W:
		op = OP_AT_S1E1W;
		write = true;
		break;
	case OP_AT_S12E0R:
		op = OP_AT_S1E0R;
		write = false;
		break;
	case OP_AT_S12E0W:
		op = OP_AT_S1E0W;
		write = true;
		break;
	default:
		WARN_ON_ONCE(1);
		return 0;
	}

	__kvm_at_s1e01(vcpu, op, vaddr);
	par = vcpu_read_sys_reg(vcpu, PAR_EL1);
	if (par & SYS_PAR_EL1_F)
		return 0;

	/*
	 * If we only have a single stage of translation (EL2&0), exit
	 * early. Same thing if {VM,DC}=={0,0}.
	 */
	if (compute_translation_regime(vcpu, op) == TR_EL20 ||
	    !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
		return 0;

	/* Do the stage-2 translation */
	ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
	out.esr = 0;
	ret = kvm_walk_nested_s2(vcpu, ipa, &out);
	if (ret < 0)
		return ret;

	/* Check the access permission */
	if (!out.esr &&
	    ((!write && !out.readable) || (write && !out.writable)))
		out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);

	par = compute_par_s12(vcpu, par, &out);
	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
	return 0;
}

/*
 * Translate a VA for a given EL in a given translation regime, with
 * or without PAN. This requires wi->{regime, as_el0, pan} to be
 * set. The rest of the wi and wr should be 0-initialised.
 */
int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
		       struct s1_walk_result *wr, u64 va)
{
	int ret;

	ret = setup_s1_walk(vcpu, wi, wr, va);
	if (ret)
		return ret;

	if (wr->level == S1_MMU_DISABLED) {
		wr->ur = wr->uw = wr->ux = true;
		wr->pr = wr->pw = wr->px = true;
	} else {
		ret = walk_s1(vcpu, wi, wr, va);
		if (ret)
			return ret;

		compute_s1_permissions(vcpu, wi, wr);
	}

	return 0;
}

struct desc_match {
	u64 ipa;
	int level;
};

static int match_s1_desc(struct s1_walk_context *ctxt, void *priv)
{
	struct desc_match *dm = priv;
	u64 ipa = dm->ipa;

	/* Use S1 granule alignment */
	ipa &= GENMASK(51, ctxt->wi->pgshift);

	/* Not the IPA we're looking for? Continue. */
	if (ipa != ctxt->table_ipa)
		return 0;

	/* Note the level and interrupt the walk */
	dm->level = ctxt->level;
	return -EINTR;
}

int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level)
{
	struct desc_match dm = {
		.ipa = ipa,
	};
	struct s1_walk_info wi = {
		.filter = &(struct s1_walk_filter){
			.fn = match_s1_desc,
			.priv = &dm,
		},
		.as_el0 = false,
		.pan = false,
	};
	struct s1_walk_result wr = {};
	int ret;

	if (is_hyp_ctxt(vcpu))
		wi.regime = vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
	else
		wi.regime = TR_EL10;

	ret = setup_s1_walk(vcpu, &wi, &wr, va);
	if (ret)
		return ret;

	/* We really expect the S1 MMU to be on here... */
	if (WARN_ON_ONCE(wr.level == S1_MMU_DISABLED)) {
		*level = 0;
		return 0;
	}

	/* Walk the guest's PT, looking for a match along the way */
	ret = walk_s1(vcpu, &wi, &wr, va);
	switch (ret) {
	case -EINTR:
		/* We interrupted the walk on a match, return the level */
		*level = dm.level;
		return 0;
	case 0:
		/* The walk completed, we failed to find the entry */
		return -ENOENT;
	default:
		/* Any other error... */
		return ret;
	}
}
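/*
 * Three flavours of atomic descriptor update, picked in decreasing
 * order of preference: LSUI CAST (unprivileged CAS, so no PAN
 * toggling), LSE CAS, and an exclusives-based fallback. They all
 * return -EAGAIN when the compare fails, so the caller can retry
 * the walk.
 */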
static int __lsui_swap_desc(u64 __user *ptep, u64 old, u64 new)
{
	u64 tmp = old;
	int ret = 0;

	/*
	 * Wrap LSUI instructions with uaccess_ttbr0_enable()/disable(),
	 * as PAN toggling is not required.
	 */
	uaccess_ttbr0_enable();

	asm volatile(__LSUI_PREAMBLE
		     "1:	cast	%[old], %[new], %[addr]\n"
		     "2:\n"
		     _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret])
		     : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret)
		     : [new] "r" (new)
		     : "memory");

	uaccess_ttbr0_disable();

	if (ret)
		return ret;
	if (tmp != old)
		return -EAGAIN;

	return ret;
}

static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new)
{
	u64 tmp = old;
	int ret = 0;

	uaccess_enable_privileged();

	asm volatile(__LSE_PREAMBLE
		     "1:	cas	%[old], %[new], %[addr]\n"
		     "2:\n"
		     _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret])
		     : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret)
		     : [new] "r" (new)
		     : "memory");

	uaccess_disable_privileged();

	if (ret)
		return ret;
	if (tmp != old)
		return -EAGAIN;

	return ret;
}

static int __llsc_swap_desc(u64 __user *ptep, u64 old, u64 new)
{
	int ret = 1;
	u64 tmp;

	uaccess_enable_privileged();

	asm volatile("prfm	pstl1strm, %[addr]\n"
		     "1:	ldxr	%[tmp], %[addr]\n"
		     "	sub	%[tmp], %[tmp], %[old]\n"
		     "	cbnz	%[tmp], 3f\n"
		     "2:	stlxr	%w[ret], %[new], %[addr]\n"
		     "3:\n"
		     _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret])
		     _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret])
		     : [ret] "+r" (ret), [addr] "+Q" (*ptep), [tmp] "=&r" (tmp)
		     : [old] "r" (old), [new] "r" (new)
		     : "memory");

	uaccess_disable_privileged();

	/* STLXR didn't update the descriptor, or the compare failed */
	if (ret == 1)
		return -EAGAIN;

	return ret;
}

int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new)
{
	struct kvm_memory_slot *slot;
	unsigned long hva;
	u64 __user *ptep;
	bool writable;
	int offset;
	gfn_t gfn;
	int r;

	lockdep_assert(srcu_read_lock_held(&kvm->srcu));

	gfn = ipa >> PAGE_SHIFT;
	offset = offset_in_page(ipa);
	slot = gfn_to_memslot(kvm, gfn);
	hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
	if (kvm_is_error_hva(hva))
		return -EINVAL;
	if (!writable)
		return -EPERM;

	ptep = (void __user *)hva + offset;
	if (cpus_have_final_cap(ARM64_HAS_LSUI))
		r = __lsui_swap_desc(ptep, old, new);
	else if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS))
		r = __lse_swap_desc(ptep, old, new);
	else
		r = __llsc_swap_desc(ptep, old, new);

	if (r < 0)
		return r;

	mark_page_dirty_in_slot(kvm, slot, gfn);
	return 0;
}