// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/bug.h>
#include <linux/cpu_pm.h>
#include <linux/entry-kvm.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/kvm_host.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/kmemleak.h>
#include <linux/kvm.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <linux/sched/stat.h>
#include <linux/psci.h>
#include <trace/events/kvm.h>

#define CREATE_TRACE_POINTS
#include "trace_arm.h"

#include <linux/uaccess.h>
#include <asm/ptrace.h>
#include <asm/mman.h>
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>
#include <asm/cpufeature.h>
#include <asm/virt.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_emulate.h>
#include <asm/sections.h>

#include <kvm/arm_hypercalls.h>
#include <kvm/arm_pmu.h>
#include <kvm/arm_psci.h>

static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT;
DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);

DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);

static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);

/* The VMID used in the VTTBR */
static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
static u32 kvm_next_vmid;
static DEFINE_SPINLOCK(kvm_vmid_lock);

static bool vgic_present;

static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);

int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
{
	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
}

int kvm_arch_hardware_setup(void *opaque)
{
	return 0;
}

int kvm_arch_check_processor_compat(void *opaque)
{
	return 0;
}

int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
			    struct kvm_enable_cap *cap)
{
	int r;

	if (cap->flags)
		return -EINVAL;

	switch (cap->cap) {
	case KVM_CAP_ARM_NISV_TO_USER:
		r = 0;
		kvm->arch.return_nisv_io_abort_to_user = true;
		break;
	case KVM_CAP_ARM_MTE:
		mutex_lock(&kvm->lock);
		if (!system_supports_mte() || kvm->created_vcpus) {
			r = -EINVAL;
		} else {
			r = 0;
			kvm->arch.mte_enabled = true;
		}
		mutex_unlock(&kvm->lock);
		break;
	default:
		r = -EINVAL;
		break;
	}

	return r;
}

static int kvm_arm_default_max_vcpus(void)
{
	return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
}

static void set_default_spectre(struct kvm *kvm)
{
	/*
	 * The default is to expose CSV2 == 1 if the HW isn't affected.
	 * Although this is a per-CPU feature, we make it global because
	 * asymmetric systems are just a nuisance.
	 *
	 * Userspace can override this as long as it doesn't promise
	 * the impossible.
	 */
	if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED)
		kvm->arch.pfr0_csv2 = 1;
	if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED)
		kvm->arch.pfr0_csv3 = 1;
}

/**
 * kvm_arch_init_vm - initializes a VM data structure
 * @kvm:	pointer to the KVM struct
 */
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
	int ret;

	ret = kvm_arm_setup_stage2(kvm, type);
	if (ret)
		return ret;

	ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu);
	if (ret)
		return ret;

	ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP);
	if (ret)
		goto out_free_stage2_pgd;

	kvm_vgic_early_init(kvm);

	/* The maximum number of VCPUs is limited by the host's GIC model */
	kvm->arch.max_vcpus = kvm_arm_default_max_vcpus();

	set_default_spectre(kvm);

	return ret;
out_free_stage2_pgd:
	kvm_free_stage2_pgd(&kvm->arch.mmu);
	return ret;
}

vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;
}


/**
 * kvm_arch_destroy_vm - destroy the VM data structure
 * @kvm:	pointer to the KVM struct
 */
void kvm_arch_destroy_vm(struct kvm *kvm)
{
	int i;

	bitmap_free(kvm->arch.pmu_filter);

	kvm_vgic_destroy(kvm);

	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		if (kvm->vcpus[i]) {
			kvm_vcpu_destroy(kvm->vcpus[i]);
			kvm->vcpus[i] = NULL;
		}
	}
	atomic_set(&kvm->online_vcpus, 0);
}

int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
	int r;
	switch (ext) {
	case KVM_CAP_IRQCHIP:
		r = vgic_present;
		break;
	case KVM_CAP_IOEVENTFD:
	case KVM_CAP_DEVICE_CTRL:
	case KVM_CAP_USER_MEMORY:
	case KVM_CAP_SYNC_MMU:
	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
	case KVM_CAP_ONE_REG:
	case KVM_CAP_ARM_PSCI:
	case KVM_CAP_ARM_PSCI_0_2:
	case KVM_CAP_READONLY_MEM:
	case KVM_CAP_MP_STATE:
	case KVM_CAP_IMMEDIATE_EXIT:
	case KVM_CAP_VCPU_EVENTS:
	case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2:
	case KVM_CAP_ARM_NISV_TO_USER:
	case KVM_CAP_ARM_INJECT_EXT_DABT:
	case KVM_CAP_SET_GUEST_DEBUG:
	case KVM_CAP_VCPU_ATTRIBUTES:
	case KVM_CAP_PTP_KVM:
		r = 1;
		break;
	case KVM_CAP_SET_GUEST_DEBUG2:
		return KVM_GUESTDBG_VALID_MASK;
	case KVM_CAP_ARM_SET_DEVICE_ADDR:
		r = 1;
		break;
	case KVM_CAP_NR_VCPUS:
		/*
		 * ARM64 treats KVM_CAP_NR_VCPUS differently from all other
		 * architectures, as it does not always bound it to
		 * KVM_CAP_MAX_VCPUS. It should not matter much because
		 * this is just an advisory value.
		 */
		r = min_t(unsigned int, num_online_cpus(),
			  kvm_arm_default_max_vcpus());
		break;
	case KVM_CAP_MAX_VCPUS:
	case KVM_CAP_MAX_VCPU_ID:
		if (kvm)
			r = kvm->arch.max_vcpus;
		else
			r = kvm_arm_default_max_vcpus();
		break;
	case KVM_CAP_MSI_DEVID:
		if (!kvm)
			r = -EINVAL;
		else
			r = kvm->arch.vgic.msis_require_devid;
		break;
	case KVM_CAP_ARM_USER_IRQ:
		/*
		 * 1: EL1_VTIMER, EL1_PTIMER, and PMU.
		 * (bump this number if adding more devices)
		 */
		r = 1;
		break;
	case KVM_CAP_ARM_MTE:
		r = system_supports_mte();
		break;
	case KVM_CAP_STEAL_TIME:
		r = kvm_arm_pvtime_supported();
		break;
	case KVM_CAP_ARM_EL1_32BIT:
		r = cpus_have_const_cap(ARM64_HAS_32BIT_EL1);
		break;
	case KVM_CAP_GUEST_DEBUG_HW_BPS:
		r = get_num_brps();
		break;
	case KVM_CAP_GUEST_DEBUG_HW_WPS:
		r = get_num_wrps();
		break;
	case KVM_CAP_ARM_PMU_V3:
		r = kvm_arm_support_pmu_v3();
		break;
	case KVM_CAP_ARM_INJECT_SERROR_ESR:
		r = cpus_have_const_cap(ARM64_HAS_RAS_EXTN);
		break;
	case KVM_CAP_ARM_VM_IPA_SIZE:
		r = get_kvm_ipa_limit();
		break;
	case KVM_CAP_ARM_SVE:
		r = system_supports_sve();
		break;
	case KVM_CAP_ARM_PTRAUTH_ADDRESS:
	case KVM_CAP_ARM_PTRAUTH_GENERIC:
		r = system_has_full_ptr_auth();
		break;
	default:
		r = 0;
	}

	return r;
}

long kvm_arch_dev_ioctl(struct file *filp,
			unsigned int ioctl, unsigned long arg)
{
	return -EINVAL;
}

struct kvm *kvm_arch_alloc_vm(void)
{
	size_t sz = sizeof(struct kvm);

	if (!has_vhe())
		return kzalloc(sz, GFP_KERNEL_ACCOUNT);

	return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO);
}

int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
	if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
		return -EBUSY;

	if (id >= kvm->arch.max_vcpus)
		return -EINVAL;

	return 0;
}

int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
{
	int err;

	/* Force users to call KVM_ARM_VCPU_INIT */
	vcpu->arch.target = -1;
	bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);

	vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;

	/* Set up the timer */
	kvm_timer_vcpu_init(vcpu);

	kvm_pmu_vcpu_init(vcpu);

	kvm_arm_reset_debug_ptr(vcpu);

	kvm_arm_pvtime_vcpu_init(&vcpu->arch);

	vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;

	err = kvm_vgic_vcpu_init(vcpu);
	if (err)
		return err;

	return create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
}

void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
}

void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
		static_branch_dec(&userspace_irqchip_in_use);

	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
	kvm_timer_vcpu_terminate(vcpu);
	kvm_pmu_vcpu_destroy(vcpu);

	kvm_arm_vcpu_destroy(vcpu);
}

int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
{
	return kvm_timer_is_pending(vcpu);
}

void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
{
	/*
	 * If we're about to block (most likely because we've just hit a
	 * WFI), we need to sync back the state of the GIC CPU interface
	 * so that we have the latest PMR and group enables. This ensures
	 * that kvm_arch_vcpu_runnable has up-to-date data to decide
	 * whether we have pending interrupts.
	 *
	 * For the same reason, we want to tell GICv4 that we need
	 * doorbells to be signalled, should an interrupt become pending.
	 */
	preempt_disable();
	kvm_vgic_vmcr_sync(vcpu);
	vgic_v4_put(vcpu, true);
	preempt_enable();
}

void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	vgic_v4_load(vcpu);
	preempt_enable();
}

void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct kvm_s2_mmu *mmu;
	int *last_ran;

	mmu = vcpu->arch.hw_mmu;
	last_ran = this_cpu_ptr(mmu->last_vcpu_ran);

	/*
	 * We guarantee that both TLBs and I-cache are private to each
	 * vcpu. If detecting that a vcpu from the same VM has
	 * previously run on the same physical CPU, call into the
	 * hypervisor code to nuke the relevant contexts.
	 *
	 * We might get preempted before the vCPU actually runs, but
	 * over-invalidation doesn't affect correctness.
	 */
	if (*last_ran != vcpu->vcpu_id) {
		kvm_call_hyp(__kvm_flush_cpu_context, mmu);
		*last_ran = vcpu->vcpu_id;
	}

	vcpu->cpu = cpu;

	kvm_vgic_load(vcpu);
	kvm_timer_vcpu_load(vcpu);
	if (has_vhe())
		kvm_vcpu_load_sysregs_vhe(vcpu);
	kvm_arch_vcpu_load_fp(vcpu);
	kvm_vcpu_pmu_restore_guest(vcpu);
	if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
		kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);

	if (single_task_running())
		vcpu_clear_wfx_traps(vcpu);
	else
		vcpu_set_wfx_traps(vcpu);

	if (vcpu_has_ptrauth(vcpu))
		vcpu_ptrauth_disable(vcpu);
	kvm_arch_vcpu_load_debug_state_flags(vcpu);
}

void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
	kvm_arch_vcpu_put_debug_state_flags(vcpu);
	kvm_arch_vcpu_put_fp(vcpu);
	if (has_vhe())
		kvm_vcpu_put_sysregs_vhe(vcpu);
	kvm_timer_vcpu_put(vcpu);
	kvm_vgic_put(vcpu);
	kvm_vcpu_pmu_restore_host(vcpu);

	vcpu->cpu = -1;
}

static void vcpu_power_off(struct kvm_vcpu *vcpu)
{
	vcpu->arch.power_off = true;
	kvm_make_request(KVM_REQ_SLEEP, vcpu);
	kvm_vcpu_kick(vcpu);
}

int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
				    struct kvm_mp_state *mp_state)
{
	if (vcpu->arch.power_off)
		mp_state->mp_state = KVM_MP_STATE_STOPPED;
	else
		mp_state->mp_state = KVM_MP_STATE_RUNNABLE;

	return 0;
}

int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
				    struct kvm_mp_state *mp_state)
{
	int ret = 0;

	switch (mp_state->mp_state) {
	case KVM_MP_STATE_RUNNABLE:
		vcpu->arch.power_off = false;
		break;
	case KVM_MP_STATE_STOPPED:
		vcpu_power_off(vcpu);
		break;
	default:
		ret = -EINVAL;
	}

	return ret;
}

/**
 * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled
 * @v:		The VCPU pointer
 *
 * If the guest CPU is not waiting for interrupts or an interrupt line is
 * asserted, the CPU is by definition runnable.
 */
int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
{
	bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
	return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
		&& !v->arch.power_off && !v->arch.pause);
}

bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
{
	return vcpu_mode_priv(vcpu);
}

/* Just ensure a guest exit from a particular CPU */
static void exit_vm_noop(void *info)
{
}

void force_vm_exit(const cpumask_t *mask)
{
	preempt_disable();
	smp_call_function_many(mask, exit_vm_noop, NULL, true);
	preempt_enable();
}

/**
 * need_new_vmid_gen - check that the VMID is still valid
 * @vmid: The VMID to check
 *
 * return true if there is a new generation of VMIDs being used
 *
 * The hardware supports a limited set of values with the value zero reserved
 * for the host, so we check if an assigned value belongs to a previous
 * generation, which requires us to assign a new value. If we're the first to
 * use a VMID for the new generation, we must flush necessary caches and TLBs
 * on all CPUs.
 */
static bool need_new_vmid_gen(struct kvm_vmid *vmid)
{
	u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen);
	smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */
	return unlikely(READ_ONCE(vmid->vmid_gen) != current_vmid_gen);
}

/**
 * update_vmid - Update the vmid with a valid VMID for the current generation
 * @vmid: The stage-2 VMID information struct
 */
static void update_vmid(struct kvm_vmid *vmid)
{
	if (!need_new_vmid_gen(vmid))
		return;

	spin_lock(&kvm_vmid_lock);

	/*
	 * We need to re-check the vmid_gen here to ensure that if another vcpu
	 * already allocated a valid vmid for this vm, then this vcpu should
	 * use the same vmid.
	 */
	if (!need_new_vmid_gen(vmid)) {
		spin_unlock(&kvm_vmid_lock);
		return;
	}

	/* First user of a new VMID generation? */
	if (unlikely(kvm_next_vmid == 0)) {
		atomic64_inc(&kvm_vmid_gen);
		kvm_next_vmid = 1;

		/*
		 * On SMP we know no other CPUs can use this CPU's or each
		 * other's VMID after force_vm_exit returns since the
		 * kvm_vmid_lock blocks them from reentry to the guest.
		 */
		force_vm_exit(cpu_all_mask);
		/*
		 * Now broadcast TLB + ICACHE invalidation over the inner
		 * shareable domain to make sure all data structures are
		 * clean.
		 */
		kvm_call_hyp(__kvm_flush_vm_context);
	}

	WRITE_ONCE(vmid->vmid, kvm_next_vmid);
	kvm_next_vmid++;
	kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1;

	smp_wmb();
	WRITE_ONCE(vmid->vmid_gen, atomic64_read(&kvm_vmid_gen));

	spin_unlock(&kvm_vmid_lock);
}

static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;
	int ret = 0;

	if (likely(vcpu->arch.has_run_once))
		return 0;

	if (!kvm_arm_vcpu_is_finalized(vcpu))
		return -EPERM;

	vcpu->arch.has_run_once = true;

	kvm_arm_vcpu_init_debug(vcpu);

	if (likely(irqchip_in_kernel(kvm))) {
		/*
		 * Map the VGIC hardware resources before running a vcpu the
		 * first time on this VM.
		 */
		ret = kvm_vgic_map_resources(kvm);
		if (ret)
			return ret;
	} else {
		/*
		 * Tell the rest of the code that there are userspace irqchip
		 * VMs in the wild.
		 */
		static_branch_inc(&userspace_irqchip_in_use);
	}

	ret = kvm_timer_enable(vcpu);
	if (ret)
		return ret;

	ret = kvm_arm_pmu_v3_enable(vcpu);

	/*
	 * Initialize traps for protected VMs.
	 * NOTE: Move to run in EL2 directly, rather than via a hypercall, once
	 * the code is in place for first run initialization at EL2.
	 */
	if (kvm_vm_is_protected(kvm))
		kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu);

	return ret;
}

bool kvm_arch_intc_initialized(struct kvm *kvm)
{
	return vgic_initialized(kvm);
}

void kvm_arm_halt_guest(struct kvm *kvm)
{
	int i;
	struct kvm_vcpu *vcpu;

	kvm_for_each_vcpu(i, vcpu, kvm)
		vcpu->arch.pause = true;
	kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP);
}

void kvm_arm_resume_guest(struct kvm *kvm)
{
	int i;
	struct kvm_vcpu *vcpu;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		vcpu->arch.pause = false;
		rcuwait_wake_up(kvm_arch_vcpu_get_wait(vcpu));
	}
}

static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
{
	struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);

	rcuwait_wait_event(wait,
			   (!vcpu->arch.power_off) && (!vcpu->arch.pause),
			   TASK_INTERRUPTIBLE);

	if (vcpu->arch.power_off || vcpu->arch.pause) {
		/* Awaken to handle a signal, request we sleep again later. */
		kvm_make_request(KVM_REQ_SLEEP, vcpu);
	}

	/*
	 * Make sure we will observe a potential reset request if we've
	 * observed a change to the power state. Pairs with the smp_wmb() in
	 * kvm_psci_vcpu_on().
	 */
	smp_rmb();
}

static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.target >= 0;
}

static void check_vcpu_requests(struct kvm_vcpu *vcpu)
{
	if (kvm_request_pending(vcpu)) {
		if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
			vcpu_req_sleep(vcpu);

		if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
			kvm_reset_vcpu(vcpu);

		/*
		 * Clear IRQ_PENDING requests that were made to guarantee
		 * that a VCPU sees new virtual interrupts.
		 */
		kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);

		if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
			kvm_update_stolen_time(vcpu);

		if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
			/* The distributor enable bits were changed */
			preempt_disable();
			vgic_v4_put(vcpu, false);
			vgic_v4_load(vcpu);
			preempt_enable();
		}

		if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
			kvm_pmu_handle_pmcr(vcpu,
					    __vcpu_sys_reg(vcpu, PMCR_EL0));
	}
}

static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu)
{
	if (likely(!vcpu_mode_is_32bit(vcpu)))
		return false;

	return !system_supports_32bit_el0() ||
		static_branch_unlikely(&arm64_mismatched_32bit_el0);
}

/**
 * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest
 * @vcpu:	The VCPU pointer
 * @ret:	Pointer to write optional return code
 *
 * Returns: true if the VCPU needs to return to a preemptible + interruptible
 *	    kernel context and skip guest entry.
 *
 * This function disambiguates between two different types of exits: exits to a
 * preemptible + interruptible kernel context and exits to userspace. For an
 * exit to userspace, this function will write the return code to ret and return
 * true. For an exit to preemptible + interruptible kernel context (i.e. check
 * for pending work and re-enter), return true without writing to ret.
 */
static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret)
{
	struct kvm_run *run = vcpu->run;

	/*
	 * If we're using a userspace irqchip, then check if we need
	 * to tell a userspace irqchip about timer or PMU level
	 * changes and if so, exit to userspace (the actual level
	 * state gets updated in kvm_timer_update_run and
	 * kvm_pmu_update_run below).
	 */
	if (static_branch_unlikely(&userspace_irqchip_in_use)) {
		if (kvm_timer_should_notify_user(vcpu) ||
		    kvm_pmu_should_notify_user(vcpu)) {
			*ret = -EINTR;
			run->exit_reason = KVM_EXIT_INTR;
			return true;
		}
	}

	return kvm_request_pending(vcpu) ||
			need_new_vmid_gen(&vcpu->arch.hw_mmu->vmid) ||
			xfer_to_guest_mode_work_pending();
}

/**
 * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
 * @vcpu:	The VCPU pointer
 *
 * This function is called through the VCPU_RUN ioctl from user space. It
 * executes VM code in a loop until the time slice for the process is used up
 * or until some emulation is needed from user space, in which case the
 * function returns 0 with the kvm_run structure filled in with the required
 * data for the requested emulation.
 */
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
	struct kvm_run *run = vcpu->run;
	int ret;

	if (unlikely(!kvm_vcpu_initialized(vcpu)))
		return -ENOEXEC;

	ret = kvm_vcpu_first_run_init(vcpu);
	if (ret)
		return ret;

	if (run->exit_reason == KVM_EXIT_MMIO) {
		ret = kvm_handle_mmio_return(vcpu);
		if (ret)
			return ret;
	}

	vcpu_load(vcpu);

	if (run->immediate_exit) {
		ret = -EINTR;
		goto out;
	}

	kvm_sigset_activate(vcpu);

	ret = 1;
	run->exit_reason = KVM_EXIT_UNKNOWN;
	while (ret > 0) {
		/*
		 * Check conditions before entering the guest
		 */
		ret = xfer_to_guest_mode_handle_work(vcpu);
		if (!ret)
			ret = 1;

		update_vmid(&vcpu->arch.hw_mmu->vmid);

		check_vcpu_requests(vcpu);

		/*
		 * Preparing the interrupts to be injected also
		 * involves poking the GIC, which must be done in a
		 * non-preemptible context.
		 */
		preempt_disable();

		kvm_pmu_flush_hwstate(vcpu);

		local_irq_disable();

		kvm_vgic_flush_hwstate(vcpu);

		/*
		 * Ensure we set mode to IN_GUEST_MODE after we disable
		 * interrupts and before the final VCPU requests check.
		 * See the comment in kvm_vcpu_exiting_guest_mode() and
		 * Documentation/virt/kvm/vcpu-requests.rst
		 */
		smp_store_mb(vcpu->mode, IN_GUEST_MODE);

		if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) {
			vcpu->mode = OUTSIDE_GUEST_MODE;
			isb(); /* Ensure work in x_flush_hwstate is committed */
			kvm_pmu_sync_hwstate(vcpu);
			if (static_branch_unlikely(&userspace_irqchip_in_use))
				kvm_timer_sync_user(vcpu);
			kvm_vgic_sync_hwstate(vcpu);
			local_irq_enable();
			preempt_enable();
			continue;
		}

		kvm_arm_setup_debug(vcpu);

		/**************************************************************
		 * Enter the guest
		 */
		trace_kvm_entry(*vcpu_pc(vcpu));
		guest_enter_irqoff();

		ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu);

		vcpu->mode = OUTSIDE_GUEST_MODE;
		vcpu->stat.exits++;
		/*
		 * Back from guest
		 *************************************************************/

		kvm_arm_clear_debug(vcpu);

		/*
		 * We must sync the PMU state before the vgic state so
		 * that the vgic can properly sample the updated state of the
		 * interrupt line.
		 */
		kvm_pmu_sync_hwstate(vcpu);

		/*
		 * Sync the vgic state before syncing the timer state because
		 * the timer code needs to know if the virtual timer
		 * interrupts are active.
		 */
		kvm_vgic_sync_hwstate(vcpu);

		/*
		 * Sync the timer hardware state before enabling interrupts as
		 * we don't want vtimer interrupts to race with syncing the
		 * timer virtual interrupt state.
		 */
		if (static_branch_unlikely(&userspace_irqchip_in_use))
			kvm_timer_sync_user(vcpu);

		kvm_arch_vcpu_ctxsync_fp(vcpu);

		/*
		 * We may have taken a host interrupt in HYP mode (i.e.
		 * while executing the guest). This interrupt is still
		 * pending, as we haven't serviced it yet!
		 *
		 * We're now back in SVC mode, with interrupts
		 * disabled. Enabling the interrupts now will have
		 * the effect of taking the interrupt again, in SVC
		 * mode this time.
		 */
		local_irq_enable();

		/*
		 * We do local_irq_enable() before calling guest_exit() so
		 * that if a timer interrupt hits while running the guest we
		 * account that tick as being spent in the guest. We enable
		 * preemption after calling guest_exit() so that if we get
		 * preempted we make sure ticks after that are not counted as
		 * guest time.
		 */
		guest_exit();
		trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));

		/* Exit types that need handling before we can be preempted */
		handle_exit_early(vcpu, ret);

		preempt_enable();

		/*
		 * The ARMv8 architecture doesn't give the hypervisor
		 * a mechanism to prevent a guest from dropping to AArch32 EL0
		 * if implemented by the CPU. If we spot the guest in such a
		 * state and decide it wasn't supposed to do so (like
		 * with the asymmetric AArch32 case), return to userspace with
		 * a fatal error.
		 */
		if (vcpu_mode_is_bad_32bit(vcpu)) {
			/*
			 * As we have caught the guest red-handed, decide that
			 * it isn't fit for purpose anymore by making the vcpu
			 * invalid. The VMM can try and fix it by issuing a
			 * KVM_ARM_VCPU_INIT if it really wants to.
			 */
			vcpu->arch.target = -1;
			ret = ARM_EXCEPTION_IL;
		}

		ret = handle_exit(vcpu, ret);
	}

	/* Tell userspace about in-kernel device output levels */
	if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
		kvm_timer_update_run(vcpu);
		kvm_pmu_update_run(vcpu);
	}

	kvm_sigset_deactivate(vcpu);

out:
	/*
	 * In the unlikely event that we are returning to userspace
	 * with pending exceptions or PC adjustment, commit these
	 * adjustments in order to give userspace a consistent view of
	 * the vcpu state. Note that this relies on __kvm_adjust_pc()
	 * being preempt-safe on VHE.
	 */
	if (unlikely(vcpu->arch.flags & (KVM_ARM64_PENDING_EXCEPTION |
					 KVM_ARM64_INCREMENT_PC)))
		kvm_call_hyp(__kvm_adjust_pc, vcpu);

	vcpu_put(vcpu);
	return ret;
}

static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
{
	int bit_index;
	bool set;
	unsigned long *hcr;

	if (number == KVM_ARM_IRQ_CPU_IRQ)
		bit_index = __ffs(HCR_VI);
	else /* KVM_ARM_IRQ_CPU_FIQ */
		bit_index = __ffs(HCR_VF);

	hcr = vcpu_hcr(vcpu);
	if (level)
		set = test_and_set_bit(bit_index, hcr);
	else
		set = test_and_clear_bit(bit_index, hcr);

	/*
	 * If we didn't change anything, no need to wake up or kick other CPUs
	 */
	if (set == level)
		return 0;

	/*
	 * The vcpu irq_lines field was updated, wake up sleeping VCPUs and
	 * trigger a world-switch round on the running physical CPU to set the
	 * virtual IRQ/FIQ fields in the HCR appropriately.
	 */
	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
	kvm_vcpu_kick(vcpu);

	return 0;
}

int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
			  bool line_status)
{
	u32 irq = irq_level->irq;
	unsigned int irq_type, vcpu_idx, irq_num;
	int nrcpus = atomic_read(&kvm->online_vcpus);
	struct kvm_vcpu *vcpu = NULL;
	bool level = irq_level->level;

	irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
	vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
	vcpu_idx += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1);
	irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK;

	trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level);

	switch (irq_type) {
	case KVM_ARM_IRQ_TYPE_CPU:
		if (irqchip_in_kernel(kvm))
			return -ENXIO;

		if (vcpu_idx >= nrcpus)
			return -EINVAL;

		vcpu = kvm_get_vcpu(kvm, vcpu_idx);
		if (!vcpu)
			return -EINVAL;

		if (irq_num > KVM_ARM_IRQ_CPU_FIQ)
			return -EINVAL;

		return vcpu_interrupt_line(vcpu, irq_num, level);
	case KVM_ARM_IRQ_TYPE_PPI:
		if (!irqchip_in_kernel(kvm))
			return -ENXIO;

		if (vcpu_idx >= nrcpus)
			return -EINVAL;

		vcpu = kvm_get_vcpu(kvm, vcpu_idx);
		if (!vcpu)
			return -EINVAL;

		if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
			return -EINVAL;

		return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL);
	case KVM_ARM_IRQ_TYPE_SPI:
		if (!irqchip_in_kernel(kvm))
			return -ENXIO;

		if (irq_num < VGIC_NR_PRIVATE_IRQS)
			return -EINVAL;

		return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL);
	}

	return -EINVAL;
}

static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
			       const struct kvm_vcpu_init *init)
{
	unsigned int i, ret;
	u32 phys_target = kvm_target_cpu();

	if (init->target != phys_target)
		return -EINVAL;

	/*
	 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
	 * use the same target.
	 */
	if (vcpu->arch.target != -1 && vcpu->arch.target != init->target)
		return -EINVAL;

	/* -ENOENT for unknown features, -EINVAL for invalid combinations. */
	for (i = 0; i < sizeof(init->features) * 8; i++) {
		bool set = (init->features[i / 32] & (1 << (i % 32)));

		if (set && i >= KVM_VCPU_MAX_FEATURES)
			return -ENOENT;

		/*
		 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
		 * use the same feature set.
		 */
		if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES &&
		    test_bit(i, vcpu->arch.features) != set)
			return -EINVAL;

		if (set)
			set_bit(i, vcpu->arch.features);
	}

	vcpu->arch.target = phys_target;

	/* Now we know what it is, we can reset it. */
	ret = kvm_reset_vcpu(vcpu);
	if (ret) {
		vcpu->arch.target = -1;
		bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
	}

	return ret;
}

static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
					 struct kvm_vcpu_init *init)
{
	int ret;

	ret = kvm_vcpu_set_target(vcpu, init);
	if (ret)
		return ret;

	/*
	 * Ensure a rebooted VM will fault in RAM pages and detect if the
	 * guest MMU is turned off and flush the caches as needed.
	 *
	 * S2FWB enforces all memory accesses to RAM being cacheable,
	 * ensuring that the data side is always coherent. We still
	 * need to invalidate the I-cache though, as FWB does *not*
	 * imply CTR_EL0.DIC.
	 */
	if (vcpu->arch.has_run_once) {
		if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
			stage2_unmap_vm(vcpu->kvm);
		else
			icache_inval_all_pou();
	}

	vcpu_reset_hcr(vcpu);
	vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT;

	/*
	 * Handle the "start in power-off" case.
	 */
	if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
		vcpu_power_off(vcpu);
	else
		vcpu->arch.power_off = false;

	return 0;
}

static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
				 struct kvm_device_attr *attr)
{
	int ret = -ENXIO;

	switch (attr->group) {
	default:
		ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr);
		break;
	}

	return ret;
}

static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
				 struct kvm_device_attr *attr)
{
	int ret = -ENXIO;

	switch (attr->group) {
	default:
		ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr);
		break;
	}

	return ret;
}

static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
				 struct kvm_device_attr *attr)
{
	int ret = -ENXIO;

	switch (attr->group) {
	default:
		ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr);
		break;
	}

	return ret;
}

static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
				   struct kvm_vcpu_events *events)
{
	memset(events, 0, sizeof(*events));

	return __kvm_arm_vcpu_get_events(vcpu, events);
}

static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
				   struct kvm_vcpu_events *events)
{
	int i;

	/* check whether the reserved field is zero */
	for (i = 0; i < ARRAY_SIZE(events->reserved); i++)
		if (events->reserved[i])
			return -EINVAL;

	/* check whether the pad field is zero */
	for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++)
		if (events->exception.pad[i])
			return -EINVAL;

	return __kvm_arm_vcpu_set_events(vcpu, events);
}

long kvm_arch_vcpu_ioctl(struct file *filp,
			 unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;
	struct kvm_device_attr attr;
	long r;

	switch (ioctl) {
	case KVM_ARM_VCPU_INIT: {
		struct kvm_vcpu_init init;

		r = -EFAULT;
		if (copy_from_user(&init, argp, sizeof(init)))
			break;

		r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
		break;
	}
	case KVM_SET_ONE_REG:
	case KVM_GET_ONE_REG: {
		struct kvm_one_reg reg;

		r = -ENOEXEC;
		if (unlikely(!kvm_vcpu_initialized(vcpu)))
			break;

		r = -EFAULT;
		if (copy_from_user(&reg, argp, sizeof(reg)))
			break;

		/*
		 * We could owe a reset due to PSCI. Handle the pending reset
		 * here to ensure userspace register accesses are ordered after
		 * the reset.
		 */
		if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
			kvm_reset_vcpu(vcpu);

		if (ioctl == KVM_SET_ONE_REG)
			r = kvm_arm_set_reg(vcpu, &reg);
		else
			r = kvm_arm_get_reg(vcpu, &reg);
		break;
	}
	case KVM_GET_REG_LIST: {
		struct kvm_reg_list __user *user_list = argp;
		struct kvm_reg_list reg_list;
		unsigned n;

		r = -ENOEXEC;
		if (unlikely(!kvm_vcpu_initialized(vcpu)))
			break;

		r = -EPERM;
		if (!kvm_arm_vcpu_is_finalized(vcpu))
			break;

		r = -EFAULT;
		if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
			break;
		n = reg_list.n;
		reg_list.n = kvm_arm_num_regs(vcpu);
		if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
			break;
		r = -E2BIG;
		if (n < reg_list.n)
			break;
		r = kvm_arm_copy_reg_indices(vcpu, user_list->reg);
		break;
	}
	case KVM_SET_DEVICE_ATTR: {
		r = -EFAULT;
		if (copy_from_user(&attr, argp, sizeof(attr)))
			break;
		r = kvm_arm_vcpu_set_attr(vcpu, &attr);
		break;
	}
	case KVM_GET_DEVICE_ATTR: {
		r = -EFAULT;
		if (copy_from_user(&attr, argp, sizeof(attr)))
			break;
		r = kvm_arm_vcpu_get_attr(vcpu, &attr);
		break;
	}
	case KVM_HAS_DEVICE_ATTR: {
		r = -EFAULT;
		if (copy_from_user(&attr, argp, sizeof(attr)))
			break;
		r = kvm_arm_vcpu_has_attr(vcpu, &attr);
		break;
	}
	case KVM_GET_VCPU_EVENTS: {
		struct kvm_vcpu_events events;

		if (kvm_arm_vcpu_get_events(vcpu, &events))
			return -EINVAL;

		if (copy_to_user(argp, &events, sizeof(events)))
			return -EFAULT;

		return 0;
	}
	case KVM_SET_VCPU_EVENTS: {
		struct kvm_vcpu_events events;

		if (copy_from_user(&events, argp, sizeof(events)))
			return -EFAULT;

		return kvm_arm_vcpu_set_events(vcpu, &events);
	}
	case KVM_ARM_VCPU_FINALIZE: {
		int what;

		if (!kvm_vcpu_initialized(vcpu))
			return -ENOEXEC;

		if (get_user(what, (const int __user *)argp))
			return -EFAULT;

		return kvm_arm_vcpu_finalize(vcpu, what);
	}
	default:
		r = -EINVAL;
	}

	return r;
}

void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{

}

void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
					const struct kvm_memory_slot *memslot)
{
	kvm_flush_remote_tlbs(kvm);
}

static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
					struct kvm_arm_device_addr *dev_addr)
{
	unsigned long dev_id, type;

	dev_id = (dev_addr->id & KVM_ARM_DEVICE_ID_MASK) >>
		KVM_ARM_DEVICE_ID_SHIFT;
	type = (dev_addr->id & KVM_ARM_DEVICE_TYPE_MASK) >>
		KVM_ARM_DEVICE_TYPE_SHIFT;

	switch (dev_id) {
	case KVM_ARM_DEVICE_VGIC_V2:
		if (!vgic_present)
			return -ENXIO;
		return kvm_vgic_addr(kvm, type, &dev_addr->addr, true);
	default:
		return -ENODEV;
	}
}

long kvm_arch_vm_ioctl(struct file *filp,
		       unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;

	switch (ioctl) {
	case KVM_CREATE_IRQCHIP: {
		int ret;
		if (!vgic_present)
			return -ENXIO;
		mutex_lock(&kvm->lock);
		ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
		mutex_unlock(&kvm->lock);
		return ret;
	}
	case KVM_ARM_SET_DEVICE_ADDR: {
		struct kvm_arm_device_addr dev_addr;

		if (copy_from_user(&dev_addr, argp, sizeof(dev_addr)))
			return -EFAULT;
		return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
	}
	case KVM_ARM_PREFERRED_TARGET: {
		struct kvm_vcpu_init init;

		kvm_vcpu_preferred_target(&init);

		if (copy_to_user(argp, &init, sizeof(init)))
			return -EFAULT;

		return 0;
	}
	case KVM_ARM_MTE_COPY_TAGS: {
		struct kvm_arm_copy_mte_tags copy_tags;

		if (copy_from_user(&copy_tags, argp, sizeof(copy_tags)))
			return -EFAULT;
		return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags);
	}
	default:
		return -EINVAL;
	}
}

static unsigned long nvhe_percpu_size(void)
{
	return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
		(unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start);
}

static unsigned long nvhe_percpu_order(void)
{
	unsigned long size = nvhe_percpu_size();

	return size ? get_order(size) : 0;
}

/* A lookup table holding the hypervisor VA for each vector slot */
static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS];

static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot)
{
	hyp_spectre_vector_selector[slot] = __kvm_vector_slot2addr(base, slot);
}

static int kvm_init_vector_slots(void)
{
	int err;
	void *base;

	base = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
	kvm_init_vector_slot(base, HYP_VECTOR_DIRECT);

	base = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs));
	kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_DIRECT);

	if (!cpus_have_const_cap(ARM64_SPECTRE_V3A))
		return 0;

	if (!has_vhe()) {
		err = create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs),
					       __BP_HARDEN_HYP_VECS_SZ, &base);
		if (err)
			return err;
	}

	kvm_init_vector_slot(base, HYP_VECTOR_INDIRECT);
	kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_INDIRECT);
	return 0;
}

static void cpu_prepare_hyp_mode(int cpu)
{
	struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
	unsigned long tcr;

	/*
	 * Calculate the raw per-cpu offset without a translation from the
	 * kernel's mapping to the linear mapping, and store it in tpidr_el2
	 * so that we can use adr_l to access per-cpu variables in EL2.
	 * Also drop the KASAN tag which gets in the way...
	 */
	params->tpidr_el2 = (unsigned long)kasan_reset_tag(per_cpu_ptr_nvhe_sym(__per_cpu_start, cpu)) -
			    (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));

	params->mair_el2 = read_sysreg(mair_el1);

	/*
	 * The ID map may be configured to use an extended virtual address
	 * range. This is only the case if system RAM is out of range for the
	 * currently configured page size and VA_BITS, in which case we will
	 * also need the extended virtual range for the HYP ID map, or we won't
	 * be able to enable the EL2 MMU.
	 *
	 * However, at EL2, there is only one TTBR register, and we can't switch
	 * between translation tables *and* update TCR_EL2.T0SZ at the same
	 * time. Bottom line: we need to use the extended range with *both* our
	 * translation tables.
	 *
	 * So use the same T0SZ value we use for the ID map.
	 */
	tcr = (read_sysreg(tcr_el1) & TCR_EL2_MASK) | TCR_EL2_RES1;
	tcr &= ~TCR_T0SZ_MASK;
	tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET;
	params->tcr_el2 = tcr;

	params->stack_hyp_va = kern_hyp_va(per_cpu(kvm_arm_hyp_stack_page, cpu) + PAGE_SIZE);
	params->pgd_pa = kvm_mmu_get_httbr();
	if (is_protected_kvm_enabled())
		params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
	else
		params->hcr_el2 = HCR_HOST_NVHE_FLAGS;
	params->vttbr = params->vtcr = 0;

	/*
	 * Flush the init params from the data cache because the struct will
	 * be read while the MMU is off.
	 */
	kvm_flush_dcache_to_poc(params, sizeof(*params));
}

static void hyp_install_host_vector(void)
{
	struct kvm_nvhe_init_params *params;
	struct arm_smccc_res res;

	/* Switch from the HYP stub to our own HYP init vector */
	__hyp_set_vectors(kvm_get_idmap_vector());

	/*
	 * Call initialization code, and switch to the full blown HYP code.
	 * If the cpucaps haven't been finalized yet, something has gone very
	 * wrong, and hyp will crash and burn when it uses any
	 * cpus_have_const_cap() wrapper.
	 */
	BUG_ON(!system_capabilities_finalized());
	params = this_cpu_ptr_nvhe_sym(kvm_init_params);
	arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res);
	WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
}

static void cpu_init_hyp_mode(void)
{
	hyp_install_host_vector();

	/*
	 * Disabling SSBD on a non-VHE system requires us to enable SSBS
	 * at EL2.
	 */
	if (this_cpu_has_cap(ARM64_SSBS) &&
	    arm64_get_spectre_v4_state() == SPECTRE_VULNERABLE) {
		kvm_call_hyp_nvhe(__kvm_enable_ssbs);
	}
}

static void cpu_hyp_reset(void)
{
	if (!is_kernel_in_hyp_mode())
		__hyp_reset_vectors();
}

/*
 * EL2 vectors can be mapped and rerouted in a number of ways,
 * depending on the kernel configuration and CPU present:
 *
 * - If the CPU is affected by Spectre-v2, the hardening sequence is
 *   placed in one of the vector slots, which is executed before jumping
 *   to the real vectors.
 *
 * - If the CPU also has the ARM64_SPECTRE_V3A cap, the slot
 *   containing the hardening sequence is mapped next to the idmap page,
 *   and executed before jumping to the real vectors.
 *
 * - If the CPU only has the ARM64_SPECTRE_V3A cap, then an
 *   empty slot is selected, mapped next to the idmap page, and
 *   executed before jumping to the real vectors.
 *
 * Note that ARM64_SPECTRE_V3A is somewhat incompatible with
 * VHE, as we don't have hypervisor-specific mappings. If the system
 * is VHE and yet selects this capability, it will be ignored.
 */
static void cpu_set_hyp_vector(void)
{
	struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);
	void *vector = hyp_spectre_vector_selector[data->slot];

	if (!is_protected_kvm_enabled())
		*this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector;
	else
		kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot);
}

static void cpu_hyp_init_context(void)
{
	kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt);

	if (!is_kernel_in_hyp_mode())
		cpu_init_hyp_mode();
}

static void cpu_hyp_init_features(void)
{
	cpu_set_hyp_vector();
	kvm_arm_init_debug();

	if (is_kernel_in_hyp_mode())
		kvm_timer_init_vhe();

	if (vgic_present)
		kvm_vgic_init_cpu_hardware();
}

static void cpu_hyp_reinit(void)
{
	cpu_hyp_reset();
	cpu_hyp_init_context();
	cpu_hyp_init_features();
}

static void _kvm_arch_hardware_enable(void *discard)
{
	if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
		cpu_hyp_reinit();
		__this_cpu_write(kvm_arm_hardware_enabled, 1);
	}
}

int kvm_arch_hardware_enable(void)
{
	_kvm_arch_hardware_enable(NULL);
	return 0;
}

static void _kvm_arch_hardware_disable(void *discard)
{
	if (__this_cpu_read(kvm_arm_hardware_enabled)) {
		cpu_hyp_reset();
		__this_cpu_write(kvm_arm_hardware_enabled, 0);
	}
}

void kvm_arch_hardware_disable(void)
{
	if (!is_protected_kvm_enabled())
		_kvm_arch_hardware_disable(NULL);
}

#ifdef CONFIG_CPU_PM
static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
				    unsigned long cmd,
				    void *v)
{
	/*
	 * kvm_arm_hardware_enabled is left with its old value over
	 * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
	 * re-enable hyp.
	 */
	switch (cmd) {
	case CPU_PM_ENTER:
		if (__this_cpu_read(kvm_arm_hardware_enabled))
			/*
			 * don't update kvm_arm_hardware_enabled here
			 * so that the hardware will be re-enabled
			 * when we resume. See below.
			 */
			cpu_hyp_reset();

		return NOTIFY_OK;
	case CPU_PM_ENTER_FAILED:
	case CPU_PM_EXIT:
		if (__this_cpu_read(kvm_arm_hardware_enabled))
			/* The hardware was enabled before suspend. */
			cpu_hyp_reinit();

		return NOTIFY_OK;

	default:
		return NOTIFY_DONE;
	}
}

static struct notifier_block hyp_init_cpu_pm_nb = {
	.notifier_call = hyp_init_cpu_pm_notifier,
};

static void hyp_cpu_pm_init(void)
{
	if (!is_protected_kvm_enabled())
		cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
}
static void hyp_cpu_pm_exit(void)
{
	if (!is_protected_kvm_enabled())
		cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
}
#else
static inline void hyp_cpu_pm_init(void)
{
}
static inline void hyp_cpu_pm_exit(void)
{
}
#endif

static void init_cpu_logical_map(void)
{
	unsigned int cpu;

	/*
	 * Copy the MPIDR <-> logical CPU ID mapping to hyp.
	 * Only copy the set of online CPUs whose features have been checked
	 * against the finalized system capabilities. The hypervisor will not
	 * allow any other CPUs from the `possible` set to boot.
	 */
	for_each_online_cpu(cpu)
		hyp_cpu_logical_map[cpu] = cpu_logical_map(cpu);
}

#define init_psci_0_1_impl_state(config, what)	 \
	config.psci_0_1_ ## what ## _implemented = psci_ops.what

static bool init_psci_relay(void)
{
	/*
	 * If PSCI has not been initialized, protected KVM cannot install
	 * itself on newly booted CPUs.
	 */
	if (!psci_ops.get_version) {
		kvm_err("Cannot initialize protected mode without PSCI\n");
		return false;
	}

	kvm_host_psci_config.version = psci_ops.get_version();

	if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) {
		kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids();
		init_psci_0_1_impl_state(kvm_host_psci_config, cpu_suspend);
		init_psci_0_1_impl_state(kvm_host_psci_config, cpu_on);
		init_psci_0_1_impl_state(kvm_host_psci_config, cpu_off);
		init_psci_0_1_impl_state(kvm_host_psci_config, migrate);
	}
	return true;
}

static int init_subsystems(void)
{
	int err = 0;

	/*
	 * Enable hardware so that subsystem initialisation can access EL2.
	 */
	on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);

	/*
	 * Register CPU low-power notifier
	 */
	hyp_cpu_pm_init();

	/*
	 * Init HYP view of VGIC
	 */
	err = kvm_vgic_hyp_init();
	switch (err) {
	case 0:
		vgic_present = true;
		break;
	case -ENODEV:
	case -ENXIO:
		vgic_present = false;
		err = 0;
		break;
	default:
		goto out;
	}

	/*
	 * Init HYP architected timer support
	 */
	err = kvm_timer_hyp_init(vgic_present);
	if (err)
		goto out;

	kvm_perf_init();
	kvm_sys_reg_table_init();

out:
	if (err || !is_protected_kvm_enabled())
		on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);

	return err;
}

static void teardown_hyp_mode(void)
{
	int cpu;

	free_hyp_pgds();
	for_each_possible_cpu(cpu) {
		free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
		free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order());
	}
}

static int do_pkvm_init(u32 hyp_va_bits)
{
	void *per_cpu_base = kvm_ksym_ref(kvm_arm_hyp_percpu_base);
	int ret;

	preempt_disable();
	cpu_hyp_init_context();
	ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size,
				num_possible_cpus(), kern_hyp_va(per_cpu_base),
				hyp_va_bits);
	cpu_hyp_init_features();

	/*
	 * The stub hypercalls are now disabled, so set our local flag to
	 * prevent a later re-init attempt in kvm_arch_hardware_enable().
	 */
	__this_cpu_write(kvm_arm_hardware_enabled, 1);
	preempt_enable();

	return ret;
}

static int kvm_hyp_init_protection(u32 hyp_va_bits)
{
	void *addr = phys_to_virt(hyp_mem_base);
	int ret;

	kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
	kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
	kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1);
	kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1);
	kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
	kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
	kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);

	ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP);
	if (ret)
		return ret;

	ret = do_pkvm_init(hyp_va_bits);
	if (ret)
		return ret;

	free_hyp_pgds();

	return 0;
}

/**
 * Inits Hyp-mode on all online CPUs
 */
static int init_hyp_mode(void)
{
	u32 hyp_va_bits;
	int cpu;
	int err = -ENOMEM;

	/*
	 * The protected Hyp-mode cannot be initialized if the memory pool
	 * allocation has failed.
	 */
	if (is_protected_kvm_enabled() && !hyp_mem_base)
		goto out_err;

	/*
	 * Allocate Hyp PGD and setup Hyp identity mapping
	 */
	err = kvm_mmu_init(&hyp_va_bits);
	if (err)
		goto out_err;

	/*
	 * Allocate stack pages for Hypervisor-mode
	 */
	for_each_possible_cpu(cpu) {
		unsigned long stack_page;

		stack_page = __get_free_page(GFP_KERNEL);
		if (!stack_page) {
			err = -ENOMEM;
			goto out_err;
		}

		per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
	}

	/*
	 * Allocate and initialize pages for Hypervisor-mode percpu regions.
	 */
	for_each_possible_cpu(cpu) {
		struct page *page;
		void *page_addr;

		page = alloc_pages(GFP_KERNEL, nvhe_percpu_order());
		if (!page) {
			err = -ENOMEM;
			goto out_err;
		}

		page_addr = page_address(page);
		memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size());
		kvm_arm_hyp_percpu_base[cpu] = (unsigned long)page_addr;
	}

	/*
	 * Map the Hyp-code called directly from the host
	 */
	err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
				  kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
	if (err) {
		kvm_err("Cannot map world-switch code\n");
		goto out_err;
	}

	err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start),
				  kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO);
	if (err) {
		kvm_err("Cannot map .hyp.rodata section\n");
		goto out_err;
	}

	err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
				  kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
	if (err) {
		kvm_err("Cannot map rodata section\n");
		goto out_err;
	}

	/*
	 * .hyp.bss is guaranteed to be placed at the beginning of the .bss
	 * section thanks to an assertion in the linker script. Map it RW and
	 * the rest of .bss RO.
	 */
	err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_start),
				  kvm_ksym_ref(__hyp_bss_end), PAGE_HYP);
	if (err) {
		kvm_err("Cannot map hyp bss section: %d\n", err);
		goto out_err;
	}

	err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_end),
				  kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
	if (err) {
		kvm_err("Cannot map bss section\n");
		goto out_err;
	}

	/*
	 * Map the Hyp stack pages
	 */
	for_each_possible_cpu(cpu) {
		char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
		err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE,
					  PAGE_HYP);

		if (err) {
			kvm_err("Cannot map hyp stack\n");
			goto out_err;
		}
	}

	for_each_possible_cpu(cpu) {
		char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu];
		char *percpu_end = percpu_begin + nvhe_percpu_size();

		/* Map Hyp percpu pages */
		err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
		if (err) {
			kvm_err("Cannot map hyp percpu region\n");
			goto out_err;
		}

		/* Prepare the CPU initialization parameters */
		cpu_prepare_hyp_mode(cpu);
	}

	if (is_protected_kvm_enabled()) {
		init_cpu_logical_map();

		if (!init_psci_relay()) {
			err = -ENODEV;
			goto out_err;
		}
	}

	if (is_protected_kvm_enabled()) {
		err = kvm_hyp_init_protection(hyp_va_bits);
		if (err) {
			kvm_err("Failed to init hyp memory protection\n");
			goto out_err;
		}
	}

	return 0;

out_err:
	teardown_hyp_mode();
	kvm_err("error initializing Hyp mode: %d\n", err);
	return err;
}

static void _kvm_host_prot_finalize(void *arg)
{
	int *err = arg;

	if (WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize)))
		WRITE_ONCE(*err, -EINVAL);
}

static int pkvm_drop_host_privileges(void)
{
	int ret = 0;

	/*
	 * Flip the static key upfront as that may no longer be possible
	 * once the host stage 2 is installed.
	 */
	static_branch_enable(&kvm_protected_mode_initialized);
	on_each_cpu(_kvm_host_prot_finalize, &ret, 1);
	return ret;
}

static int finalize_hyp_mode(void)
{
	if (!is_protected_kvm_enabled())
		return 0;

	/*
	 * Exclude HYP BSS from kmemleak so that it doesn't get peeked
	 * at, which would end badly once the section is inaccessible.
	 * None of the other sections should ever be introspected.
	 */
	kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
	return pkvm_drop_host_privileges();
}

struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
{
	struct kvm_vcpu *vcpu;
	int i;

	mpidr &= MPIDR_HWID_BITMASK;
	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
			return vcpu;
	}
	return NULL;
}

bool kvm_arch_has_irq_bypass(void)
{
	return true;
}

int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
				     struct irq_bypass_producer *prod)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(cons, struct kvm_kernel_irqfd, consumer);

	return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq,
					  &irqfd->irq_entry);
}
void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
				      struct irq_bypass_producer *prod)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(cons, struct kvm_kernel_irqfd, consumer);

	kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq,
				     &irqfd->irq_entry);
}

void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(cons, struct kvm_kernel_irqfd, consumer);

	kvm_arm_halt_guest(irqfd->kvm);
}

void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(cons, struct kvm_kernel_irqfd, consumer);

	kvm_arm_resume_guest(irqfd->kvm);
}

/**
 * Initialize Hyp-mode and memory mappings on all CPUs.
 */
int kvm_arch_init(void *opaque)
{
	int err;
	bool in_hyp_mode;

	if (!is_hyp_mode_available()) {
		kvm_info("HYP mode not available\n");
		return -ENODEV;
	}

	if (kvm_get_mode() == KVM_MODE_NONE) {
		kvm_info("KVM disabled from command line\n");
		return -ENODEV;
	}

	in_hyp_mode = is_kernel_in_hyp_mode();

	if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) ||
	    cpus_have_final_cap(ARM64_WORKAROUND_1508412))
		kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \
			 "Only trusted guests should be used on this system.\n");

	err = kvm_set_ipa_limit();
	if (err)
		return err;

	err = kvm_arm_init_sve();
	if (err)
		return err;

	if (!in_hyp_mode) {
		err = init_hyp_mode();
		if (err)
			goto out_err;
	}

	err = kvm_init_vector_slots();
	if (err) {
		kvm_err("Cannot initialise vector slots\n");
		goto out_err;
	}

	err = init_subsystems();
	if (err)
		goto out_hyp;

	if (!in_hyp_mode) {
		err = finalize_hyp_mode();
		if (err) {
			kvm_err("Failed to finalize Hyp protection\n");
			goto out_hyp;
		}
	}

	if (is_protected_kvm_enabled()) {
		kvm_info("Protected nVHE mode initialized successfully\n");
	} else if (in_hyp_mode) {
		kvm_info("VHE mode initialized successfully\n");
	} else {
		kvm_info("Hyp mode initialized successfully\n");
	}

	return 0;

out_hyp:
	hyp_cpu_pm_exit();
	if (!in_hyp_mode)
		teardown_hyp_mode();
out_err:
	return err;
}

/* NOP: Compiling as a module not supported */
void kvm_arch_exit(void)
{
	kvm_perf_teardown();
}

static int __init early_kvm_mode_cfg(char *arg)
{
	if (!arg)
		return -EINVAL;

	if (strcmp(arg, "protected") == 0) {
		kvm_mode = KVM_MODE_PROTECTED;
		return 0;
	}

	if (strcmp(arg, "nvhe") == 0 && !WARN_ON(is_kernel_in_hyp_mode())) {
		kvm_mode = KVM_MODE_DEFAULT;
		return 0;
	}

	if (strcmp(arg, "none") == 0) {
		kvm_mode = KVM_MODE_NONE;
		return 0;
	}

	return -EINVAL;
}
early_param("kvm-arm.mode", early_kvm_mode_cfg);

enum kvm_mode kvm_get_mode(void)
{
	return kvm_mode;
}

static int arm_init(void)
{
	int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
	return rc;
}

module_init(arm_init);