1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
4 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
5 */
6
7 #include <linux/bug.h>
8 #include <linux/cpu_pm.h>
9 #include <linux/errno.h>
10 #include <linux/err.h>
11 #include <linux/kvm_host.h>
12 #include <linux/list.h>
13 #include <linux/module.h>
14 #include <linux/vmalloc.h>
15 #include <linux/fs.h>
16 #include <linux/mman.h>
17 #include <linux/sched.h>
18 #include <linux/kvm.h>
19 #include <linux/kvm_irqfd.h>
20 #include <linux/irqbypass.h>
21 #include <linux/sched/stat.h>
22 #include <linux/psci.h>
23 #include <trace/events/kvm.h>
24
25 #define CREATE_TRACE_POINTS
26 #include "trace_arm.h"
27 #include "hyp_trace.h"
28
29 #include <linux/uaccess.h>
30 #include <asm/ptrace.h>
31 #include <asm/mman.h>
32 #include <asm/tlbflush.h>
33 #include <asm/cacheflush.h>
34 #include <asm/cpufeature.h>
35 #include <asm/virt.h>
36 #include <asm/kvm_arm.h>
37 #include <asm/kvm_asm.h>
38 #include <asm/kvm_emulate.h>
39 #include <asm/kvm_hyp.h>
40 #include <asm/kvm_mmu.h>
41 #include <asm/kvm_nested.h>
42 #include <asm/kvm_pkvm.h>
43 #include <asm/kvm_ptrauth.h>
44 #include <asm/sections.h>
45 #include <asm/stacktrace/nvhe.h>
46
47 #include <kvm/arm_hypercalls.h>
48 #include <kvm/arm_pmu.h>
49 #include <kvm/arm_psci.h>
50 #include <kvm/arm_vgic.h>
51
52 #include <linux/irqchip/arm-gic-v5.h>
53
54 #include "sys_regs.h"
55
56 static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT;
57
58 enum kvm_wfx_trap_policy {
59 KVM_WFX_NOTRAP_SINGLE_TASK, /* Default option */
60 KVM_WFX_NOTRAP,
61 KVM_WFX_TRAP,
62 };
63
64 static enum kvm_wfx_trap_policy kvm_wfi_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK;
65 static enum kvm_wfx_trap_policy kvm_wfe_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK;
66
67 /*
68 * Tracks KVM IOCTLs and their associated KVM capabilities.
69 */
70 struct kvm_ioctl_cap_map {
71 unsigned int ioctl;
72 long ext;
73 };
74
75 /* Make KVM_CAP_NR_VCPUS the reference for features we always supported */
76 #define KVM_CAP_ARM_BASIC KVM_CAP_NR_VCPUS
77
78 /*
79 * Sorted by ioctl to allow for potential binary search,
80 * though linear scan is sufficient for this size.
81 */
82 static const struct kvm_ioctl_cap_map vm_ioctl_caps[] = {
83 { KVM_CREATE_IRQCHIP, KVM_CAP_IRQCHIP },
84 { KVM_ARM_SET_DEVICE_ADDR, KVM_CAP_ARM_SET_DEVICE_ADDR },
85 { KVM_ARM_MTE_COPY_TAGS, KVM_CAP_ARM_MTE },
86 { KVM_SET_DEVICE_ATTR, KVM_CAP_DEVICE_CTRL },
87 { KVM_GET_DEVICE_ATTR, KVM_CAP_DEVICE_CTRL },
88 { KVM_HAS_DEVICE_ATTR, KVM_CAP_DEVICE_CTRL },
89 { KVM_ARM_SET_COUNTER_OFFSET, KVM_CAP_COUNTER_OFFSET },
90 { KVM_ARM_GET_REG_WRITABLE_MASKS, KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES },
91 { KVM_ARM_PREFERRED_TARGET, KVM_CAP_ARM_BASIC },
92 };
93
94 /*
95 * Set *ext to the capability.
96 * Return 0 if found, or -EINVAL if no IOCTL matches.
97 */
98 long kvm_get_cap_for_kvm_ioctl(unsigned int ioctl, long *ext)
99 {
100 int i;
101
102 for (i = 0; i < ARRAY_SIZE(vm_ioctl_caps); i++) {
103 if (vm_ioctl_caps[i].ioctl == ioctl) {
104 *ext = vm_ioctl_caps[i].ext;
105 return 0;
106 }
107 }
108
109 return -EINVAL;
110 }
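/*
 * Illustrative sketch, not part of the upstream file: a caller could use
 * the map above to filter VM ioctls by their backing capability, e.g.
 * under protected KVM. kvm_pkvm_ext_allowed() is the helper used
 * elsewhere in this file; the wrapper name below is hypothetical.
 *
 *	static bool vm_ioctl_is_allowed(struct kvm *kvm, unsigned int ioctl)
 *	{
 *		long ext;
 *
 *		if (kvm_get_cap_for_kvm_ioctl(ioctl, &ext))
 *			return true;	// unknown ioctl: let the caller decide
 *		return kvm_pkvm_ext_allowed(kvm, ext);
 *	}
 */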
111
112 DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
113
114 DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_base);
115 DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
116
117 DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
118
119 static bool vgic_present, kvm_arm_initialised;
120
121 static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized);
122
123 bool is_kvm_arm_initialised(void)
124 {
125 return kvm_arm_initialised;
126 }
127
128 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
129 {
130 return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
131 }
132
133 int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
134 struct kvm_enable_cap *cap)
135 {
136 int r = -EINVAL;
137
138 if (cap->flags)
139 return -EINVAL;
140
141 if (is_protected_kvm_enabled() && !kvm_pkvm_ext_allowed(kvm, cap->cap))
142 return -EINVAL;
143
144 switch (cap->cap) {
145 case KVM_CAP_ARM_NISV_TO_USER:
146 r = 0;
147 set_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER,
148 &kvm->arch.flags);
149 break;
150 case KVM_CAP_ARM_MTE:
151 mutex_lock(&kvm->lock);
152 if (system_supports_mte() && !kvm->created_vcpus) {
153 r = 0;
154 set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags);
155 }
156 mutex_unlock(&kvm->lock);
157 break;
158 case KVM_CAP_ARM_SYSTEM_SUSPEND:
159 r = 0;
160 set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
161 break;
162 case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
163 mutex_lock(&kvm->slots_lock);
164 /*
165 * To keep things simple, allow changing the chunk
166 * size only when no memory slots have been created.
167 */
168 if (kvm_are_all_memslots_empty(kvm)) {
169 u64 new_cap = cap->args[0];
170
171 if (!new_cap || kvm_is_block_size_supported(new_cap)) {
172 r = 0;
173 kvm->arch.mmu.split_page_chunk_size = new_cap;
174 }
175 }
176 mutex_unlock(&kvm->slots_lock);
177 break;
178 case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
179 mutex_lock(&kvm->lock);
180 if (!kvm->created_vcpus) {
181 r = 0;
182 set_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags);
183 }
184 mutex_unlock(&kvm->lock);
185 break;
186 case KVM_CAP_ARM_SEA_TO_USER:
187 r = 0;
188 set_bit(KVM_ARCH_FLAG_EXIT_SEA, &kvm->arch.flags);
189 break;
190 default:
191 break;
192 }
193
194 return r;
195 }
196
197 static int kvm_arm_default_max_vcpus(void)
198 {
199 return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
200 }
201
202 /**
203 * kvm_arch_init_vm - initializes a VM data structure
204 * @kvm: pointer to the KVM struct
205 * @type: kvm device type
206 */
207 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
208 {
209 int ret;
210
211 if (type & ~KVM_VM_TYPE_ARM_MASK)
212 return -EINVAL;
213
214 mutex_init(&kvm->arch.config_lock);
215
216 #ifdef CONFIG_LOCKDEP
217 /* Clue in lockdep that the config_lock must be taken inside kvm->lock */
218 mutex_lock(&kvm->lock);
219 mutex_lock(&kvm->arch.config_lock);
220 mutex_unlock(&kvm->arch.config_lock);
221 mutex_unlock(&kvm->lock);
222 #endif
223
224 kvm_init_nested(kvm);
225
226 ret = kvm_share_hyp(kvm, kvm + 1);
227 if (ret)
228 return ret;
229
230 if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL_ACCOUNT)) {
231 ret = -ENOMEM;
232 goto err_unshare_kvm;
233 }
234 cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask);
235
236 ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu, type);
237 if (ret)
238 goto err_free_cpumask;
239
240 if (is_protected_kvm_enabled()) {
241 /*
242 * If any failures occur after this is successful, make sure to
243 * call __pkvm_unreserve_vm to unreserve the VM in hyp.
244 */
245 ret = pkvm_init_host_vm(kvm, type);
246 if (ret)
247 goto err_uninit_mmu;
248 } else if (type & KVM_VM_TYPE_ARM_PROTECTED) {
249 ret = -EINVAL;
250 goto err_uninit_mmu;
251 }
252
253 kvm_vgic_early_init(kvm);
254
255 kvm_timer_init_vm(kvm);
256
257 /* The maximum number of VCPUs is limited by the host's GIC model */
258 kvm->max_vcpus = kvm_arm_default_max_vcpus();
259
260 kvm_arm_init_hypercalls(kvm);
261
262 bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES);
263
264 return 0;
265
266 err_uninit_mmu:
267 kvm_uninit_stage2_mmu(kvm);
268 err_free_cpumask:
269 free_cpumask_var(kvm->arch.supported_cpus);
270 err_unshare_kvm:
271 kvm_unshare_hyp(kvm, kvm + 1);
272 return ret;
273 }
274
275 vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
276 {
277 return VM_FAULT_SIGBUS;
278 }
279
280 void kvm_arch_create_vm_debugfs(struct kvm *kvm)
281 {
282 kvm_sys_regs_create_debugfs(kvm);
283 kvm_s2_ptdump_create_debugfs(kvm);
284 }
285
286 static void kvm_destroy_mpidr_data(struct kvm *kvm)
287 {
288 struct kvm_mpidr_data *data;
289
290 mutex_lock(&kvm->arch.config_lock);
291
292 data = rcu_dereference_protected(kvm->arch.mpidr_data,
293 lockdep_is_held(&kvm->arch.config_lock));
294 if (data) {
295 rcu_assign_pointer(kvm->arch.mpidr_data, NULL);
296 synchronize_rcu();
297 kfree(data);
298 }
299
300 mutex_unlock(&kvm->arch.config_lock);
301 }
302
303 /**
304 * kvm_arch_destroy_vm - destroy the VM data structure
305 * @kvm: pointer to the KVM struct
306 */
307 void kvm_arch_destroy_vm(struct kvm *kvm)
308 {
309 bitmap_free(kvm->arch.pmu_filter);
310 free_cpumask_var(kvm->arch.supported_cpus);
311
312 kvm_vgic_destroy(kvm);
313
314 if (is_protected_kvm_enabled())
315 pkvm_destroy_hyp_vm(kvm);
316
317 kvm_uninit_stage2_mmu(kvm);
318 kvm_destroy_mpidr_data(kvm);
319
320 kfree(kvm->arch.sysreg_masks);
321 kvm_destroy_vcpus(kvm);
322
323 kvm_unshare_hyp(kvm, kvm + 1);
324
325 kvm_arm_teardown_hypercalls(kvm);
326 }
327
328 static bool kvm_has_full_ptr_auth(void)
329 {
330 bool apa, gpa, api, gpi, apa3, gpa3;
331 u64 isar1, isar2, val;
332
333 /*
334 * Check that:
335 *
336 * - both Address and Generic auth are implemented for a given
337 * algorithm (Q5, IMPDEF or Q3)
338 * - only a single algorithm is implemented.
339 */
340 if (!system_has_full_ptr_auth())
341 return false;
342
343 isar1 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1);
344 isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);
345
346 apa = !!FIELD_GET(ID_AA64ISAR1_EL1_APA_MASK, isar1);
347 val = FIELD_GET(ID_AA64ISAR1_EL1_GPA_MASK, isar1);
348 gpa = (val == ID_AA64ISAR1_EL1_GPA_IMP);
349
350 api = !!FIELD_GET(ID_AA64ISAR1_EL1_API_MASK, isar1);
351 val = FIELD_GET(ID_AA64ISAR1_EL1_GPI_MASK, isar1);
352 gpi = (val == ID_AA64ISAR1_EL1_GPI_IMP);
353
354 apa3 = !!FIELD_GET(ID_AA64ISAR2_EL1_APA3_MASK, isar2);
355 val = FIELD_GET(ID_AA64ISAR2_EL1_GPA3_MASK, isar2);
356 gpa3 = (val == ID_AA64ISAR2_EL1_GPA3_IMP);
357
358 return (apa == gpa && api == gpi && apa3 == gpa3 &&
359 (apa + api + apa3) == 1);
360 }
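/*
 * Worked example, illustrative only: a CPU implementing only the QARMA5
 * algorithm reports apa = gpa = 1 and api = gpi = apa3 = gpa3 = 0, which
 * passes both checks above. A CPU advertising address auth with QARMA5
 * but generic auth with QARMA3 (apa = 1, gpa = 0, gpa3 = 1) fails the
 * apa == gpa test and is rejected.
 */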
361
362 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
363 {
364 int r;
365
366 if (is_protected_kvm_enabled() && !kvm_pkvm_ext_allowed(kvm, ext))
367 return 0;
368
369 switch (ext) {
370 case KVM_CAP_IRQCHIP:
371 r = vgic_present;
372 break;
373 case KVM_CAP_IOEVENTFD:
374 case KVM_CAP_USER_MEMORY:
375 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
376 case KVM_CAP_ONE_REG:
377 case KVM_CAP_ARM_PSCI:
378 case KVM_CAP_ARM_PSCI_0_2:
379 case KVM_CAP_READONLY_MEM:
380 case KVM_CAP_MP_STATE:
381 case KVM_CAP_IMMEDIATE_EXIT:
382 case KVM_CAP_VCPU_EVENTS:
383 case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2:
384 case KVM_CAP_ARM_NISV_TO_USER:
385 case KVM_CAP_ARM_INJECT_EXT_DABT:
386 case KVM_CAP_SET_GUEST_DEBUG:
387 case KVM_CAP_VCPU_ATTRIBUTES:
388 case KVM_CAP_PTP_KVM:
389 case KVM_CAP_ARM_SYSTEM_SUSPEND:
390 case KVM_CAP_IRQFD_RESAMPLE:
391 case KVM_CAP_COUNTER_OFFSET:
392 case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
393 case KVM_CAP_ARM_SEA_TO_USER:
394 r = 1;
395 break;
396 case KVM_CAP_SET_GUEST_DEBUG2:
397 return KVM_GUESTDBG_VALID_MASK;
398 case KVM_CAP_ARM_SET_DEVICE_ADDR:
399 r = 1;
400 break;
401 case KVM_CAP_NR_VCPUS:
402 /*
403 * ARM64 treats KVM_CAP_NR_VCPUS differently from all other
404 * architectures, as it does not always bound it to
405 * KVM_CAP_MAX_VCPUS. It should not matter much because
406 * this is just an advisory value.
407 */
408 r = min_t(unsigned int, num_online_cpus(),
409 kvm_arm_default_max_vcpus());
410 break;
411 case KVM_CAP_MAX_VCPUS:
412 case KVM_CAP_MAX_VCPU_ID:
413 if (kvm)
414 r = kvm->max_vcpus;
415 else
416 r = kvm_arm_default_max_vcpus();
417 break;
418 case KVM_CAP_MSI_DEVID:
419 if (!kvm)
420 r = -EINVAL;
421 else
422 r = kvm->arch.vgic.msis_require_devid;
423 break;
424 case KVM_CAP_ARM_USER_IRQ:
425 /*
426 * 1: EL1_VTIMER, EL1_PTIMER, and PMU.
427 * (bump this number if adding more devices)
428 */
429 r = 1;
430 break;
431 case KVM_CAP_ARM_MTE:
432 r = system_supports_mte();
433 break;
434 case KVM_CAP_STEAL_TIME:
435 r = kvm_arm_pvtime_supported();
436 break;
437 case KVM_CAP_ARM_EL1_32BIT:
438 r = cpus_have_final_cap(ARM64_HAS_32BIT_EL1);
439 break;
440 case KVM_CAP_ARM_EL2:
441 r = cpus_have_final_cap(ARM64_HAS_NESTED_VIRT);
442 break;
443 case KVM_CAP_ARM_EL2_E2H0:
444 r = cpus_have_final_cap(ARM64_HAS_HCR_NV1);
445 break;
446 case KVM_CAP_GUEST_DEBUG_HW_BPS:
447 r = get_num_brps();
448 break;
449 case KVM_CAP_GUEST_DEBUG_HW_WPS:
450 r = get_num_wrps();
451 break;
452 case KVM_CAP_ARM_PMU_V3:
453 r = kvm_supports_guest_pmuv3();
454 break;
455 case KVM_CAP_ARM_INJECT_SERROR_ESR:
456 r = cpus_have_final_cap(ARM64_HAS_RAS_EXTN);
457 break;
458 case KVM_CAP_ARM_VM_IPA_SIZE:
459 r = get_kvm_ipa_limit();
460 break;
461 case KVM_CAP_ARM_SVE:
462 r = system_supports_sve();
463 break;
464 case KVM_CAP_ARM_PTRAUTH_ADDRESS:
465 case KVM_CAP_ARM_PTRAUTH_GENERIC:
466 r = kvm_has_full_ptr_auth();
467 break;
468 case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
469 if (kvm)
470 r = kvm->arch.mmu.split_page_chunk_size;
471 else
472 r = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
473 break;
474 case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES:
475 r = kvm_supported_block_sizes();
476 break;
477 case KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES:
478 r = BIT(0);
479 break;
480 case KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED:
481 if (!kvm)
482 r = -EINVAL;
483 else
484 r = kvm_supports_cacheable_pfnmap();
485 break;
486
487 default:
488 r = 0;
489 }
490
491 return r;
492 }
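/*
 * Illustrative userspace-side sketch, not part of this file: the values
 * computed above are what a VMM observes through KVM_CHECK_EXTENSION,
 * issued either on the /dev/kvm fd or on a VM fd (VM-scoped answers such
 * as KVM_CAP_MAX_VCPUS may differ between the two).
 *
 *	int ipa_bits = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_VM_IPA_SIZE);
 *
 *	if (ipa_bits < 0)
 *		err(1, "KVM_CHECK_EXTENSION");
 *	// A return of 0 means "not supported"; a positive value is the IPA limit.
 */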
493
494 long kvm_arch_dev_ioctl(struct file *filp,
495 unsigned int ioctl, unsigned long arg)
496 {
497 return -EINVAL;
498 }
499
500 struct kvm *kvm_arch_alloc_vm(void)
501 {
502 size_t sz = sizeof(struct kvm);
503
504 if (!has_vhe())
505 return kzalloc(sz, GFP_KERNEL_ACCOUNT);
506
507 return kvzalloc(sz, GFP_KERNEL_ACCOUNT);
508 }
509
510 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
511 {
512 if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
513 return -EBUSY;
514
515 if (id >= kvm->max_vcpus)
516 return -EINVAL;
517
518 return 0;
519 }
520
521 int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
522 {
523 int err;
524
525 spin_lock_init(&vcpu->arch.mp_state_lock);
526
527 #ifdef CONFIG_LOCKDEP
528 /* Inform lockdep that the config_lock is acquired after vcpu->mutex */
529 mutex_lock(&vcpu->mutex);
530 mutex_lock(&vcpu->kvm->arch.config_lock);
531 mutex_unlock(&vcpu->kvm->arch.config_lock);
532 mutex_unlock(&vcpu->mutex);
533 #endif
534
535 /* Force users to call KVM_ARM_VCPU_INIT */
536 vcpu_clear_flag(vcpu, VCPU_INITIALIZED);
537
538 vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;
539
540 /* Set up the timer */
541 kvm_timer_vcpu_init(vcpu);
542
543 kvm_pmu_vcpu_init(vcpu);
544
545 kvm_arm_pvtime_vcpu_init(&vcpu->arch);
546
547 vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
548
549 /*
550 * This vCPU may have been created after mpidr_data was initialized.
551 * Throw out the pre-computed mappings if that is the case which forces
552 * KVM to fall back to iteratively searching the vCPUs.
553 */
554 kvm_destroy_mpidr_data(vcpu->kvm);
555
556 err = kvm_vgic_vcpu_init(vcpu);
557 if (err)
558 return err;
559
560 err = kvm_share_hyp(vcpu, vcpu + 1);
561 if (err)
562 kvm_vgic_vcpu_destroy(vcpu);
563
564 return err;
565 }
566
567 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
568 {
569 }
570
571 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
572 {
573 if (!is_protected_kvm_enabled())
574 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
575 else
576 free_hyp_memcache(&vcpu->arch.pkvm_memcache);
577 kvm_timer_vcpu_terminate(vcpu);
578 kvm_pmu_vcpu_destroy(vcpu);
579 kvm_vgic_vcpu_destroy(vcpu);
580 kvm_arm_vcpu_destroy(vcpu);
581 }
582
583 void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
584 {
585
586 }
587
588 void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
589 {
590
591 }
592
593 static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu)
594 {
595 if (vcpu_has_ptrauth(vcpu) && !is_protected_kvm_enabled()) {
596 /*
597 * Either we're running an L2 guest, and the API/APK bits come
598 * from L1's HCR_EL2, or API/APK are both set.
599 */
600 if (unlikely(is_nested_ctxt(vcpu))) {
601 u64 val;
602
603 val = __vcpu_sys_reg(vcpu, HCR_EL2);
604 val &= (HCR_API | HCR_APK);
605 vcpu->arch.hcr_el2 &= ~(HCR_API | HCR_APK);
606 vcpu->arch.hcr_el2 |= val;
607 } else {
608 vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK);
609 }
610
611 /*
612 * Save the host keys if there is any chance for the guest
613 * to use pauth, as the entry code will reload the guest
614 * keys in that case.
615 */
616 if (vcpu->arch.hcr_el2 & (HCR_API | HCR_APK)) {
617 struct kvm_cpu_context *ctxt;
618
619 ctxt = this_cpu_ptr_hyp_sym(kvm_hyp_ctxt);
620 ptrauth_save_keys(ctxt);
621 }
622 }
623 }
624
625 static bool kvm_vcpu_should_clear_twi(struct kvm_vcpu *vcpu)
626 {
627 if (unlikely(kvm_wfi_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK))
628 return kvm_wfi_trap_policy == KVM_WFX_NOTRAP;
629
630 if (vgic_is_v5(vcpu->kvm))
631 return single_task_running();
632
633 return single_task_running() &&
634 vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 &&
635 (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) ||
636 vcpu->kvm->arch.vgic.nassgireq);
637 }
638
639 static bool kvm_vcpu_should_clear_twe(struct kvm_vcpu *vcpu)
640 {
641 if (unlikely(kvm_wfe_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK))
642 return kvm_wfe_trap_policy == KVM_WFX_NOTRAP;
643
644 return single_task_running();
645 }
646
647 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
648 {
649 struct kvm_s2_mmu *mmu;
650 int *last_ran;
651
652 if (is_protected_kvm_enabled())
653 goto nommu;
654
655 if (vcpu_has_nv(vcpu))
656 kvm_vcpu_load_hw_mmu(vcpu);
657
658 mmu = vcpu->arch.hw_mmu;
659 last_ran = this_cpu_ptr(mmu->last_vcpu_ran);
660
661 /*
662 * Ensure a VMID is allocated for the MMU before programming VTTBR_EL2,
663 * which happens eagerly in VHE.
664 *
665 * Also, the VMID allocator only preserves VMIDs that are active at the
666 * time of rollover, so KVM might need to grab a new VMID for the MMU if
667 * this is called from kvm_sched_in().
668 */
669 kvm_arm_vmid_update(&mmu->vmid);
670
671 /*
672 * We guarantee that both TLBs and I-cache are private to each
673 * vcpu. If detecting that a vcpu from the same VM has
674 * previously run on the same physical CPU, call into the
675 * hypervisor code to nuke the relevant contexts.
676 *
677 * We might get preempted before the vCPU actually runs, but
678 * over-invalidation doesn't affect correctness.
679 */
680 if (*last_ran != vcpu->vcpu_idx) {
681 kvm_call_hyp(__kvm_flush_cpu_context, mmu);
682 *last_ran = vcpu->vcpu_idx;
683 }
684
685 nommu:
686 vcpu->cpu = cpu;
687
688 /*
689 * The timer must be loaded before the vgic to correctly set up physical
690 * interrupt deactivation in nested state (e.g. timer interrupt).
691 */
692 kvm_timer_vcpu_load(vcpu);
693 kvm_vgic_load(vcpu);
694 kvm_vcpu_load_debug(vcpu);
695 kvm_vcpu_load_fgt(vcpu);
696 if (has_vhe())
697 kvm_vcpu_load_vhe(vcpu);
698 kvm_arch_vcpu_load_fp(vcpu);
699 kvm_vcpu_pmu_restore_guest(vcpu);
700 if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
701 kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
702
703 if (kvm_vcpu_should_clear_twe(vcpu))
704 vcpu->arch.hcr_el2 &= ~HCR_TWE;
705 else
706 vcpu->arch.hcr_el2 |= HCR_TWE;
707
708 if (kvm_vcpu_should_clear_twi(vcpu))
709 vcpu->arch.hcr_el2 &= ~HCR_TWI;
710 else
711 vcpu->arch.hcr_el2 |= HCR_TWI;
712
713 vcpu_set_pauth_traps(vcpu);
714
715 if (is_protected_kvm_enabled()) {
716 kvm_call_hyp_nvhe(__pkvm_vcpu_load,
717 vcpu->kvm->arch.pkvm.handle,
718 vcpu->vcpu_idx, vcpu->arch.hcr_el2);
719 kvm_call_hyp(__vgic_v3_restore_vmcr_aprs,
720 &vcpu->arch.vgic_cpu.vgic_v3);
721 }
722
723 if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus))
724 vcpu_set_on_unsupported_cpu(vcpu);
725
726 vcpu->arch.pid = pid_nr(vcpu->pid);
727 }
728
729 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
730 {
731 if (is_protected_kvm_enabled()) {
732 kvm_call_hyp(__vgic_v3_save_aprs, &vcpu->arch.vgic_cpu.vgic_v3);
733 kvm_call_hyp_nvhe(__pkvm_vcpu_put);
734 }
735
736 kvm_vcpu_put_debug(vcpu);
737 kvm_arch_vcpu_put_fp(vcpu);
738 if (has_vhe())
739 kvm_vcpu_put_vhe(vcpu);
740 kvm_timer_vcpu_put(vcpu);
741 kvm_vgic_put(vcpu);
742 kvm_vcpu_pmu_restore_host(vcpu);
743 if (vcpu_has_nv(vcpu))
744 kvm_vcpu_put_hw_mmu(vcpu);
745 kvm_arm_vmid_clear_active();
746
747 vcpu_clear_on_unsupported_cpu(vcpu);
748 vcpu->cpu = -1;
749 }
750
751 static void __kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
752 {
753 WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED);
754 kvm_make_request(KVM_REQ_SLEEP, vcpu);
755 kvm_vcpu_kick(vcpu);
756 }
757
758 void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
759 {
760 spin_lock(&vcpu->arch.mp_state_lock);
761 __kvm_arm_vcpu_power_off(vcpu);
762 spin_unlock(&vcpu->arch.mp_state_lock);
763 }
764
765 bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu)
766 {
767 return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_STOPPED;
768 }
769
770 static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu)
771 {
772 WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_SUSPENDED);
773 kvm_make_request(KVM_REQ_SUSPEND, vcpu);
774 kvm_vcpu_kick(vcpu);
775 }
776
777 static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu)
778 {
779 return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_SUSPENDED;
780 }
781
782 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
783 struct kvm_mp_state *mp_state)
784 {
785 *mp_state = READ_ONCE(vcpu->arch.mp_state);
786
787 return 0;
788 }
789
790 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
791 struct kvm_mp_state *mp_state)
792 {
793 int ret = 0;
794
795 spin_lock(&vcpu->arch.mp_state_lock);
796
797 switch (mp_state->mp_state) {
798 case KVM_MP_STATE_RUNNABLE:
799 WRITE_ONCE(vcpu->arch.mp_state, *mp_state);
800 break;
801 case KVM_MP_STATE_STOPPED:
802 __kvm_arm_vcpu_power_off(vcpu);
803 break;
804 case KVM_MP_STATE_SUSPENDED:
805 kvm_arm_vcpu_suspend(vcpu);
806 break;
807 default:
808 ret = -EINVAL;
809 }
810
811 spin_unlock(&vcpu->arch.mp_state_lock);
812
813 return ret;
814 }
815
816 /**
817 * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled
818 * @v: The VCPU pointer
819 *
820 * If the guest CPU is not waiting for interrupts or an interrupt line is
821 * asserted, the CPU is by definition runnable.
822 */
823 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
824 {
825 bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF | HCR_VSE);
826
827 irq_lines |= (!irqchip_in_kernel(v->kvm) &&
828 (kvm_timer_should_notify_user(v) ||
829 kvm_pmu_should_notify_user(v)));
830
831 return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
832 && !kvm_arm_vcpu_stopped(v) && !v->arch.pause);
833 }
834
835 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
836 {
837 return vcpu_mode_priv(vcpu);
838 }
839
840 #ifdef CONFIG_GUEST_PERF_EVENTS
841 unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
842 {
843 return *vcpu_pc(vcpu);
844 }
845 #endif
846
847 static void kvm_init_mpidr_data(struct kvm *kvm)
848 {
849 struct kvm_mpidr_data *data = NULL;
850 unsigned long c, mask, nr_entries;
851 u64 aff_set = 0, aff_clr = ~0UL;
852 struct kvm_vcpu *vcpu;
853
854 mutex_lock(&kvm->arch.config_lock);
855
856 if (rcu_access_pointer(kvm->arch.mpidr_data) ||
857 atomic_read(&kvm->online_vcpus) == 1)
858 goto out;
859
860 kvm_for_each_vcpu(c, vcpu, kvm) {
861 u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
862 aff_set |= aff;
863 aff_clr &= aff;
864 }
865
866 /*
867 * A significant bit can be either 0 or 1, and will only appear in
868 * aff_set. Use aff_clr to weed out the useless stuff.
869 */
870 mask = aff_set ^ aff_clr;
871 nr_entries = BIT_ULL(hweight_long(mask));
872
873 /*
874 * Don't let userspace fool us. If we need more than a single page
875 * to describe the compressed MPIDR array, just fall back to the
876 * iterative method. Single vcpu VMs do not need this either.
877 */
878 if (struct_size(data, cmpidr_to_idx, nr_entries) <= PAGE_SIZE)
879 data = kzalloc_flex(*data, cmpidr_to_idx, nr_entries,
880 GFP_KERNEL_ACCOUNT);
881
882 if (!data)
883 goto out;
884
885 data->mpidr_mask = mask;
886
887 kvm_for_each_vcpu(c, vcpu, kvm) {
888 u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
889 u16 index = kvm_mpidr_index(data, aff);
890
891 data->cmpidr_to_idx[index] = c;
892 }
893
894 rcu_assign_pointer(kvm->arch.mpidr_data, data);
895 out:
896 mutex_unlock(&kvm->arch.config_lock);
897 }
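/*
 * Worked example, illustrative only: with four vCPUs whose MPIDR
 * affinities are 0, 1, 2 and 3, aff_set ends up as 0x3 and aff_clr as 0,
 * so mask = aff_set ^ aff_clr = 0x3 and nr_entries = BIT_ULL(2) = 4.
 * kvm_mpidr_index() then compresses each affinity down to those two
 * significant bits, yielding indices 0..3 into cmpidr_to_idx[].
 */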
898
899 /*
900 * Handle both the initialisation that is being done when the vcpu is
901 * run for the first time, as well as the updates that must be
902 * performed each time we get a new thread dealing with this vcpu.
903 */
904 int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
905 {
906 struct kvm *kvm = vcpu->kvm;
907 int ret;
908
909 if (!kvm_vcpu_initialized(vcpu))
910 return -ENOEXEC;
911
912 if (!kvm_arm_vcpu_is_finalized(vcpu))
913 return -EPERM;
914
915 if (likely(vcpu_has_run_once(vcpu)))
916 return 0;
917
918 kvm_init_mpidr_data(kvm);
919
920 if (likely(irqchip_in_kernel(kvm))) {
921 /*
922 * Map the VGIC hardware resources before running a vcpu the
923 * first time on this VM.
924 */
925 ret = kvm_vgic_map_resources(kvm);
926 if (ret)
927 return ret;
928 }
929
930 ret = kvm_finalize_sys_regs(vcpu);
931 if (ret)
932 return ret;
933
934 if (vcpu_has_nv(vcpu)) {
935 ret = kvm_vcpu_allocate_vncr_tlb(vcpu);
936 if (ret)
937 return ret;
938
939 ret = kvm_vgic_vcpu_nv_init(vcpu);
940 if (ret)
941 return ret;
942 }
943
944 /*
945 * This needs to happen after any restriction has been applied
946 * to the feature set.
947 */
948 kvm_calculate_traps(vcpu);
949
950 ret = kvm_timer_enable(vcpu);
951 if (ret)
952 return ret;
953
954 if (kvm_vcpu_has_pmu(vcpu)) {
955 ret = kvm_arm_pmu_v3_enable(vcpu);
956 if (ret)
957 return ret;
958 }
959
960 ret = vgic_v5_finalize_ppi_state(kvm);
961 if (ret)
962 return ret;
963
964 if (is_protected_kvm_enabled()) {
965 ret = pkvm_create_hyp_vm(kvm);
966 if (ret)
967 return ret;
968
969 ret = pkvm_create_hyp_vcpu(vcpu);
970 if (ret)
971 return ret;
972 }
973
974 mutex_lock(&kvm->arch.config_lock);
975 set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags);
976 mutex_unlock(&kvm->arch.config_lock);
977
978 return ret;
979 }
980
981 bool kvm_arch_intc_initialized(struct kvm *kvm)
982 {
983 return vgic_initialized(kvm);
984 }
985
986 void kvm_arm_halt_guest(struct kvm *kvm)
987 {
988 unsigned long i;
989 struct kvm_vcpu *vcpu;
990
991 kvm_for_each_vcpu(i, vcpu, kvm)
992 vcpu->arch.pause = true;
993 kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP);
994 }
995
996 void kvm_arm_resume_guest(struct kvm *kvm)
997 {
998 unsigned long i;
999 struct kvm_vcpu *vcpu;
1000
1001 kvm_for_each_vcpu(i, vcpu, kvm) {
1002 vcpu->arch.pause = false;
1003 __kvm_vcpu_wake_up(vcpu);
1004 }
1005 }
1006
1007 static void kvm_vcpu_sleep(struct kvm_vcpu *vcpu)
1008 {
1009 struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
1010
1011 rcuwait_wait_event(wait,
1012 (!kvm_arm_vcpu_stopped(vcpu)) && (!vcpu->arch.pause),
1013 TASK_INTERRUPTIBLE);
1014
1015 if (kvm_arm_vcpu_stopped(vcpu) || vcpu->arch.pause) {
1016 /* Awaken to handle a signal, request we sleep again later. */
1017 kvm_make_request(KVM_REQ_SLEEP, vcpu);
1018 }
1019
1020 /*
1021 * Make sure we will observe a potential reset request if we've
1022 * observed a change to the power state. Pairs with the smp_wmb() in
1023 * kvm_psci_vcpu_on().
1024 */
1025 smp_rmb();
1026 }
1027
1028 /**
1029 * kvm_vcpu_wfi - emulate Wait-For-Interrupt behavior
1030 * @vcpu: The VCPU pointer
1031 *
1032 * Suspend execution of a vCPU until a valid wake event is detected, i.e. until
1033 * the vCPU is runnable. The vCPU may or may not be scheduled out, depending
1034 * on when a wake event arrives, e.g. there may already be a pending wake event.
1035 */
1036 void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
1037 {
1038 /*
1039 * Sync back the state of the GIC CPU interface so that we have
1040 * the latest PMR and group enables. This ensures that
1041 * kvm_arch_vcpu_runnable has up-to-date data to decide whether
1042 * we have pending interrupts, e.g. when determining if the
1043 * vCPU should block.
1044 *
1045 * For the same reason, we want to tell GICv4 that we need
1046 * doorbells to be signalled, should an interrupt become pending.
1047 */
1048 preempt_disable();
1049 vcpu_set_flag(vcpu, IN_WFI);
1050 kvm_vgic_put(vcpu);
1051 preempt_enable();
1052
1053 kvm_vcpu_halt(vcpu);
1054 vcpu_clear_flag(vcpu, IN_WFIT);
1055
1056 preempt_disable();
1057 vcpu_clear_flag(vcpu, IN_WFI);
1058 kvm_vgic_load(vcpu);
1059 preempt_enable();
1060 }
1061
1062 static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu)
1063 {
1064 if (!kvm_arm_vcpu_suspended(vcpu))
1065 return 1;
1066
1067 kvm_vcpu_wfi(vcpu);
1068
1069 /*
1070 * The suspend state is sticky; we do not leave it until userspace
1071 * explicitly marks the vCPU as runnable. Request that we suspend again
1072 * later.
1073 */
1074 kvm_make_request(KVM_REQ_SUSPEND, vcpu);
1075
1076 /*
1077 * Check to make sure the vCPU is actually runnable. If so, exit to
1078 * userspace informing it of the wakeup condition.
1079 */
1080 if (kvm_arch_vcpu_runnable(vcpu)) {
1081 memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
1082 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_WAKEUP;
1083 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
1084 return 0;
1085 }
1086
1087 /*
1088 * Otherwise, we were unblocked to process a different event, such as a
1089 * pending signal. Return 1 and allow kvm_arch_vcpu_ioctl_run() to
1090 * process the event.
1091 */
1092 return 1;
1093 }
1094
1095 /**
1096 * check_vcpu_requests - check and handle pending vCPU requests
1097 * @vcpu: the VCPU pointer
1098 *
1099 * Return: 1 if we should enter the guest
1100 * 0 if we should exit to userspace
1101 * < 0 if we should exit to userspace, where the return value indicates
1102 * an error
1103 */
1104 static int check_vcpu_requests(struct kvm_vcpu *vcpu)
1105 {
1106 if (kvm_request_pending(vcpu)) {
1107 if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
1108 return -EIO;
1109
1110 if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
1111 kvm_vcpu_sleep(vcpu);
1112
1113 if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
1114 kvm_reset_vcpu(vcpu);
1115
1116 /*
1117 * Clear IRQ_PENDING requests that were made to guarantee
1118 * that a VCPU sees new virtual interrupts.
1119 */
1120 kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
1121
1122 /* Process interrupts deactivated through a trap */
1123 if (kvm_check_request(KVM_REQ_VGIC_PROCESS_UPDATE, vcpu))
1124 kvm_vgic_process_async_update(vcpu);
1125
1126 if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
1127 kvm_update_stolen_time(vcpu);
1128
1129 if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
1130 /* The distributor enable bits were changed */
1131 preempt_disable();
1132 vgic_v4_put(vcpu);
1133 vgic_v4_load(vcpu);
1134 preempt_enable();
1135 }
1136
1137 if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
1138 kvm_vcpu_reload_pmu(vcpu);
1139
1140 if (kvm_check_request(KVM_REQ_RESYNC_PMU_EL0, vcpu))
1141 kvm_vcpu_pmu_restore_guest(vcpu);
1142
1143 if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
1144 return kvm_vcpu_suspend(vcpu);
1145
1146 if (kvm_dirty_ring_check_request(vcpu))
1147 return 0;
1148
1149 check_nested_vcpu_requests(vcpu);
1150 }
1151
1152 return 1;
1153 }
1154
1155 static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu)
1156 {
1157 if (likely(!vcpu_mode_is_32bit(vcpu)))
1158 return false;
1159
1160 if (vcpu_has_nv(vcpu))
1161 return true;
1162
1163 return !kvm_supports_32bit_el0();
1164 }
1165
1166 /**
1167 * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest
1168 * @vcpu: The VCPU pointer
1169 * @ret: Pointer to write optional return code
1170 *
1171 * Returns: true if the VCPU needs to return to a preemptible + interruptible
1172 * kernel context and skip guest entry.
1173 *
1174 * This function disambiguates between two different types of exits: exits to a
1175 * preemptible + interruptible kernel context and exits to userspace. For an
1176 * exit to userspace, this function will write the return code to ret and return
1177 * true. For an exit to preemptible + interruptible kernel context (i.e. check
1178 * for pending work and re-enter), return true without writing to ret.
1179 */
1180 static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret)
1181 {
1182 struct kvm_run *run = vcpu->run;
1183
1184 /*
1185 * If we're using a userspace irqchip, then check if we need
1186 * to tell a userspace irqchip about timer or PMU level
1187 * changes and if so, exit to userspace (the actual level
1188 * state gets updated in kvm_timer_update_run and
1189 * kvm_pmu_update_run below).
1190 */
1191 if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
1192 if (kvm_timer_should_notify_user(vcpu) ||
1193 kvm_pmu_should_notify_user(vcpu)) {
1194 *ret = -EINTR;
1195 run->exit_reason = KVM_EXIT_INTR;
1196 return true;
1197 }
1198 }
1199
1200 if (unlikely(vcpu_on_unsupported_cpu(vcpu))) {
1201 run->exit_reason = KVM_EXIT_FAIL_ENTRY;
1202 run->fail_entry.hardware_entry_failure_reason = KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED;
1203 run->fail_entry.cpu = smp_processor_id();
1204 *ret = 0;
1205 return true;
1206 }
1207
1208 return kvm_request_pending(vcpu) ||
1209 xfer_to_guest_mode_work_pending();
1210 }
1211
1212 /*
1213 * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while
1214 * the vCPU is running.
1215 *
1216 * This must be noinstr as instrumentation may make use of RCU, and this is not
1217 * safe during the EQS.
1218 */
1219 static int noinstr kvm_arm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
1220 {
1221 int ret;
1222
1223 guest_state_enter_irqoff();
1224 ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu);
1225 guest_state_exit_irqoff();
1226
1227 return ret;
1228 }
1229
1230 /**
1231 * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
1232 * @vcpu: The VCPU pointer
1233 *
1234 * This function is called through the VCPU_RUN ioctl called from user space. It
1235 * will execute VM code in a loop until the time slice for the process is used
1236 * or some emulation is needed from user space in which case the function will
1237 * return with return value 0 and with the kvm_run structure filled in with the
1238 * required data for the requested emulation.
1239 */
1240 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
1241 {
1242 struct kvm_run *run = vcpu->run;
1243 int ret;
1244
1245 if (run->exit_reason == KVM_EXIT_MMIO) {
1246 ret = kvm_handle_mmio_return(vcpu);
1247 if (ret <= 0)
1248 return ret;
1249 }
1250
1251 vcpu_load(vcpu);
1252
1253 if (!vcpu->wants_to_run) {
1254 ret = -EINTR;
1255 goto out;
1256 }
1257
1258 kvm_sigset_activate(vcpu);
1259
1260 ret = 1;
1261 run->exit_reason = KVM_EXIT_UNKNOWN;
1262 run->flags = 0;
1263 while (ret > 0) {
1264 /*
1265 * Check conditions before entering the guest
1266 */
1267 ret = kvm_xfer_to_guest_mode_handle_work(vcpu);
1268 if (!ret)
1269 ret = 1;
1270
1271 if (ret > 0)
1272 ret = check_vcpu_requests(vcpu);
1273
1274 /*
1275 * Preparing the interrupts to be injected also
1276 * involves poking the GIC, which must be done in a
1277 * non-preemptible context.
1278 */
1279 preempt_disable();
1280
1281 kvm_nested_flush_hwstate(vcpu);
1282
1283 if (kvm_vcpu_has_pmu(vcpu))
1284 kvm_pmu_flush_hwstate(vcpu);
1285
1286 local_irq_disable();
1287
1288 kvm_vgic_flush_hwstate(vcpu);
1289
1290 kvm_pmu_update_vcpu_events(vcpu);
1291
1292 /*
1293 * Ensure we set mode to IN_GUEST_MODE after we disable
1294 * interrupts and before the final VCPU requests check.
1295 * See the comment in kvm_vcpu_exiting_guest_mode() and
1296 * Documentation/virt/kvm/vcpu-requests.rst
1297 */
1298 smp_store_mb(vcpu->mode, IN_GUEST_MODE);
1299
1300 if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) {
1301 vcpu->mode = OUTSIDE_GUEST_MODE;
1302 isb(); /* Ensure work in x_flush_hwstate is committed */
1303 if (kvm_vcpu_has_pmu(vcpu))
1304 kvm_pmu_sync_hwstate(vcpu);
1305 if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
1306 kvm_timer_sync_user(vcpu);
1307 kvm_vgic_sync_hwstate(vcpu);
1308 local_irq_enable();
1309 preempt_enable();
1310 continue;
1311 }
1312
1313 kvm_arch_vcpu_ctxflush_fp(vcpu);
1314
1315 /**************************************************************
1316 * Enter the guest
1317 */
1318 trace_kvm_entry(*vcpu_pc(vcpu));
1319 guest_timing_enter_irqoff();
1320
1321 ret = kvm_arm_vcpu_enter_exit(vcpu);
1322
1323 vcpu->mode = OUTSIDE_GUEST_MODE;
1324 vcpu->stat.exits++;
1325 /*
1326 * Back from guest
1327 *************************************************************/
1328
1329 /*
1330 * We must sync the PMU state before the vgic state so
1331 * that the vgic can properly sample the updated state of the
1332 * interrupt line.
1333 */
1334 if (kvm_vcpu_has_pmu(vcpu))
1335 kvm_pmu_sync_hwstate(vcpu);
1336
1337 /*
1338 * Sync the vgic state before syncing the timer state because
1339 * the timer code needs to know if the virtual timer
1340 * interrupts are active.
1341 */
1342 kvm_vgic_sync_hwstate(vcpu);
1343
1344 /*
1345 * Sync the timer hardware state before enabling interrupts as
1346 * we don't want vtimer interrupts to race with syncing the
1347 * timer virtual interrupt state.
1348 */
1349 if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
1350 kvm_timer_sync_user(vcpu);
1351
1352 if (is_hyp_ctxt(vcpu))
1353 kvm_timer_sync_nested(vcpu);
1354
1355 kvm_arch_vcpu_ctxsync_fp(vcpu);
1356
1357 /*
1358 * We must ensure that any pending interrupts are taken before
1359 * we exit guest timing so that timer ticks are accounted as
1360 * guest time. Transiently unmask interrupts so that any
1361 * pending interrupts are taken.
1362 *
1363 * Per ARM DDI 0487G.b section D1.13.4, an ISB (or other
1364 * context synchronization event) is necessary to ensure that
1365 * pending interrupts are taken.
1366 */
1367 if (ARM_EXCEPTION_CODE(ret) == ARM_EXCEPTION_IRQ) {
1368 local_irq_enable();
1369 isb();
1370 local_irq_disable();
1371 }
1372
1373 guest_timing_exit_irqoff();
1374
1375 local_irq_enable();
1376
1377 trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
1378
1379 /* Exit types that need handling before we can be preempted */
1380 handle_exit_early(vcpu, ret);
1381
1382 kvm_nested_sync_hwstate(vcpu);
1383
1384 preempt_enable();
1385
1386 /*
1387 * The ARMv8 architecture doesn't give the hypervisor
1388 * a mechanism to prevent a guest from dropping to AArch32 EL0
1389 * if implemented by the CPU. If we spot the guest in such
1390 * state and that we decided it wasn't supposed to do so (like
1391 * with the asymmetric AArch32 case), return to userspace with
1392 * a fatal error.
1393 */
1394 if (vcpu_mode_is_bad_32bit(vcpu)) {
1395 /*
1396 * As we have caught the guest red-handed, decide that
1397 * it isn't fit for purpose anymore by making the vcpu
1398 * invalid. The VMM can try and fix it by issuing a
1399 * KVM_ARM_VCPU_INIT if it really wants to.
1400 */
1401 vcpu_clear_flag(vcpu, VCPU_INITIALIZED);
1402 ret = ARM_EXCEPTION_IL;
1403 }
1404
1405 ret = handle_exit(vcpu, ret);
1406 }
1407
1408 /* Tell userspace about in-kernel device output levels */
1409 if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
1410 kvm_timer_update_run(vcpu);
1411 kvm_pmu_update_run(vcpu);
1412 }
1413
1414 kvm_sigset_deactivate(vcpu);
1415
1416 out:
1417 /*
1418 * In the unlikely event that we are returning to userspace
1419 * with pending exceptions or PC adjustment, commit these
1420 * adjustments in order to give userspace a consistent view of
1421 * the vcpu state. Note that this relies on __kvm_adjust_pc()
1422 * being preempt-safe on VHE.
1423 */
1424 if (unlikely(vcpu_get_flag(vcpu, PENDING_EXCEPTION) ||
1425 vcpu_get_flag(vcpu, INCREMENT_PC)))
1426 kvm_call_hyp(__kvm_adjust_pc, vcpu);
1427
1428 vcpu_put(vcpu);
1429 return ret;
1430 }
1431
1432 static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
1433 {
1434 int bit_index;
1435 bool set;
1436 unsigned long *hcr;
1437
1438 if (number == KVM_ARM_IRQ_CPU_IRQ)
1439 bit_index = __ffs(HCR_VI);
1440 else /* KVM_ARM_IRQ_CPU_FIQ */
1441 bit_index = __ffs(HCR_VF);
1442
1443 hcr = vcpu_hcr(vcpu);
1444 if (level)
1445 set = test_and_set_bit(bit_index, hcr);
1446 else
1447 set = test_and_clear_bit(bit_index, hcr);
1448
1449 /*
1450 * If we didn't change anything, no need to wake up or kick other CPUs
1451 */
1452 if (set == level)
1453 return 0;
1454
1455 /*
1456 * The vcpu irq_lines field was updated, wake up sleeping VCPUs and
1457 * trigger a world-switch round on the running physical CPU to set the
1458 * virtual IRQ/FIQ fields in the HCR appropriately.
1459 */
1460 kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
1461 kvm_vcpu_kick(vcpu);
1462
1463 return 0;
1464 }
1465
1466 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
1467 bool line_status)
1468 {
1469 unsigned int irq_type, vcpu_id, irq_num;
1470 struct kvm_vcpu *vcpu = NULL;
1471 bool level = irq_level->level;
1472 u32 irq = irq_level->irq;
1473 unsigned long *mask;
1474
1475 irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
1476 vcpu_id = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
1477 vcpu_id += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1);
1478 irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK;
1479
1480 trace_kvm_irq_line(irq_type, vcpu_id, irq_num, irq_level->level);
1481
1482 switch (irq_type) {
1483 case KVM_ARM_IRQ_TYPE_CPU:
1484 if (irqchip_in_kernel(kvm))
1485 return -ENXIO;
1486
1487 vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
1488 if (!vcpu)
1489 return -EINVAL;
1490
1491 if (irq_num > KVM_ARM_IRQ_CPU_FIQ)
1492 return -EINVAL;
1493
1494 return vcpu_interrupt_line(vcpu, irq_num, level);
1495 case KVM_ARM_IRQ_TYPE_PPI:
1496 if (!irqchip_in_kernel(kvm))
1497 return -ENXIO;
1498
1499 vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
1500 if (!vcpu)
1501 return -EINVAL;
1502
1503 if (vgic_is_v5(kvm)) {
1504 if (irq_num >= VGIC_V5_NR_PRIVATE_IRQS)
1505 return -EINVAL;
1506
1507 /*
1508 * Only allow PPIs that are explicitly exposed to
1509 * userspace to be driven via KVM_IRQ_LINE
1510 */
1511 mask = kvm->arch.vgic.gicv5_vm.userspace_ppis;
1512 if (!test_bit(irq_num, mask))
1513 return -EINVAL;
1514
1515 /* Build a GICv5-style IntID here */
1516 irq_num = vgic_v5_make_ppi(irq_num);
1517 } else if (irq_num < VGIC_NR_SGIS ||
1518 irq_num >= VGIC_NR_PRIVATE_IRQS) {
1519 return -EINVAL;
1520 }
1521
1522 return kvm_vgic_inject_irq(kvm, vcpu, irq_num, level, NULL);
1523 case KVM_ARM_IRQ_TYPE_SPI:
1524 if (!irqchip_in_kernel(kvm))
1525 return -ENXIO;
1526
1527 if (vgic_is_v5(kvm)) {
1528 /* Build a GICv5-style IntID here */
1529 irq_num = vgic_v5_make_spi(irq_num);
1530 } else {
1531 if (irq_num < VGIC_NR_PRIVATE_IRQS)
1532 return -EINVAL;
1533 }
1534
1535 return kvm_vgic_inject_irq(kvm, NULL, irq_num, level, NULL);
1536 }
1537
1538 return -EINVAL;
1539 }
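/*
 * Illustrative userspace sketch, not part of this file: injecting a
 * level-triggered SPI with KVM_IRQ_LINE, using the field encoding that
 * is decoded above. The vcpu index fields stay at zero since SPIs are
 * not targeted at a particular vCPU.
 *
 *	struct kvm_irq_level line = {
 *		.irq	= (KVM_ARM_IRQ_TYPE_SPI << KVM_ARM_IRQ_TYPE_SHIFT) |
 *			  (40 << KVM_ARM_IRQ_NUM_SHIFT),	// SPI INTID 40
 *		.level	= 1,
 *	};
 *
 *	if (ioctl(vm_fd, KVM_IRQ_LINE, &line))
 *		err(1, "KVM_IRQ_LINE");
 */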
1540
1541 static unsigned long system_supported_vcpu_features(void)
1542 {
1543 unsigned long features = KVM_VCPU_VALID_FEATURES;
1544
1545 if (!cpus_have_final_cap(ARM64_HAS_32BIT_EL1))
1546 clear_bit(KVM_ARM_VCPU_EL1_32BIT, &features);
1547
1548 if (!kvm_supports_guest_pmuv3())
1549 clear_bit(KVM_ARM_VCPU_PMU_V3, &features);
1550
1551 if (!system_supports_sve())
1552 clear_bit(KVM_ARM_VCPU_SVE, &features);
1553
1554 if (!kvm_has_full_ptr_auth()) {
1555 clear_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features);
1556 clear_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features);
1557 }
1558
1559 if (!cpus_have_final_cap(ARM64_HAS_NESTED_VIRT))
1560 clear_bit(KVM_ARM_VCPU_HAS_EL2, &features);
1561
1562 return features;
1563 }
1564
1565 static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu,
1566 const struct kvm_vcpu_init *init)
1567 {
1568 unsigned long features = init->features[0];
1569 int i;
1570
1571 if (features & ~KVM_VCPU_VALID_FEATURES)
1572 return -ENOENT;
1573
1574 for (i = 1; i < ARRAY_SIZE(init->features); i++) {
1575 if (init->features[i])
1576 return -ENOENT;
1577 }
1578
1579 if (features & ~system_supported_vcpu_features())
1580 return -EINVAL;
1581
1582 /*
1583 * For now make sure that both address/generic pointer authentication
1584 * features are requested by the userspace together.
1585 */
1586 if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features) !=
1587 test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features))
1588 return -EINVAL;
1589
1590 if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features))
1591 return 0;
1592
1593 /* MTE is incompatible with AArch32 */
1594 if (kvm_has_mte(vcpu->kvm))
1595 return -EINVAL;
1596
1597 /* NV is incompatible with AArch32 */
1598 if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features))
1599 return -EINVAL;
1600
1601 return 0;
1602 }
1603
1604 static bool kvm_vcpu_init_changed(struct kvm_vcpu *vcpu,
1605 const struct kvm_vcpu_init *init)
1606 {
1607 unsigned long features = init->features[0];
1608
1609 return !bitmap_equal(vcpu->kvm->arch.vcpu_features, &features,
1610 KVM_VCPU_MAX_FEATURES);
1611 }
1612
1613 static int kvm_setup_vcpu(struct kvm_vcpu *vcpu)
1614 {
1615 struct kvm *kvm = vcpu->kvm;
1616 int ret = 0;
1617
1618 /*
1619 * When the vCPU has a PMU, but no PMU is set for the guest
1620 * yet, set the default one.
1621 */
1622 if (kvm_vcpu_has_pmu(vcpu) && !kvm->arch.arm_pmu)
1623 ret = kvm_arm_set_default_pmu(kvm);
1624
1625 /* Prepare for nested if required */
1626 if (!ret && vcpu_has_nv(vcpu))
1627 ret = kvm_vcpu_init_nested(vcpu);
1628
1629 return ret;
1630 }
1631
1632 static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
1633 const struct kvm_vcpu_init *init)
1634 {
1635 unsigned long features = init->features[0];
1636 struct kvm *kvm = vcpu->kvm;
1637 int ret = -EINVAL;
1638
1639 mutex_lock(&kvm->arch.config_lock);
1640
1641 if (test_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags) &&
1642 kvm_vcpu_init_changed(vcpu, init))
1643 goto out_unlock;
1644
1645 bitmap_copy(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES);
1646
1647 ret = kvm_setup_vcpu(vcpu);
1648 if (ret)
1649 goto out_unlock;
1650
1651 /* Now we know what it is, we can reset it. */
1652 kvm_reset_vcpu(vcpu);
1653
1654 set_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags);
1655 vcpu_set_flag(vcpu, VCPU_INITIALIZED);
1656 ret = 0;
1657 out_unlock:
1658 mutex_unlock(&kvm->arch.config_lock);
1659 return ret;
1660 }
1661
1662 static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
1663 const struct kvm_vcpu_init *init)
1664 {
1665 int ret;
1666
1667 if (init->target != KVM_ARM_TARGET_GENERIC_V8 &&
1668 init->target != kvm_target_cpu())
1669 return -EINVAL;
1670
1671 ret = kvm_vcpu_init_check_features(vcpu, init);
1672 if (ret)
1673 return ret;
1674
1675 if (!kvm_vcpu_initialized(vcpu))
1676 return __kvm_vcpu_set_target(vcpu, init);
1677
1678 if (kvm_vcpu_init_changed(vcpu, init))
1679 return -EINVAL;
1680
1681 kvm_reset_vcpu(vcpu);
1682 return 0;
1683 }
1684
1685 static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
1686 struct kvm_vcpu_init *init)
1687 {
1688 bool power_off = false;
1689 int ret;
1690
1691 /*
1692 * Treat the power-off vCPU feature as ephemeral. Clear the bit to avoid
1693 * reflecting it in the finalized feature set, thus limiting its scope
1694 * to a single KVM_ARM_VCPU_INIT call.
1695 */
1696 if (init->features[0] & BIT(KVM_ARM_VCPU_POWER_OFF)) {
1697 init->features[0] &= ~BIT(KVM_ARM_VCPU_POWER_OFF);
1698 power_off = true;
1699 }
1700
1701 ret = kvm_vcpu_set_target(vcpu, init);
1702 if (ret)
1703 return ret;
1704
1705 /*
1706 * Ensure a rebooted VM will fault in RAM pages and detect if the
1707 * guest MMU is turned off and flush the caches as needed.
1708 *
1709 * S2FWB enforces all memory accesses to RAM being cacheable,
1710 * ensuring that the data side is always coherent. We still
1711 * need to invalidate the I-cache though, as FWB does *not*
1712 * imply CTR_EL0.DIC.
1713 */
1714 if (vcpu_has_run_once(vcpu)) {
1715 if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
1716 stage2_unmap_vm(vcpu->kvm);
1717 else
1718 icache_inval_all_pou();
1719 }
1720
1721 vcpu_reset_hcr(vcpu);
1722
1723 /*
1724 * Handle the "start in power-off" case.
1725 */
1726 spin_lock(&vcpu->arch.mp_state_lock);
1727
1728 if (power_off)
1729 __kvm_arm_vcpu_power_off(vcpu);
1730 else
1731 WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE);
1732
1733 spin_unlock(&vcpu->arch.mp_state_lock);
1734
1735 return 0;
1736 }
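/*
 * Illustrative userspace sketch, not part of this file: the usual VMM
 * sequence is to query the preferred target on the VM fd and then
 * initialise every vCPU with it, optionally or-ing feature bits into
 * features[0] (e.g. KVM_ARM_VCPU_POWER_OFF for secondary vCPUs).
 *
 *	struct kvm_vcpu_init init;
 *
 *	if (ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &init))
 *		err(1, "KVM_ARM_PREFERRED_TARGET");
 *	init.features[0] |= 1U << KVM_ARM_VCPU_PSCI_0_2;
 *	if (ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init))
 *		err(1, "KVM_ARM_VCPU_INIT");
 */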
1737
1738 static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
1739 struct kvm_device_attr *attr)
1740 {
1741 int ret = -ENXIO;
1742
1743 switch (attr->group) {
1744 default:
1745 ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr);
1746 break;
1747 }
1748
1749 return ret;
1750 }
1751
1752 static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
1753 struct kvm_device_attr *attr)
1754 {
1755 int ret = -ENXIO;
1756
1757 switch (attr->group) {
1758 default:
1759 ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr);
1760 break;
1761 }
1762
1763 return ret;
1764 }
1765
1766 static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
1767 struct kvm_device_attr *attr)
1768 {
1769 int ret = -ENXIO;
1770
1771 switch (attr->group) {
1772 default:
1773 ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr);
1774 break;
1775 }
1776
1777 return ret;
1778 }
1779
1780 static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
1781 struct kvm_vcpu_events *events)
1782 {
1783 memset(events, 0, sizeof(*events));
1784
1785 return __kvm_arm_vcpu_get_events(vcpu, events);
1786 }
1787
1788 static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
1789 struct kvm_vcpu_events *events)
1790 {
1791 int i;
1792
1793 /* check whether the reserved field is zero */
1794 for (i = 0; i < ARRAY_SIZE(events->reserved); i++)
1795 if (events->reserved[i])
1796 return -EINVAL;
1797
1798 /* check whether the pad field is zero */
1799 for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++)
1800 if (events->exception.pad[i])
1801 return -EINVAL;
1802
1803 return __kvm_arm_vcpu_set_events(vcpu, events);
1804 }
1805
1806 long kvm_arch_vcpu_ioctl(struct file *filp,
1807 unsigned int ioctl, unsigned long arg)
1808 {
1809 struct kvm_vcpu *vcpu = filp->private_data;
1810 void __user *argp = (void __user *)arg;
1811 struct kvm_device_attr attr;
1812 long r;
1813
1814 switch (ioctl) {
1815 case KVM_ARM_VCPU_INIT: {
1816 struct kvm_vcpu_init init;
1817
1818 r = -EFAULT;
1819 if (copy_from_user(&init, argp, sizeof(init)))
1820 break;
1821
1822 r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
1823 break;
1824 }
1825 case KVM_SET_ONE_REG:
1826 case KVM_GET_ONE_REG: {
1827 struct kvm_one_reg reg;
1828
1829 r = -ENOEXEC;
1830 if (unlikely(!kvm_vcpu_initialized(vcpu)))
1831 break;
1832
1833 r = -EFAULT;
1834 if (copy_from_user(&reg, argp, sizeof(reg)))
1835 break;
1836
1837 /*
1838 * We could owe a reset due to PSCI. Handle the pending reset
1839 * here to ensure userspace register accesses are ordered after
1840 * the reset.
1841 */
1842 if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
1843 kvm_reset_vcpu(vcpu);
1844
1845 if (ioctl == KVM_SET_ONE_REG)
1846 r = kvm_arm_set_reg(vcpu, &reg);
1847 else
1848 r = kvm_arm_get_reg(vcpu, &reg);
1849 break;
1850 }
1851 case KVM_GET_REG_LIST: {
1852 struct kvm_reg_list __user *user_list = argp;
1853 struct kvm_reg_list reg_list;
1854 unsigned n;
1855
1856 r = -ENOEXEC;
1857 if (unlikely(!kvm_vcpu_initialized(vcpu)))
1858 break;
1859
1860 r = -EPERM;
1861 if (!kvm_arm_vcpu_is_finalized(vcpu))
1862 break;
1863
1864 r = -EFAULT;
1865 if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
1866 break;
1867 n = reg_list.n;
1868 reg_list.n = kvm_arm_num_regs(vcpu);
1869 if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
1870 break;
1871 r = -E2BIG;
1872 if (n < reg_list.n)
1873 break;
1874 r = kvm_arm_copy_reg_indices(vcpu, user_list->reg);
1875 break;
1876 }
1877 case KVM_SET_DEVICE_ATTR: {
1878 r = -EFAULT;
1879 if (copy_from_user(&attr, argp, sizeof(attr)))
1880 break;
1881 r = kvm_arm_vcpu_set_attr(vcpu, &attr);
1882 break;
1883 }
1884 case KVM_GET_DEVICE_ATTR: {
1885 r = -EFAULT;
1886 if (copy_from_user(&attr, argp, sizeof(attr)))
1887 break;
1888 r = kvm_arm_vcpu_get_attr(vcpu, &attr);
1889 break;
1890 }
1891 case KVM_HAS_DEVICE_ATTR: {
1892 r = -EFAULT;
1893 if (copy_from_user(&attr, argp, sizeof(attr)))
1894 break;
1895 r = kvm_arm_vcpu_has_attr(vcpu, &attr);
1896 break;
1897 }
1898 case KVM_GET_VCPU_EVENTS: {
1899 struct kvm_vcpu_events events;
1900
1901 if (!kvm_vcpu_initialized(vcpu))
1902 return -ENOEXEC;
1903
1904 if (kvm_arm_vcpu_get_events(vcpu, &events))
1905 return -EINVAL;
1906
1907 if (copy_to_user(argp, &events, sizeof(events)))
1908 return -EFAULT;
1909
1910 return 0;
1911 }
1912 case KVM_SET_VCPU_EVENTS: {
1913 struct kvm_vcpu_events events;
1914
1915 if (!kvm_vcpu_initialized(vcpu))
1916 return -ENOEXEC;
1917
1918 if (copy_from_user(&events, argp, sizeof(events)))
1919 return -EFAULT;
1920
1921 return kvm_arm_vcpu_set_events(vcpu, &events);
1922 }
1923 case KVM_ARM_VCPU_FINALIZE: {
1924 int what;
1925
1926 if (!kvm_vcpu_initialized(vcpu))
1927 return -ENOEXEC;
1928
1929 if (get_user(what, (const int __user *)argp))
1930 return -EFAULT;
1931
1932 return kvm_arm_vcpu_finalize(vcpu, what);
1933 }
1934 default:
1935 r = -EINVAL;
1936 }
1937
1938 return r;
1939 }
1940
1941 long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
1942 unsigned long arg)
1943 {
1944 return -ENOIOCTLCMD;
1945 }
1946
1947 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
1948 {
1949
1950 }
1951
1952 static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
1953 struct kvm_arm_device_addr *dev_addr)
1954 {
1955 switch (FIELD_GET(KVM_ARM_DEVICE_ID_MASK, dev_addr->id)) {
1956 case KVM_ARM_DEVICE_VGIC_V2:
1957 if (!vgic_present)
1958 return -ENXIO;
1959 return kvm_set_legacy_vgic_v2_addr(kvm, dev_addr);
1960 default:
1961 return -ENODEV;
1962 }
1963 }
1964
1965 static int kvm_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
1966 {
1967 switch (attr->group) {
1968 case KVM_ARM_VM_SMCCC_CTRL:
1969 return kvm_vm_smccc_has_attr(kvm, attr);
1970 default:
1971 return -ENXIO;
1972 }
1973 }
1974
1975 static int kvm_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
1976 {
1977 switch (attr->group) {
1978 case KVM_ARM_VM_SMCCC_CTRL:
1979 return kvm_vm_smccc_set_attr(kvm, attr);
1980 default:
1981 return -ENXIO;
1982 }
1983 }
1984
1985 int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
1986 {
1987 struct kvm *kvm = filp->private_data;
1988 void __user *argp = (void __user *)arg;
1989 struct kvm_device_attr attr;
1990
1991 if (is_protected_kvm_enabled() && !kvm_pkvm_ioctl_allowed(kvm, ioctl))
1992 return -EINVAL;
1993
1994 switch (ioctl) {
1995 case KVM_CREATE_IRQCHIP: {
1996 int ret;
1997 if (!vgic_present)
1998 return -ENXIO;
1999 mutex_lock(&kvm->lock);
2000 ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
2001 mutex_unlock(&kvm->lock);
2002 return ret;
2003 }
2004 case KVM_ARM_SET_DEVICE_ADDR: {
2005 struct kvm_arm_device_addr dev_addr;
2006
2007 if (copy_from_user(&dev_addr, argp, sizeof(dev_addr)))
2008 return -EFAULT;
2009 return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
2010 }
2011 case KVM_ARM_PREFERRED_TARGET: {
2012 struct kvm_vcpu_init init = {
2013 .target = KVM_ARM_TARGET_GENERIC_V8,
2014 };
2015
2016 if (copy_to_user(argp, &init, sizeof(init)))
2017 return -EFAULT;
2018
2019 return 0;
2020 }
2021 case KVM_ARM_MTE_COPY_TAGS: {
2022 struct kvm_arm_copy_mte_tags copy_tags;
2023
2024 if (copy_from_user(&copy_tags, argp, sizeof(copy_tags)))
2025 return -EFAULT;
2026 return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags);
2027 }
2028 case KVM_ARM_SET_COUNTER_OFFSET: {
2029 struct kvm_arm_counter_offset offset;
2030
2031 if (copy_from_user(&offset, argp, sizeof(offset)))
2032 return -EFAULT;
2033 return kvm_vm_ioctl_set_counter_offset(kvm, &offset);
2034 }
2035 case KVM_HAS_DEVICE_ATTR: {
2036 if (copy_from_user(&attr, argp, sizeof(attr)))
2037 return -EFAULT;
2038
2039 return kvm_vm_has_attr(kvm, &attr);
2040 }
2041 case KVM_SET_DEVICE_ATTR: {
2042 if (copy_from_user(&attr, argp, sizeof(attr)))
2043 return -EFAULT;
2044
2045 return kvm_vm_set_attr(kvm, &attr);
2046 }
2047 case KVM_ARM_GET_REG_WRITABLE_MASKS: {
2048 struct reg_mask_range range;
2049
2050 if (copy_from_user(&range, argp, sizeof(range)))
2051 return -EFAULT;
2052 return kvm_vm_ioctl_get_reg_writable_masks(kvm, &range);
2053 }
2054 default:
2055 return -EINVAL;
2056 }
2057 }
2058
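/*
 * Size of the nVHE hypervisor's per-CPU region, as delimited by its
 * __per_cpu_start/__per_cpu_end linker symbols.
 */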
2059 static unsigned long nvhe_percpu_size(void)
2060 {
2061 return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
2062 (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start);
2063 }
2064
2065 static unsigned long nvhe_percpu_order(void)
2066 {
2067 unsigned long size = nvhe_percpu_size();
2068
2069 return size ? get_order(size) : 0;
2070 }
2071
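/* Page allocation order for one CPU's host SVE state in protected mode. */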
2072 static size_t pkvm_host_sve_state_order(void)
2073 {
2074 return get_order(pkvm_host_sve_state_size());
2075 }
2076
2077 /* A lookup table holding the hypervisor VA for each vector slot */
2078 static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS];
2079
2080 static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot)
2081 {
2082 hyp_spectre_vector_selector[slot] = __kvm_vector_slot2addr(base, slot);
2083 }
2084
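/*
 * Record the hyp VA of each vector variant (direct, Spectre-hardened, and
 * the indirect flavours mapped next to the idmap) in the selector table.
 */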
2085 static int kvm_init_vector_slots(void)
2086 {
2087 int err;
2088 void *base;
2089
2090 base = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
2091 kvm_init_vector_slot(base, HYP_VECTOR_DIRECT);
2092
2093 base = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs));
2094 kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_DIRECT);
2095
2096 if (kvm_system_needs_idmapped_vectors() &&
2097 !is_protected_kvm_enabled()) {
2098 err = create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs),
2099 __BP_HARDEN_HYP_VECS_SZ, &base);
2100 if (err)
2101 return err;
2102 }
2103
2104 kvm_init_vector_slot(base, HYP_VECTOR_INDIRECT);
2105 kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_INDIRECT);
2106 return 0;
2107 }
2108
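/*
 * Compute this CPU's EL2 init parameters (tpidr_el2, mair, tcr, hcr, pgd)
 * and clean them to the PoC so the EL2 init code can read them with the
 * MMU off.
 */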
2109 static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
2110 {
2111 struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
2112 unsigned long tcr;
2113
2114 /*
2115 * Calculate the raw per-cpu offset without a translation from the
2116 * kernel's mapping to the linear mapping, and store it in tpidr_el2
2117 * so that we can use adr_l to access per-cpu variables in EL2.
2118 * Also drop the KASAN tag which gets in the way...
2119 */
2120 params->tpidr_el2 = (unsigned long)kasan_reset_tag(per_cpu_ptr_nvhe_sym(__per_cpu_start, cpu)) -
2121 (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
2122
2123 params->mair_el2 = read_sysreg(mair_el1);
2124
2125 tcr = read_sysreg(tcr_el1);
2126 if (cpus_have_final_cap(ARM64_KVM_HVHE)) {
2127 tcr &= ~(TCR_HD | TCR_HA | TCR_A1 | TCR_T0SZ_MASK);
2128 tcr |= TCR_EPD1_MASK;
2129 } else {
2130 unsigned long ips = FIELD_GET(TCR_IPS_MASK, tcr);
2131
2132 tcr &= TCR_EL2_MASK;
2133 tcr |= TCR_EL2_RES1 | FIELD_PREP(TCR_EL2_PS_MASK, ips);
2134 if (lpa2_is_enabled())
2135 tcr |= TCR_EL2_DS;
2136 }
2137 tcr |= TCR_T0SZ(hyp_va_bits);
2138 params->tcr_el2 = tcr;
2139
2140 params->pgd_pa = kvm_mmu_get_httbr();
2141 if (is_protected_kvm_enabled())
2142 params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
2143 else
2144 params->hcr_el2 = HCR_HOST_NVHE_FLAGS;
2145
2146 if (system_supports_mte())
2147 params->hcr_el2 |= HCR_ATA;
2148 else
2149 params->hcr_el2 |= HCR_TID5;
2150
2151 if (cpus_have_final_cap(ARM64_KVM_HVHE))
2152 params->hcr_el2 |= HCR_E2H;
2153 params->vttbr = params->vtcr = 0;
2154
2155 /*
2156 * Flush the init params from the data cache because the struct will
2157 * be read while the MMU is off.
2158 */
2159 kvm_flush_dcache_to_poc(params, sizeof(*params));
2160 }
2161
2162 static void hyp_install_host_vector(void)
2163 {
2164 struct kvm_nvhe_init_params *params;
2165 struct arm_smccc_res res;
2166
2167 /* Switch from the HYP stub to our own HYP init vector */
2168 __hyp_set_vectors(kvm_get_idmap_vector());
2169
2170 /*
2171 * Call initialization code, and switch to the full blown HYP code.
2172 * If the cpucaps haven't been finalized yet, something has gone very
2173 * wrong, and hyp will crash and burn when it uses any
2174 * cpus_have_*_cap() wrapper.
2175 */
2176 BUG_ON(!system_capabilities_finalized());
2177 params = this_cpu_ptr_nvhe_sym(kvm_init_params);
2178 arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res);
2179 WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
2180 }
2181
2182 static void cpu_init_hyp_mode(void)
2183 {
2184 hyp_install_host_vector();
2185
2186 /*
2187 * Disabling SSBD on a non-VHE system requires us to enable SSBS
2188 * at EL2.
2189 */
2190 if (this_cpu_has_cap(ARM64_SSBS) &&
2191 arm64_get_spectre_v4_state() == SPECTRE_VULNERABLE) {
2192 kvm_call_hyp_nvhe(__kvm_enable_ssbs);
2193 }
2194 }
2195
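/* Return this CPU's EL2 vectors to the hyp stub (nVHE only). */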
2196 static void cpu_hyp_reset(void)
2197 {
2198 if (!is_kernel_in_hyp_mode())
2199 __hyp_reset_vectors();
2200 }
2201
2202 /*
2203 * EL2 vectors can be mapped and rerouted in a number of ways,
2204 * depending on the kernel configuration and CPU present:
2205 *
2206 * - If the CPU is affected by Spectre-v2, the hardening sequence is
2207 * placed in one of the vector slots, which is executed before jumping
2208 * to the real vectors.
2209 *
2210 * - If the CPU also has the ARM64_SPECTRE_V3A cap, the slot
2211 * containing the hardening sequence is mapped next to the idmap page,
2212 * and executed before jumping to the real vectors.
2213 *
2214 * - If the CPU only has the ARM64_SPECTRE_V3A cap, then an
2215 * empty slot is selected, mapped next to the idmap page, and
2216 * executed before jumping to the real vectors.
2217 *
2218 * Note that ARM64_SPECTRE_V3A is somewhat incompatible with
2219 * VHE, as we don't have hypervisor-specific mappings. If the system
2220 * is VHE and yet selects this capability, it will be ignored.
2221 */
2222 static void cpu_set_hyp_vector(void)
2223 {
2224 struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);
2225 void *vector = hyp_spectre_vector_selector[data->slot];
2226
2227 if (!is_protected_kvm_enabled())
2228 *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector;
2229 else
2230 kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot);
2231 }
2232
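/*
 * Per-CPU hyp setup: initialise the host context and debug state, and on
 * nVHE switch this CPU from the hyp stub to KVM's own EL2 code.
 */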
2233 static void cpu_hyp_init_context(void)
2234 {
2235 kvm_init_host_cpu_context(host_data_ptr(host_ctxt));
2236 kvm_init_host_debug_data();
2237
2238 if (!is_kernel_in_hyp_mode())
2239 cpu_init_hyp_mode();
2240 }
2241
2242 static void cpu_hyp_init_features(void)
2243 {
2244 cpu_set_hyp_vector();
2245
2246 if (is_kernel_in_hyp_mode()) {
2247 kvm_timer_init_vhe();
2248 kvm_debug_init_vhe();
2249 }
2250
2251 if (vgic_present)
2252 kvm_vgic_init_cpu_hardware();
2253 }
2254
2255 static void cpu_hyp_reinit(void)
2256 {
2257 cpu_hyp_reset();
2258 cpu_hyp_init_context();
2259 cpu_hyp_init_features();
2260 }
2261
2262 static void cpu_hyp_init(void *discard)
2263 {
2264 if (!__this_cpu_read(kvm_hyp_initialized)) {
2265 cpu_hyp_reinit();
2266 __this_cpu_write(kvm_hyp_initialized, 1);
2267 }
2268 }
2269
2270 static void cpu_hyp_uninit(void *discard)
2271 {
2272 if (!is_protected_kvm_enabled() && __this_cpu_read(kvm_hyp_initialized)) {
2273 cpu_hyp_reset();
2274 __this_cpu_write(kvm_hyp_initialized, 0);
2275 }
2276 }
2277
2278 int kvm_arch_enable_virtualization_cpu(void)
2279 {
2280 /*
2281 * Most calls to this function are made with migration
2282 * disabled, but not with preemption disabled. The former is
2283 * enough to ensure correctness, but most of the helpers
2284 * expect the latter and will throw a tantrum otherwise.
2285 */
2286 preempt_disable();
2287
2288 cpu_hyp_init(NULL);
2289
2290 kvm_vgic_cpu_up();
2291 kvm_timer_cpu_up();
2292
2293 preempt_enable();
2294
2295 return 0;
2296 }
2297
2298 void kvm_arch_disable_virtualization_cpu(void)
2299 {
2300 kvm_timer_cpu_down();
2301 kvm_vgic_cpu_down();
2302
2303 if (!is_protected_kvm_enabled())
2304 cpu_hyp_uninit(NULL);
2305 }
2306
2307 #ifdef CONFIG_CPU_PM
2308 static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
2309 unsigned long cmd,
2310 void *v)
2311 {
2312 /*
2313 * kvm_hyp_initialized is left with its old value over
2314 * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
2315 * re-enable hyp.
2316 */
2317 switch (cmd) {
2318 case CPU_PM_ENTER:
2319 if (__this_cpu_read(kvm_hyp_initialized))
2320 /*
2321 * don't update kvm_hyp_initialized here
2322 * so that the hyp will be re-enabled
2323 * when we resume. See below.
2324 */
2325 cpu_hyp_reset();
2326
2327 return NOTIFY_OK;
2328 case CPU_PM_ENTER_FAILED:
2329 case CPU_PM_EXIT:
2330 if (__this_cpu_read(kvm_hyp_initialized))
2331 /* The hyp was enabled before suspend. */
2332 cpu_hyp_reinit();
2333
2334 return NOTIFY_OK;
2335
2336 default:
2337 return NOTIFY_DONE;
2338 }
2339 }
2340
2341 static struct notifier_block hyp_init_cpu_pm_nb = {
2342 .notifier_call = hyp_init_cpu_pm_notifier,
2343 };
2344
2345 static void __init hyp_cpu_pm_init(void)
2346 {
2347 if (!is_protected_kvm_enabled())
2348 cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
2349 }
2350 static void __init hyp_cpu_pm_exit(void)
2351 {
2352 if (!is_protected_kvm_enabled())
2353 cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
2354 }
2355 #else
2356 static inline void __init hyp_cpu_pm_init(void)
2357 {
2358 }
2359 static inline void __init hyp_cpu_pm_exit(void)
2360 {
2361 }
2362 #endif
2363
2364 static void __init init_cpu_logical_map(void)
2365 {
2366 unsigned int cpu;
2367
2368 /*
2369 * Copy the MPIDR <-> logical CPU ID mapping to hyp.
2370 * Only copy the set of online CPUs whose features have been checked
2371 * against the finalized system capabilities. The hypervisor will not
2372 * allow any other CPUs from the `possible` set to boot.
2373 */
2374 for_each_online_cpu(cpu)
2375 hyp_cpu_logical_map[cpu] = cpu_logical_map(cpu);
2376 }
2377
2378 #define init_psci_0_1_impl_state(config, what) \
2379 config.psci_0_1_ ## what ## _implemented = psci_ops.what
2380
2381 static bool __init init_psci_relay(void)
2382 {
2383 /*
2384 * If PSCI has not been initialized, protected KVM cannot install
2385 * itself on newly booted CPUs.
2386 */
2387 if (!psci_ops.get_version) {
2388 kvm_err("Cannot initialize protected mode without PSCI\n");
2389 return false;
2390 }
2391
2392 kvm_host_psci_config.version = psci_ops.get_version();
2393 kvm_host_psci_config.smccc_version = arm_smccc_get_version();
2394
2395 if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) {
2396 kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids();
2397 init_psci_0_1_impl_state(kvm_host_psci_config, cpu_suspend);
2398 init_psci_0_1_impl_state(kvm_host_psci_config, cpu_on);
2399 init_psci_0_1_impl_state(kvm_host_psci_config, cpu_off);
2400 init_psci_0_1_impl_state(kvm_host_psci_config, migrate);
2401 }
2402 return true;
2403 }
2404
2405 static int __init init_subsystems(void)
2406 {
2407 int err = 0;
2408
2409 /*
2410 * Enable hardware so that subsystem initialisation can access EL2.
2411 */
2412 on_each_cpu(cpu_hyp_init, NULL, 1);
2413
2414 /*
2415 * Register the CPU low-power (PM) notifier
2416 */
2417 hyp_cpu_pm_init();
2418
2419 /*
2420 * Init HYP view of VGIC
2421 */
2422 err = kvm_vgic_hyp_init();
2423 switch (err) {
2424 case 0:
2425 vgic_present = true;
2426 break;
2427 case -ENODEV:
2428 case -ENXIO:
2429 /*
2430 * No VGIC? No pKVM for you.
2431 *
2432 * Protected mode assumes that VGICv3 is present, so no point
2433 * in trying to hobble along if vgic initialization fails.
2434 */
2435 if (is_protected_kvm_enabled())
2436 goto out;
2437
2438 /*
2439 * Otherwise, userspace could choose to implement a GIC for its
2440 * guest on non-cooperative hardware.
2441 */
2442 vgic_present = false;
2443 err = 0;
2444 break;
2445 default:
2446 goto out;
2447 }
2448
2449 if (kvm_mode == KVM_MODE_NV &&
2450 !(vgic_present && (kvm_vgic_global_state.type == VGIC_V3 ||
2451 kvm_vgic_global_state.has_gcie_v3_compat))) {
2452 kvm_err("NV support requires GICv3 or GICv5 with legacy support, giving up\n");
2453 err = -EINVAL;
2454 goto out;
2455 }
2456
2457 /*
2458 * Init HYP architected timer support
2459 */
2460 err = kvm_timer_hyp_init(vgic_present);
2461 if (err)
2462 goto out;
2463
2464 kvm_register_perf_callbacks();
2465
2466 err = kvm_hyp_trace_init();
2467 if (err)
2468 kvm_err("Failed to initialize Hyp tracing\n");
2469
2470 out:
2471 if (err)
2472 hyp_cpu_pm_exit();
2473
2474 if (err || !is_protected_kvm_enabled())
2475 on_each_cpu(cpu_hyp_uninit, NULL, 1);
2476
2477 return err;
2478 }
2479
2480 static void __init teardown_subsystems(void)
2481 {
2482 kvm_unregister_perf_callbacks();
2483 hyp_cpu_pm_exit();
2484 }
2485
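/*
 * Free the hyp page tables and, for any CPU that never made it into hyp,
 * its stack, SVE state and per-CPU pages allocated by init_hyp_mode().
 */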
2486 static void __init teardown_hyp_mode(void)
2487 {
2488 bool free_sve = system_supports_sve() && is_protected_kvm_enabled();
2489 int cpu;
2490
2491 free_hyp_pgds();
2492 for_each_possible_cpu(cpu) {
2493 if (per_cpu(kvm_hyp_initialized, cpu))
2494 continue;
2495
2496 free_pages(per_cpu(kvm_arm_hyp_stack_base, cpu), NVHE_STACK_SHIFT - PAGE_SHIFT);
2497
2498 if (!kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu])
2499 continue;
2500
2501 if (free_sve) {
2502 struct cpu_sve_state *sve_state;
2503
2504 sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state;
2505 free_pages((unsigned long) sve_state, pkvm_host_sve_state_order());
2506 }
2507
2508 free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order());
2509
2510 }
2511 }
2512
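/*
 * Hand the pKVM memory pool and per-CPU bases over to EL2 and run the
 * one-off __pkvm_init sequence on the current CPU.
 */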
2513 static int __init do_pkvm_init(u32 hyp_va_bits)
2514 {
2515 void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base));
2516 int ret;
2517
2518 preempt_disable();
2519 cpu_hyp_init_context();
2520 ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size,
2521 kern_hyp_va(per_cpu_base),
2522 hyp_va_bits);
2523 cpu_hyp_init_features();
2524
2525 /*
2526 * The stub hypercalls are now disabled, so set our local flag to
2527 * prevent a later re-init attempt in kvm_arch_enable_virtualization_cpu().
2528 */
2529 __this_cpu_write(kvm_hyp_initialized, 1);
2530 preempt_enable();
2531
2532 return ret;
2533 }
2534
2535 static u64 get_hyp_id_aa64pfr0_el1(void)
2536 {
2537 /*
2538 * Track whether the system isn't affected by spectre/meltdown in the
2539 * hypervisor's view of id_aa64pfr0_el1, used for protected VMs.
2540 * Although this is per-CPU, we make it global for simplicity, e.g., not
2541 * to have to worry about vcpu migration.
2542 *
2543 * Unlike for non-protected VMs, userspace cannot override this for
2544 * protected VMs.
2545 */
2546 u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
2547
2548 val &= ~(ID_AA64PFR0_EL1_CSV2 |
2549 ID_AA64PFR0_EL1_CSV3);
2550
2551 val |= FIELD_PREP(ID_AA64PFR0_EL1_CSV2,
2552 arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED);
2553 val |= FIELD_PREP(ID_AA64PFR0_EL1_CSV3,
2554 arm64_get_meltdown_state() == SPECTRE_UNAFFECTED);
2555
2556 return val;
2557 }
2558
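/*
 * Propagate the host's sanitised ID register values and FGT masks to the
 * nVHE hypervisor's copies of those symbols.
 */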
2559 static void kvm_hyp_init_symbols(void)
2560 {
2561 kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = get_hyp_id_aa64pfr0_el1();
2562 kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
2563 kvm_nvhe_sym(id_aa64pfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR2_EL1);
2564 kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1);
2565 kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1);
2566 kvm_nvhe_sym(id_aa64isar2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);
2567 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
2568 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
2569 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);
2570 kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1);
2571 kvm_nvhe_sym(__icache_flags) = __icache_flags;
2572 kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;
2573
2574 /* Propagate the FGT state to the nVHE side */
2575 kvm_nvhe_sym(hfgrtr_masks) = hfgrtr_masks;
2576 kvm_nvhe_sym(hfgwtr_masks) = hfgwtr_masks;
2577 kvm_nvhe_sym(hfgitr_masks) = hfgitr_masks;
2578 kvm_nvhe_sym(hdfgrtr_masks) = hdfgrtr_masks;
2579 kvm_nvhe_sym(hdfgwtr_masks) = hdfgwtr_masks;
2580 kvm_nvhe_sym(hafgrtr_masks) = hafgrtr_masks;
2581 kvm_nvhe_sym(hfgrtr2_masks) = hfgrtr2_masks;
2582 kvm_nvhe_sym(hfgwtr2_masks) = hfgwtr2_masks;
2583 kvm_nvhe_sym(hfgitr2_masks) = hfgitr2_masks;
2584 kvm_nvhe_sym(hdfgrtr2_masks) = hdfgrtr2_masks;
2585 kvm_nvhe_sym(hdfgwtr2_masks) = hdfgwtr2_masks;
2586 kvm_nvhe_sym(ich_hfgrtr_masks) = ich_hfgrtr_masks;
2587 kvm_nvhe_sym(ich_hfgwtr_masks) = ich_hfgwtr_masks;
2588 kvm_nvhe_sym(ich_hfgitr_masks) = ich_hfgitr_masks;
2589
2590 /*
2591 * Flush entire BSS since part of its data containing init symbols is read
2592 * while the MMU is off.
2593 */
2594 kvm_flush_dcache_to_poc(kvm_ksym_ref(__hyp_bss_start),
2595 kvm_ksym_ref(__hyp_bss_end) - kvm_ksym_ref(__hyp_bss_start));
2596 }
2597
2598 static int __init kvm_hyp_init_protection(u32 hyp_va_bits)
2599 {
2600 void *addr = phys_to_virt(hyp_mem_base);
2601 int ret;
2602
2603 ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP);
2604 if (ret)
2605 return ret;
2606
2607 ret = do_pkvm_init(hyp_va_bits);
2608 if (ret)
2609 return ret;
2610
2611 free_hyp_pgds();
2612
2613 return 0;
2614 }
2615
2616 static int init_pkvm_host_sve_state(void)
2617 {
2618 int cpu;
2619
2620 if (!system_supports_sve())
2621 return 0;
2622
2623 /* Allocate pages for host sve state in protected mode. */
2624 for_each_possible_cpu(cpu) {
2625 struct page *page = alloc_pages(GFP_KERNEL, pkvm_host_sve_state_order());
2626
2627 if (!page)
2628 return -ENOMEM;
2629
2630 per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = page_address(page);
2631 }
2632
2633 /*
2634 * Don't map the pages in hyp since these are only used in protected
2635 * mode, which will (re)create its own mapping when initialized.
2636 */
2637
2638 return 0;
2639 }
2640
2641 /*
2642 * Finalizes the initialization of hyp mode, once everything else is initialized
2643 * and the initialization process cannot fail.
2644 */
2645 static void finalize_init_hyp_mode(void)
2646 {
2647 int cpu;
2648
2649 if (system_supports_sve() && is_protected_kvm_enabled()) {
2650 for_each_possible_cpu(cpu) {
2651 struct cpu_sve_state *sve_state;
2652
2653 sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state;
2654 per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state =
2655 kern_hyp_va(sve_state);
2656 }
2657 }
2658 }
2659
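/* Seed random pointer-authentication keys for the hypervisor's own EL2 context. */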
2660 static void pkvm_hyp_init_ptrauth(void)
2661 {
2662 struct kvm_cpu_context *hyp_ctxt;
2663 int cpu;
2664
2665 for_each_possible_cpu(cpu) {
2666 hyp_ctxt = per_cpu_ptr_nvhe_sym(kvm_hyp_ctxt, cpu);
2667 hyp_ctxt->sys_regs[APIAKEYLO_EL1] = get_random_long();
2668 hyp_ctxt->sys_regs[APIAKEYHI_EL1] = get_random_long();
2669 hyp_ctxt->sys_regs[APIBKEYLO_EL1] = get_random_long();
2670 hyp_ctxt->sys_regs[APIBKEYHI_EL1] = get_random_long();
2671 hyp_ctxt->sys_regs[APDAKEYLO_EL1] = get_random_long();
2672 hyp_ctxt->sys_regs[APDAKEYHI_EL1] = get_random_long();
2673 hyp_ctxt->sys_regs[APDBKEYLO_EL1] = get_random_long();
2674 hyp_ctxt->sys_regs[APDBKEYHI_EL1] = get_random_long();
2675 hyp_ctxt->sys_regs[APGAKEYLO_EL1] = get_random_long();
2676 hyp_ctxt->sys_regs[APGAKEYHI_EL1] = get_random_long();
2677 }
2678 }
2679
2680 /* Inits Hyp-mode on all online CPUs */
2681 static int __init init_hyp_mode(void)
2682 {
2683 u32 hyp_va_bits = kvm_hyp_va_bits();
2684 int cpu;
2685 int err = -ENOMEM;
2686
2687 /*
2688 * The protected Hyp-mode cannot be initialized if the memory pool
2689 * allocation has failed.
2690 */
2691 if (is_protected_kvm_enabled() && !hyp_mem_base)
2692 goto out_err;
2693
2694 /*
2695 * Allocate Hyp PGD and setup Hyp identity mapping
2696 */
2697 err = kvm_mmu_init(hyp_va_bits);
2698 if (err)
2699 goto out_err;
2700
2701 /*
2702 * Allocate stack pages for Hypervisor-mode
2703 */
2704 for_each_possible_cpu(cpu) {
2705 unsigned long stack_base;
2706
2707 stack_base = __get_free_pages(GFP_KERNEL, NVHE_STACK_SHIFT - PAGE_SHIFT);
2708 if (!stack_base) {
2709 err = -ENOMEM;
2710 goto out_err;
2711 }
2712
2713 per_cpu(kvm_arm_hyp_stack_base, cpu) = stack_base;
2714 }
2715
2716 /*
2717 * Allocate and initialize pages for Hypervisor-mode percpu regions.
2718 */
2719 for_each_possible_cpu(cpu) {
2720 struct page *page;
2721 void *page_addr;
2722
2723 page = alloc_pages(GFP_KERNEL, nvhe_percpu_order());
2724 if (!page) {
2725 err = -ENOMEM;
2726 goto out_err;
2727 }
2728
2729 page_addr = page_address(page);
2730 memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size());
2731 kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr;
2732 }
2733
2734 kvm_nvhe_sym(hyp_nr_cpus) = num_possible_cpus();
2735
2736 /*
2737 * Map the Hyp-code called directly from the host
2738 */
2739 err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
2740 kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
2741 if (err) {
2742 kvm_err("Cannot map world-switch code\n");
2743 goto out_err;
2744 }
2745
2746 err = create_hyp_mappings(kvm_ksym_ref(__hyp_data_start),
2747 kvm_ksym_ref(__hyp_data_end), PAGE_HYP);
2748 if (err) {
2749 kvm_err("Cannot map .hyp.data section\n");
2750 goto out_err;
2751 }
2752
2753 err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start),
2754 kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO);
2755 if (err) {
2756 kvm_err("Cannot map .hyp.rodata section\n");
2757 goto out_err;
2758 }
2759
2760 err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
2761 kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
2762 if (err) {
2763 kvm_err("Cannot map rodata section\n");
2764 goto out_err;
2765 }
2766
2767 /*
2768 * .hyp.bss is guaranteed to be placed at the beginning of the .bss
2769 * section thanks to an assertion in the linker script. Map it RW and
2770 * the rest of .bss RO.
2771 */
2772 err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_start),
2773 kvm_ksym_ref(__hyp_bss_end), PAGE_HYP);
2774 if (err) {
2775 kvm_err("Cannot map hyp bss section: %d\n", err);
2776 goto out_err;
2777 }
2778
2779 err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_end),
2780 kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
2781 if (err) {
2782 kvm_err("Cannot map bss section\n");
2783 goto out_err;
2784 }
2785
2786 /*
2787 * Map the Hyp stack pages
2788 */
2789 for_each_possible_cpu(cpu) {
2790 struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
2791 char *stack_base = (char *)per_cpu(kvm_arm_hyp_stack_base, cpu);
2792
2793 err = create_hyp_stack(__pa(stack_base), &params->stack_hyp_va);
2794 if (err) {
2795 kvm_err("Cannot map hyp stack\n");
2796 goto out_err;
2797 }
2798
2799 /*
2800 * Save the stack PA in nvhe_init_params. This will be needed
2801 * to recreate the stack mapping in protected nVHE mode.
2802 * __hyp_pa() won't do the right thing there, since the stack
2803 * has been mapped in the flexible private VA space.
2804 */
2805 params->stack_pa = __pa(stack_base);
2806 }
2807
2808 for_each_possible_cpu(cpu) {
2809 char *percpu_begin = (char *)kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu];
2810 char *percpu_end = percpu_begin + nvhe_percpu_size();
2811
2812 /* Map Hyp percpu pages */
2813 err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
2814 if (err) {
2815 kvm_err("Cannot map hyp percpu region\n");
2816 goto out_err;
2817 }
2818
2819 /* Prepare the CPU initialization parameters */
2820 cpu_prepare_hyp_mode(cpu, hyp_va_bits);
2821 }
2822
2823 kvm_hyp_init_symbols();
2824
2825 if (is_protected_kvm_enabled()) {
2826 if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) &&
2827 cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH))
2828 pkvm_hyp_init_ptrauth();
2829
2830 init_cpu_logical_map();
2831
2832 if (!init_psci_relay()) {
2833 err = -ENODEV;
2834 goto out_err;
2835 }
2836
2837 err = init_pkvm_host_sve_state();
2838 if (err)
2839 goto out_err;
2840
2841 err = kvm_hyp_init_protection(hyp_va_bits);
2842 if (err) {
2843 kvm_err("Failed to init hyp memory protection\n");
2844 goto out_err;
2845 }
2846 }
2847
2848 return 0;
2849
2850 out_err:
2851 teardown_hyp_mode();
2852 kvm_err("error initializing Hyp mode: %d\n", err);
2853 return err;
2854 }
2855
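/*
 * Resolve an MPIDR to its vcpu: use the cached MPIDR index when available,
 * otherwise fall back to a linear scan of all vcpus.
 */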
2856 struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
2857 {
2858 struct kvm_vcpu *vcpu = NULL;
2859 struct kvm_mpidr_data *data;
2860 unsigned long i;
2861
2862 mpidr &= MPIDR_HWID_BITMASK;
2863
2864 rcu_read_lock();
2865 data = rcu_dereference(kvm->arch.mpidr_data);
2866
2867 if (data) {
2868 u16 idx = kvm_mpidr_index(data, mpidr);
2869
2870 vcpu = kvm_get_vcpu(kvm, data->cmpidr_to_idx[idx]);
2871 if (mpidr != kvm_vcpu_get_mpidr_aff(vcpu))
2872 vcpu = NULL;
2873 }
2874
2875 rcu_read_unlock();
2876
2877 if (vcpu)
2878 return vcpu;
2879
2880 kvm_for_each_vcpu(i, vcpu, kvm) {
2881 if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
2882 return vcpu;
2883 }
2884 return NULL;
2885 }
2886
2887 bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
2888 {
2889 return irqchip_in_kernel(kvm);
2890 }
2891
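/*
 * irqbypass callback: when an irqfd is paired with a producer (e.g. a VFIO
 * interrupt), try to forward MSI routes directly to the guest as vLPIs.
 */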
2892 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
2893 struct irq_bypass_producer *prod)
2894 {
2895 struct kvm_kernel_irqfd *irqfd =
2896 container_of(cons, struct kvm_kernel_irqfd, consumer);
2897 struct kvm_kernel_irq_routing_entry *irq_entry = &irqfd->irq_entry;
2898
2899 /*
2900 * The only thing we have a chance of directly-injecting is LPIs. Maybe
2901 * one day...
2902 */
2903 if (irq_entry->type != KVM_IRQ_ROUTING_MSI)
2904 return 0;
2905
2906 return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq,
2907 &irqfd->irq_entry);
2908 }
2909
2910 void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
2911 struct irq_bypass_producer *prod)
2912 {
2913 struct kvm_kernel_irqfd *irqfd =
2914 container_of(cons, struct kvm_kernel_irqfd, consumer);
2915 struct kvm_kernel_irq_routing_entry *irq_entry = &irqfd->irq_entry;
2916
2917 if (irq_entry->type != KVM_IRQ_ROUTING_MSI)
2918 return;
2919
2920 kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq);
2921 }
2922
2923 void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
2924 struct kvm_kernel_irq_routing_entry *old,
2925 struct kvm_kernel_irq_routing_entry *new)
2926 {
2927 if (old->type == KVM_IRQ_ROUTING_MSI &&
2928 new->type == KVM_IRQ_ROUTING_MSI &&
2929 !memcmp(&old->msi, &new->msi, sizeof(new->msi)))
2930 return;
2931
2932 /*
2933 * Remapping the vLPI requires taking the its_lock mutex to resolve
2934 * the new translation. We're in spinlock land at this point, so no
2935 * chance of resolving the translation.
2936 *
2937 * Unmap the vLPI and fall back to software LPI injection.
2938 */
2939 return kvm_vgic_v4_unset_forwarding(irqfd->kvm, irqfd->producer->irq);
2940 }
2941
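/* Pause all vcpus of the VM while the irq bypass path is being reconfigured. */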
2942 void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
2943 {
2944 struct kvm_kernel_irqfd *irqfd =
2945 container_of(cons, struct kvm_kernel_irqfd, consumer);
2946
2947 kvm_arm_halt_guest(irqfd->kvm);
2948 }
2949
2950 void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
2951 {
2952 struct kvm_kernel_irqfd *irqfd =
2953 container_of(cons, struct kvm_kernel_irqfd, consumer);
2954
2955 kvm_arm_resume_guest(irqfd->kvm);
2956 }
2957
2958 /* Initialize Hyp-mode and memory mappings on all CPUs */
2959 static __init int kvm_arm_init(void)
2960 {
2961 int err;
2962 bool in_hyp_mode;
2963
2964 if (!is_hyp_mode_available()) {
2965 kvm_info("HYP mode not available\n");
2966 return -ENODEV;
2967 }
2968
2969 if (kvm_get_mode() == KVM_MODE_NONE) {
2970 kvm_info("KVM disabled from command line\n");
2971 return -ENODEV;
2972 }
2973
2974 err = kvm_sys_reg_table_init();
2975 if (err) {
2976 kvm_info("Error initializing system register tables\n");
2977 return err;
2978 }
2979
2980 in_hyp_mode = is_kernel_in_hyp_mode();
2981
2982 if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) ||
2983 cpus_have_final_cap(ARM64_WORKAROUND_1508412))
2984 kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \
2985 "Only trusted guests should be used on this system.\n");
2986
2987 err = kvm_set_ipa_limit();
2988 if (err)
2989 return err;
2990
2991 err = kvm_arm_init_sve();
2992 if (err)
2993 return err;
2994
2995 err = kvm_arm_vmid_alloc_init();
2996 if (err) {
2997 kvm_err("Failed to initialize VMID allocator.\n");
2998 return err;
2999 }
3000
3001 if (!in_hyp_mode) {
3002 err = init_hyp_mode();
3003 if (err)
3004 goto out_err;
3005 }
3006
3007 err = kvm_init_vector_slots();
3008 if (err) {
3009 kvm_err("Cannot initialise vector slots\n");
3010 goto out_hyp;
3011 }
3012
3013 err = init_subsystems();
3014 if (err)
3015 goto out_hyp;
3016
3017 kvm_info("%s%sVHE%s mode initialized successfully\n",
3018 in_hyp_mode ? "" : (is_protected_kvm_enabled() ?
3019 "Protected " : "Hyp "),
3020 in_hyp_mode ? "" : (cpus_have_final_cap(ARM64_KVM_HVHE) ?
3021 "h" : "n"),
3022 cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) ? "+NV2": "");
3023
3024 /*
3025 * FIXME: Do something reasonable if kvm_init() fails after pKVM
3026 * hypervisor protection is finalized.
3027 */
3028 err = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE);
3029 if (err)
3030 goto out_subs;
3031
3032 /*
3033 * This should be called after initialization is done and failure isn't
3034 * possible anymore.
3035 */
3036 if (!in_hyp_mode)
3037 finalize_init_hyp_mode();
3038
3039 kvm_arm_initialised = true;
3040
3041 return 0;
3042
3043 out_subs:
3044 teardown_subsystems();
3045 out_hyp:
3046 if (!in_hyp_mode)
3047 teardown_hyp_mode();
3048 out_err:
3049 kvm_arm_vmid_alloc_free();
3050 return err;
3051 }
3052
3053 static int __init early_kvm_mode_cfg(char *arg)
3054 {
3055 if (!arg)
3056 return -EINVAL;
3057
3058 if (strcmp(arg, "none") == 0) {
3059 kvm_mode = KVM_MODE_NONE;
3060 return 0;
3061 }
3062
3063 if (!is_hyp_mode_available()) {
3064 pr_warn_once("KVM is not available. Ignoring kvm-arm.mode\n");
3065 return 0;
3066 }
3067
3068 if (strcmp(arg, "protected") == 0) {
3069 if (!is_kernel_in_hyp_mode())
3070 kvm_mode = KVM_MODE_PROTECTED;
3071 else
3072 pr_warn_once("Protected KVM not available with VHE\n");
3073
3074 return 0;
3075 }
3076
3077 if (strcmp(arg, "nvhe") == 0 && !WARN_ON(is_kernel_in_hyp_mode())) {
3078 kvm_mode = KVM_MODE_DEFAULT;
3079 return 0;
3080 }
3081
3082 if (strcmp(arg, "nested") == 0 && !WARN_ON(!is_kernel_in_hyp_mode())) {
3083 kvm_mode = KVM_MODE_NV;
3084 return 0;
3085 }
3086
3087 return -EINVAL;
3088 }
3089 early_param("kvm-arm.mode", early_kvm_mode_cfg);
3090
3091 static int __init early_kvm_wfx_trap_policy_cfg(char *arg, enum kvm_wfx_trap_policy *p)
3092 {
3093 if (!arg)
3094 return -EINVAL;
3095
3096 if (strcmp(arg, "trap") == 0) {
3097 *p = KVM_WFX_TRAP;
3098 return 0;
3099 }
3100
3101 if (strcmp(arg, "notrap") == 0) {
3102 *p = KVM_WFX_NOTRAP;
3103 return 0;
3104 }
3105
3106 return -EINVAL;
3107 }
3108
3109 static int __init early_kvm_wfi_trap_policy_cfg(char *arg)
3110 {
3111 return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfi_trap_policy);
3112 }
3113 early_param("kvm-arm.wfi_trap_policy", early_kvm_wfi_trap_policy_cfg);
3114
3115 static int __init early_kvm_wfe_trap_policy_cfg(char *arg)
3116 {
3117 return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfe_trap_policy);
3118 }
3119 early_param("kvm-arm.wfe_trap_policy", early_kvm_wfe_trap_policy_cfg);
3120
3121 enum kvm_mode kvm_get_mode(void)
3122 {
3123 return kvm_mode;
3124 }
3125
3126 module_init(kvm_arm_init);
3127