xref: /linux/arch/arm64/kvm/arm.c (revision ee8287e068a3995b0f8001dd6931e221dfb7c530)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2012 - Virtual Open Systems and Columbia University
4  * Author: Christoffer Dall <c.dall@virtualopensystems.com>
5  */
6 
7 #include <linux/bug.h>
8 #include <linux/cpu_pm.h>
9 #include <linux/entry-kvm.h>
10 #include <linux/errno.h>
11 #include <linux/err.h>
12 #include <linux/kvm_host.h>
13 #include <linux/list.h>
14 #include <linux/module.h>
15 #include <linux/vmalloc.h>
16 #include <linux/fs.h>
17 #include <linux/mman.h>
18 #include <linux/sched.h>
19 #include <linux/kvm.h>
20 #include <linux/kvm_irqfd.h>
21 #include <linux/irqbypass.h>
22 #include <linux/sched/stat.h>
23 #include <linux/psci.h>
24 #include <trace/events/kvm.h>
25 
26 #define CREATE_TRACE_POINTS
27 #include "trace_arm.h"
28 
29 #include <linux/uaccess.h>
30 #include <asm/ptrace.h>
31 #include <asm/mman.h>
32 #include <asm/tlbflush.h>
33 #include <asm/cacheflush.h>
34 #include <asm/cpufeature.h>
35 #include <asm/virt.h>
36 #include <asm/kvm_arm.h>
37 #include <asm/kvm_asm.h>
38 #include <asm/kvm_emulate.h>
39 #include <asm/kvm_mmu.h>
40 #include <asm/kvm_nested.h>
41 #include <asm/kvm_pkvm.h>
42 #include <asm/kvm_ptrauth.h>
43 #include <asm/sections.h>
44 
45 #include <kvm/arm_hypercalls.h>
46 #include <kvm/arm_pmu.h>
47 #include <kvm/arm_psci.h>
48 
49 static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT;
50 
51 DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
52 
53 DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
54 DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
55 
56 DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
57 
58 static bool vgic_present, kvm_arm_initialised;
59 
60 static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized);
61 DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
62 
63 bool is_kvm_arm_initialised(void)
64 {
65 	return kvm_arm_initialised;
66 }
67 
68 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
69 {
70 	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
71 }
72 
73 /*
74  * This functions as an allow-list of protected VM capabilities.
75  * Features not explicitly allowed by this function are denied.
76  */
77 static bool pkvm_ext_allowed(struct kvm *kvm, long ext)
78 {
79 	switch (ext) {
80 	case KVM_CAP_IRQCHIP:
81 	case KVM_CAP_ARM_PSCI:
82 	case KVM_CAP_ARM_PSCI_0_2:
83 	case KVM_CAP_NR_VCPUS:
84 	case KVM_CAP_MAX_VCPUS:
85 	case KVM_CAP_MAX_VCPU_ID:
86 	case KVM_CAP_MSI_DEVID:
87 	case KVM_CAP_ARM_VM_IPA_SIZE:
88 	case KVM_CAP_ARM_PMU_V3:
89 	case KVM_CAP_ARM_SVE:
90 	case KVM_CAP_ARM_PTRAUTH_ADDRESS:
91 	case KVM_CAP_ARM_PTRAUTH_GENERIC:
92 		return true;
93 	default:
94 		return false;
95 	}
96 }
97 
98 int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
99 			    struct kvm_enable_cap *cap)
100 {
101 	int r = -EINVAL;
102 
103 	if (cap->flags)
104 		return -EINVAL;
105 
106 	if (kvm_vm_is_protected(kvm) && !pkvm_ext_allowed(kvm, cap->cap))
107 		return -EINVAL;
108 
109 	switch (cap->cap) {
110 	case KVM_CAP_ARM_NISV_TO_USER:
111 		r = 0;
112 		set_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER,
113 			&kvm->arch.flags);
114 		break;
115 	case KVM_CAP_ARM_MTE:
116 		mutex_lock(&kvm->lock);
117 		if (system_supports_mte() && !kvm->created_vcpus) {
118 			r = 0;
119 			set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags);
120 		}
121 		mutex_unlock(&kvm->lock);
122 		break;
123 	case KVM_CAP_ARM_SYSTEM_SUSPEND:
124 		r = 0;
125 		set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
126 		break;
127 	case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
128 		mutex_lock(&kvm->slots_lock);
129 		/*
130 		 * To keep things simple, allow changing the chunk
131 		 * size only when no memory slots have been created.
132 		 */
133 		if (kvm_are_all_memslots_empty(kvm)) {
134 			u64 new_cap = cap->args[0];
135 
136 			if (!new_cap || kvm_is_block_size_supported(new_cap)) {
137 				r = 0;
138 				kvm->arch.mmu.split_page_chunk_size = new_cap;
139 			}
140 		}
141 		mutex_unlock(&kvm->slots_lock);
142 		break;
143 	default:
144 		break;
145 	}
146 
147 	return r;
148 }
149 
150 static int kvm_arm_default_max_vcpus(void)
151 {
152 	return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
153 }
154 
155 /**
156  * kvm_arch_init_vm - initializes a VM data structure
157  * @kvm:	pointer to the KVM struct
158  */
159 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
160 {
161 	int ret;
162 
163 	mutex_init(&kvm->arch.config_lock);
164 
165 #ifdef CONFIG_LOCKDEP
166 	/* Clue in lockdep that the config_lock must be taken inside kvm->lock */
167 	mutex_lock(&kvm->lock);
168 	mutex_lock(&kvm->arch.config_lock);
169 	mutex_unlock(&kvm->arch.config_lock);
170 	mutex_unlock(&kvm->lock);
171 #endif
172 
173 	ret = kvm_share_hyp(kvm, kvm + 1);
174 	if (ret)
175 		return ret;
176 
177 	ret = pkvm_init_host_vm(kvm);
178 	if (ret)
179 		goto err_unshare_kvm;
180 
181 	if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL_ACCOUNT)) {
182 		ret = -ENOMEM;
183 		goto err_unshare_kvm;
184 	}
185 	cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask);
186 
187 	ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu, type);
188 	if (ret)
189 		goto err_free_cpumask;
190 
191 	kvm_vgic_early_init(kvm);
192 
193 	kvm_timer_init_vm(kvm);
194 
195 	/* The maximum number of VCPUs is limited by the host's GIC model */
196 	kvm->max_vcpus = kvm_arm_default_max_vcpus();
197 
198 	kvm_arm_init_hypercalls(kvm);
199 
200 	bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES);
201 
202 	return 0;
203 
204 err_free_cpumask:
205 	free_cpumask_var(kvm->arch.supported_cpus);
206 err_unshare_kvm:
207 	kvm_unshare_hyp(kvm, kvm + 1);
208 	return ret;
209 }
210 
211 vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
212 {
213 	return VM_FAULT_SIGBUS;
214 }
215 
/*
 * Arch hook for per-VM debugfs setup; the only thing done here is
 * delegating to kvm_sys_regs_create_debugfs().
 */
void kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
	kvm_sys_regs_create_debugfs(kvm);
}
220 
221 static void kvm_destroy_mpidr_data(struct kvm *kvm)
222 {
223 	struct kvm_mpidr_data *data;
224 
225 	mutex_lock(&kvm->arch.config_lock);
226 
227 	data = rcu_dereference_protected(kvm->arch.mpidr_data,
228 					 lockdep_is_held(&kvm->arch.config_lock));
229 	if (data) {
230 		rcu_assign_pointer(kvm->arch.mpidr_data, NULL);
231 		synchronize_rcu();
232 		kfree(data);
233 	}
234 
235 	mutex_unlock(&kvm->arch.config_lock);
236 }
237 
238 /**
239  * kvm_arch_destroy_vm - destroy the VM data structure
240  * @kvm:	pointer to the KVM struct
241  */
242 void kvm_arch_destroy_vm(struct kvm *kvm)
243 {
244 	bitmap_free(kvm->arch.pmu_filter);
245 	free_cpumask_var(kvm->arch.supported_cpus);
246 
247 	kvm_vgic_destroy(kvm);
248 
249 	if (is_protected_kvm_enabled())
250 		pkvm_destroy_hyp_vm(kvm);
251 
252 	kvm_destroy_mpidr_data(kvm);
253 
254 	kfree(kvm->arch.sysreg_masks);
255 	kvm_destroy_vcpus(kvm);
256 
257 	kvm_unshare_hyp(kvm, kvm + 1);
258 
259 	kvm_arm_teardown_hypercalls(kvm);
260 }
261 
262 static bool kvm_has_full_ptr_auth(void)
263 {
264 	bool apa, gpa, api, gpi, apa3, gpa3;
265 	u64 isar1, isar2, val;
266 
267 	/*
268 	 * Check that:
269 	 *
270 	 * - both Address and Generic auth are implemented for a given
271          *   algorithm (Q5, IMPDEF or Q3)
272 	 * - only a single algorithm is implemented.
273 	 */
274 	if (!system_has_full_ptr_auth())
275 		return false;
276 
277 	isar1 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1);
278 	isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);
279 
280 	apa = !!FIELD_GET(ID_AA64ISAR1_EL1_APA_MASK, isar1);
281 	val = FIELD_GET(ID_AA64ISAR1_EL1_GPA_MASK, isar1);
282 	gpa = (val == ID_AA64ISAR1_EL1_GPA_IMP);
283 
284 	api = !!FIELD_GET(ID_AA64ISAR1_EL1_API_MASK, isar1);
285 	val = FIELD_GET(ID_AA64ISAR1_EL1_GPI_MASK, isar1);
286 	gpi = (val == ID_AA64ISAR1_EL1_GPI_IMP);
287 
288 	apa3 = !!FIELD_GET(ID_AA64ISAR2_EL1_APA3_MASK, isar2);
289 	val  = FIELD_GET(ID_AA64ISAR2_EL1_GPA3_MASK, isar2);
290 	gpa3 = (val == ID_AA64ISAR2_EL1_GPA3_IMP);
291 
292 	return (apa == gpa && api == gpi && apa3 == gpa3 &&
293 		(apa + api + apa3) == 1);
294 }
295 
296 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
297 {
298 	int r;
299 
300 	if (kvm && kvm_vm_is_protected(kvm) && !pkvm_ext_allowed(kvm, ext))
301 		return 0;
302 
303 	switch (ext) {
304 	case KVM_CAP_IRQCHIP:
305 		r = vgic_present;
306 		break;
307 	case KVM_CAP_IOEVENTFD:
308 	case KVM_CAP_USER_MEMORY:
309 	case KVM_CAP_SYNC_MMU:
310 	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
311 	case KVM_CAP_ONE_REG:
312 	case KVM_CAP_ARM_PSCI:
313 	case KVM_CAP_ARM_PSCI_0_2:
314 	case KVM_CAP_READONLY_MEM:
315 	case KVM_CAP_MP_STATE:
316 	case KVM_CAP_IMMEDIATE_EXIT:
317 	case KVM_CAP_VCPU_EVENTS:
318 	case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2:
319 	case KVM_CAP_ARM_NISV_TO_USER:
320 	case KVM_CAP_ARM_INJECT_EXT_DABT:
321 	case KVM_CAP_SET_GUEST_DEBUG:
322 	case KVM_CAP_VCPU_ATTRIBUTES:
323 	case KVM_CAP_PTP_KVM:
324 	case KVM_CAP_ARM_SYSTEM_SUSPEND:
325 	case KVM_CAP_IRQFD_RESAMPLE:
326 	case KVM_CAP_COUNTER_OFFSET:
327 		r = 1;
328 		break;
329 	case KVM_CAP_SET_GUEST_DEBUG2:
330 		return KVM_GUESTDBG_VALID_MASK;
331 	case KVM_CAP_ARM_SET_DEVICE_ADDR:
332 		r = 1;
333 		break;
334 	case KVM_CAP_NR_VCPUS:
335 		/*
336 		 * ARM64 treats KVM_CAP_NR_CPUS differently from all other
337 		 * architectures, as it does not always bound it to
338 		 * KVM_CAP_MAX_VCPUS. It should not matter much because
339 		 * this is just an advisory value.
340 		 */
341 		r = min_t(unsigned int, num_online_cpus(),
342 			  kvm_arm_default_max_vcpus());
343 		break;
344 	case KVM_CAP_MAX_VCPUS:
345 	case KVM_CAP_MAX_VCPU_ID:
346 		if (kvm)
347 			r = kvm->max_vcpus;
348 		else
349 			r = kvm_arm_default_max_vcpus();
350 		break;
351 	case KVM_CAP_MSI_DEVID:
352 		if (!kvm)
353 			r = -EINVAL;
354 		else
355 			r = kvm->arch.vgic.msis_require_devid;
356 		break;
357 	case KVM_CAP_ARM_USER_IRQ:
358 		/*
359 		 * 1: EL1_VTIMER, EL1_PTIMER, and PMU.
360 		 * (bump this number if adding more devices)
361 		 */
362 		r = 1;
363 		break;
364 	case KVM_CAP_ARM_MTE:
365 		r = system_supports_mte();
366 		break;
367 	case KVM_CAP_STEAL_TIME:
368 		r = kvm_arm_pvtime_supported();
369 		break;
370 	case KVM_CAP_ARM_EL1_32BIT:
371 		r = cpus_have_final_cap(ARM64_HAS_32BIT_EL1);
372 		break;
373 	case KVM_CAP_GUEST_DEBUG_HW_BPS:
374 		r = get_num_brps();
375 		break;
376 	case KVM_CAP_GUEST_DEBUG_HW_WPS:
377 		r = get_num_wrps();
378 		break;
379 	case KVM_CAP_ARM_PMU_V3:
380 		r = kvm_arm_support_pmu_v3();
381 		break;
382 	case KVM_CAP_ARM_INJECT_SERROR_ESR:
383 		r = cpus_have_final_cap(ARM64_HAS_RAS_EXTN);
384 		break;
385 	case KVM_CAP_ARM_VM_IPA_SIZE:
386 		r = get_kvm_ipa_limit();
387 		break;
388 	case KVM_CAP_ARM_SVE:
389 		r = system_supports_sve();
390 		break;
391 	case KVM_CAP_ARM_PTRAUTH_ADDRESS:
392 	case KVM_CAP_ARM_PTRAUTH_GENERIC:
393 		r = kvm_has_full_ptr_auth();
394 		break;
395 	case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
396 		if (kvm)
397 			r = kvm->arch.mmu.split_page_chunk_size;
398 		else
399 			r = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
400 		break;
401 	case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES:
402 		r = kvm_supported_block_sizes();
403 		break;
404 	case KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES:
405 		r = BIT(0);
406 		break;
407 	default:
408 		r = 0;
409 	}
410 
411 	return r;
412 }
413 
/*
 * Arch-specific device ioctl handler. No ioctl is handled here, so every
 * request is answered with -EINVAL and none of the arguments is looked at.
 */
long kvm_arch_dev_ioctl(struct file *filp,
			unsigned int ioctl, unsigned long arg)
{
	const long unsupported = -EINVAL;

	return unsupported;
}
419 
420 struct kvm *kvm_arch_alloc_vm(void)
421 {
422 	size_t sz = sizeof(struct kvm);
423 
424 	if (!has_vhe())
425 		return kzalloc(sz, GFP_KERNEL_ACCOUNT);
426 
427 	return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO);
428 }
429 
430 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
431 {
432 	if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
433 		return -EBUSY;
434 
435 	if (id >= kvm->max_vcpus)
436 		return -EINVAL;
437 
438 	return 0;
439 }
440 
441 int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
442 {
443 	int err;
444 
445 	spin_lock_init(&vcpu->arch.mp_state_lock);
446 
447 #ifdef CONFIG_LOCKDEP
448 	/* Inform lockdep that the config_lock is acquired after vcpu->mutex */
449 	mutex_lock(&vcpu->mutex);
450 	mutex_lock(&vcpu->kvm->arch.config_lock);
451 	mutex_unlock(&vcpu->kvm->arch.config_lock);
452 	mutex_unlock(&vcpu->mutex);
453 #endif
454 
455 	/* Force users to call KVM_ARM_VCPU_INIT */
456 	vcpu_clear_flag(vcpu, VCPU_INITIALIZED);
457 
458 	vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;
459 
460 	/* Set up the timer */
461 	kvm_timer_vcpu_init(vcpu);
462 
463 	kvm_pmu_vcpu_init(vcpu);
464 
465 	kvm_arm_reset_debug_ptr(vcpu);
466 
467 	kvm_arm_pvtime_vcpu_init(&vcpu->arch);
468 
469 	vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
470 
471 	/*
472 	 * This vCPU may have been created after mpidr_data was initialized.
473 	 * Throw out the pre-computed mappings if that is the case which forces
474 	 * KVM to fall back to iteratively searching the vCPUs.
475 	 */
476 	kvm_destroy_mpidr_data(vcpu->kvm);
477 
478 	err = kvm_vgic_vcpu_init(vcpu);
479 	if (err)
480 		return err;
481 
482 	return kvm_share_hyp(vcpu, vcpu + 1);
483 }
484 
/* Post-creation hook for a vCPU; intentionally a no-op here. */
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
	/* Nothing to do. */
}
488 
489 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
490 {
491 	if (vcpu_has_run_once(vcpu) && unlikely(!irqchip_in_kernel(vcpu->kvm)))
492 		static_branch_dec(&userspace_irqchip_in_use);
493 
494 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
495 	kvm_timer_vcpu_terminate(vcpu);
496 	kvm_pmu_vcpu_destroy(vcpu);
497 	kvm_vgic_vcpu_destroy(vcpu);
498 	kvm_arm_vcpu_destroy(vcpu);
499 }
500 
/* Hook for a vCPU that is about to block; intentionally a no-op here. */
void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
{
	/* Nothing to do. */
}
505 
/* Hook for a vCPU that stops blocking; intentionally a no-op here. */
void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
	/* Nothing to do. */
}
510 
511 static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu)
512 {
513 	if (vcpu_has_ptrauth(vcpu)) {
514 		/*
515 		 * Either we're running running an L2 guest, and the API/APK
516 		 * bits come from L1's HCR_EL2, or API/APK are both set.
517 		 */
518 		if (unlikely(vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))) {
519 			u64 val;
520 
521 			val = __vcpu_sys_reg(vcpu, HCR_EL2);
522 			val &= (HCR_API | HCR_APK);
523 			vcpu->arch.hcr_el2 &= ~(HCR_API | HCR_APK);
524 			vcpu->arch.hcr_el2 |= val;
525 		} else {
526 			vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK);
527 		}
528 
529 		/*
530 		 * Save the host keys if there is any chance for the guest
531 		 * to use pauth, as the entry code will reload the guest
532 		 * keys in that case.
533 		 * Protected mode is the exception to that rule, as the
534 		 * entry into the EL2 code eagerly switch back and forth
535 		 * between host and hyp keys (and kvm_hyp_ctxt is out of
536 		 * reach anyway).
537 		 */
538 		if (is_protected_kvm_enabled())
539 			return;
540 
541 		if (vcpu->arch.hcr_el2 & (HCR_API | HCR_APK)) {
542 			struct kvm_cpu_context *ctxt;
543 			ctxt = this_cpu_ptr_hyp_sym(kvm_hyp_ctxt);
544 			ptrauth_save_keys(ctxt);
545 		}
546 	}
547 }
548 
549 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
550 {
551 	struct kvm_s2_mmu *mmu;
552 	int *last_ran;
553 
554 	mmu = vcpu->arch.hw_mmu;
555 	last_ran = this_cpu_ptr(mmu->last_vcpu_ran);
556 
557 	/*
558 	 * We guarantee that both TLBs and I-cache are private to each
559 	 * vcpu. If detecting that a vcpu from the same VM has
560 	 * previously run on the same physical CPU, call into the
561 	 * hypervisor code to nuke the relevant contexts.
562 	 *
563 	 * We might get preempted before the vCPU actually runs, but
564 	 * over-invalidation doesn't affect correctness.
565 	 */
566 	if (*last_ran != vcpu->vcpu_idx) {
567 		kvm_call_hyp(__kvm_flush_cpu_context, mmu);
568 		*last_ran = vcpu->vcpu_idx;
569 	}
570 
571 	vcpu->cpu = cpu;
572 
573 	kvm_vgic_load(vcpu);
574 	kvm_timer_vcpu_load(vcpu);
575 	if (has_vhe())
576 		kvm_vcpu_load_vhe(vcpu);
577 	kvm_arch_vcpu_load_fp(vcpu);
578 	kvm_vcpu_pmu_restore_guest(vcpu);
579 	if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
580 		kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
581 
582 	if (single_task_running())
583 		vcpu_clear_wfx_traps(vcpu);
584 	else
585 		vcpu_set_wfx_traps(vcpu);
586 
587 	vcpu_set_pauth_traps(vcpu);
588 
589 	kvm_arch_vcpu_load_debug_state_flags(vcpu);
590 
591 	if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus))
592 		vcpu_set_on_unsupported_cpu(vcpu);
593 }
594 
595 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
596 {
597 	kvm_arch_vcpu_put_debug_state_flags(vcpu);
598 	kvm_arch_vcpu_put_fp(vcpu);
599 	if (has_vhe())
600 		kvm_vcpu_put_vhe(vcpu);
601 	kvm_timer_vcpu_put(vcpu);
602 	kvm_vgic_put(vcpu);
603 	kvm_vcpu_pmu_restore_host(vcpu);
604 	kvm_arm_vmid_clear_active();
605 
606 	vcpu_clear_on_unsupported_cpu(vcpu);
607 	vcpu->cpu = -1;
608 }
609 
610 static void __kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
611 {
612 	WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED);
613 	kvm_make_request(KVM_REQ_SLEEP, vcpu);
614 	kvm_vcpu_kick(vcpu);
615 }
616 
617 void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
618 {
619 	spin_lock(&vcpu->arch.mp_state_lock);
620 	__kvm_arm_vcpu_power_off(vcpu);
621 	spin_unlock(&vcpu->arch.mp_state_lock);
622 }
623 
624 bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu)
625 {
626 	return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_STOPPED;
627 }
628 
629 static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu)
630 {
631 	WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_SUSPENDED);
632 	kvm_make_request(KVM_REQ_SUSPEND, vcpu);
633 	kvm_vcpu_kick(vcpu);
634 }
635 
636 static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu)
637 {
638 	return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_SUSPENDED;
639 }
640 
641 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
642 				    struct kvm_mp_state *mp_state)
643 {
644 	*mp_state = READ_ONCE(vcpu->arch.mp_state);
645 
646 	return 0;
647 }
648 
649 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
650 				    struct kvm_mp_state *mp_state)
651 {
652 	int ret = 0;
653 
654 	spin_lock(&vcpu->arch.mp_state_lock);
655 
656 	switch (mp_state->mp_state) {
657 	case KVM_MP_STATE_RUNNABLE:
658 		WRITE_ONCE(vcpu->arch.mp_state, *mp_state);
659 		break;
660 	case KVM_MP_STATE_STOPPED:
661 		__kvm_arm_vcpu_power_off(vcpu);
662 		break;
663 	case KVM_MP_STATE_SUSPENDED:
664 		kvm_arm_vcpu_suspend(vcpu);
665 		break;
666 	default:
667 		ret = -EINVAL;
668 	}
669 
670 	spin_unlock(&vcpu->arch.mp_state_lock);
671 
672 	return ret;
673 }
674 
675 /**
676  * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled
677  * @v:		The VCPU pointer
678  *
679  * If the guest CPU is not waiting for interrupts or an interrupt line is
680  * asserted, the CPU is by definition runnable.
681  */
682 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
683 {
684 	bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
685 	return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
686 		&& !kvm_arm_vcpu_stopped(v) && !v->arch.pause);
687 }
688 
689 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
690 {
691 	return vcpu_mode_priv(vcpu);
692 }
693 
#ifdef CONFIG_GUEST_PERF_EVENTS
/* Return the value vcpu_pc() points at for @vcpu. */
unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
{
	unsigned long *pc = vcpu_pc(vcpu);

	return *pc;
}
#endif
700 
701 static void kvm_init_mpidr_data(struct kvm *kvm)
702 {
703 	struct kvm_mpidr_data *data = NULL;
704 	unsigned long c, mask, nr_entries;
705 	u64 aff_set = 0, aff_clr = ~0UL;
706 	struct kvm_vcpu *vcpu;
707 
708 	mutex_lock(&kvm->arch.config_lock);
709 
710 	if (rcu_access_pointer(kvm->arch.mpidr_data) ||
711 	    atomic_read(&kvm->online_vcpus) == 1)
712 		goto out;
713 
714 	kvm_for_each_vcpu(c, vcpu, kvm) {
715 		u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
716 		aff_set |= aff;
717 		aff_clr &= aff;
718 	}
719 
720 	/*
721 	 * A significant bit can be either 0 or 1, and will only appear in
722 	 * aff_set. Use aff_clr to weed out the useless stuff.
723 	 */
724 	mask = aff_set ^ aff_clr;
725 	nr_entries = BIT_ULL(hweight_long(mask));
726 
727 	/*
728 	 * Don't let userspace fool us. If we need more than a single page
729 	 * to describe the compressed MPIDR array, just fall back to the
730 	 * iterative method. Single vcpu VMs do not need this either.
731 	 */
732 	if (struct_size(data, cmpidr_to_idx, nr_entries) <= PAGE_SIZE)
733 		data = kzalloc(struct_size(data, cmpidr_to_idx, nr_entries),
734 			       GFP_KERNEL_ACCOUNT);
735 
736 	if (!data)
737 		goto out;
738 
739 	data->mpidr_mask = mask;
740 
741 	kvm_for_each_vcpu(c, vcpu, kvm) {
742 		u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
743 		u16 index = kvm_mpidr_index(data, aff);
744 
745 		data->cmpidr_to_idx[index] = c;
746 	}
747 
748 	rcu_assign_pointer(kvm->arch.mpidr_data, data);
749 out:
750 	mutex_unlock(&kvm->arch.config_lock);
751 }
752 
753 /*
754  * Handle both the initialisation that is being done when the vcpu is
755  * run for the first time, as well as the updates that must be
756  * performed each time we get a new thread dealing with this vcpu.
757  */
758 int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
759 {
760 	struct kvm *kvm = vcpu->kvm;
761 	int ret;
762 
763 	if (!kvm_vcpu_initialized(vcpu))
764 		return -ENOEXEC;
765 
766 	if (!kvm_arm_vcpu_is_finalized(vcpu))
767 		return -EPERM;
768 
769 	ret = kvm_arch_vcpu_run_map_fp(vcpu);
770 	if (ret)
771 		return ret;
772 
773 	if (likely(vcpu_has_run_once(vcpu)))
774 		return 0;
775 
776 	kvm_init_mpidr_data(kvm);
777 
778 	kvm_arm_vcpu_init_debug(vcpu);
779 
780 	if (likely(irqchip_in_kernel(kvm))) {
781 		/*
782 		 * Map the VGIC hardware resources before running a vcpu the
783 		 * first time on this VM.
784 		 */
785 		ret = kvm_vgic_map_resources(kvm);
786 		if (ret)
787 			return ret;
788 	}
789 
790 	if (vcpu_has_nv(vcpu)) {
791 		ret = kvm_init_nv_sysregs(vcpu->kvm);
792 		if (ret)
793 			return ret;
794 	}
795 
796 	/*
797 	 * This needs to happen after NV has imposed its own restrictions on
798 	 * the feature set
799 	 */
800 	kvm_init_sysreg(vcpu);
801 
802 	ret = kvm_timer_enable(vcpu);
803 	if (ret)
804 		return ret;
805 
806 	ret = kvm_arm_pmu_v3_enable(vcpu);
807 	if (ret)
808 		return ret;
809 
810 	if (is_protected_kvm_enabled()) {
811 		ret = pkvm_create_hyp_vm(kvm);
812 		if (ret)
813 			return ret;
814 	}
815 
816 	if (!irqchip_in_kernel(kvm)) {
817 		/*
818 		 * Tell the rest of the code that there are userspace irqchip
819 		 * VMs in the wild.
820 		 */
821 		static_branch_inc(&userspace_irqchip_in_use);
822 	}
823 
824 	/*
825 	 * Initialize traps for protected VMs.
826 	 * NOTE: Move to run in EL2 directly, rather than via a hypercall, once
827 	 * the code is in place for first run initialization at EL2.
828 	 */
829 	if (kvm_vm_is_protected(kvm))
830 		kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu);
831 
832 	mutex_lock(&kvm->arch.config_lock);
833 	set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags);
834 	mutex_unlock(&kvm->arch.config_lock);
835 
836 	return ret;
837 }
838 
839 bool kvm_arch_intc_initialized(struct kvm *kvm)
840 {
841 	return vgic_initialized(kvm);
842 }
843 
844 void kvm_arm_halt_guest(struct kvm *kvm)
845 {
846 	unsigned long i;
847 	struct kvm_vcpu *vcpu;
848 
849 	kvm_for_each_vcpu(i, vcpu, kvm)
850 		vcpu->arch.pause = true;
851 	kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP);
852 }
853 
854 void kvm_arm_resume_guest(struct kvm *kvm)
855 {
856 	unsigned long i;
857 	struct kvm_vcpu *vcpu;
858 
859 	kvm_for_each_vcpu(i, vcpu, kvm) {
860 		vcpu->arch.pause = false;
861 		__kvm_vcpu_wake_up(vcpu);
862 	}
863 }
864 
865 static void kvm_vcpu_sleep(struct kvm_vcpu *vcpu)
866 {
867 	struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
868 
869 	rcuwait_wait_event(wait,
870 			   (!kvm_arm_vcpu_stopped(vcpu)) && (!vcpu->arch.pause),
871 			   TASK_INTERRUPTIBLE);
872 
873 	if (kvm_arm_vcpu_stopped(vcpu) || vcpu->arch.pause) {
874 		/* Awaken to handle a signal, request we sleep again later. */
875 		kvm_make_request(KVM_REQ_SLEEP, vcpu);
876 	}
877 
878 	/*
879 	 * Make sure we will observe a potential reset request if we've
880 	 * observed a change to the power state. Pairs with the smp_wmb() in
881 	 * kvm_psci_vcpu_on().
882 	 */
883 	smp_rmb();
884 }
885 
886 /**
887  * kvm_vcpu_wfi - emulate Wait-For-Interrupt behavior
888  * @vcpu:	The VCPU pointer
889  *
890  * Suspend execution of a vCPU until a valid wake event is detected, i.e. until
891  * the vCPU is runnable.  The vCPU may or may not be scheduled out, depending
892  * on when a wake event arrives, e.g. there may already be a pending wake event.
893  */
894 void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
895 {
896 	/*
897 	 * Sync back the state of the GIC CPU interface so that we have
898 	 * the latest PMR and group enables. This ensures that
899 	 * kvm_arch_vcpu_runnable has up-to-date data to decide whether
900 	 * we have pending interrupts, e.g. when determining if the
901 	 * vCPU should block.
902 	 *
903 	 * For the same reason, we want to tell GICv4 that we need
904 	 * doorbells to be signalled, should an interrupt become pending.
905 	 */
906 	preempt_disable();
907 	vcpu_set_flag(vcpu, IN_WFI);
908 	kvm_vgic_put(vcpu);
909 	preempt_enable();
910 
911 	kvm_vcpu_halt(vcpu);
912 	vcpu_clear_flag(vcpu, IN_WFIT);
913 
914 	preempt_disable();
915 	vcpu_clear_flag(vcpu, IN_WFI);
916 	kvm_vgic_load(vcpu);
917 	preempt_enable();
918 }
919 
920 static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu)
921 {
922 	if (!kvm_arm_vcpu_suspended(vcpu))
923 		return 1;
924 
925 	kvm_vcpu_wfi(vcpu);
926 
927 	/*
928 	 * The suspend state is sticky; we do not leave it until userspace
929 	 * explicitly marks the vCPU as runnable. Request that we suspend again
930 	 * later.
931 	 */
932 	kvm_make_request(KVM_REQ_SUSPEND, vcpu);
933 
934 	/*
935 	 * Check to make sure the vCPU is actually runnable. If so, exit to
936 	 * userspace informing it of the wakeup condition.
937 	 */
938 	if (kvm_arch_vcpu_runnable(vcpu)) {
939 		memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
940 		vcpu->run->system_event.type = KVM_SYSTEM_EVENT_WAKEUP;
941 		vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
942 		return 0;
943 	}
944 
945 	/*
946 	 * Otherwise, we were unblocked to process a different event, such as a
947 	 * pending signal. Return 1 and allow kvm_arch_vcpu_ioctl_run() to
948 	 * process the event.
949 	 */
950 	return 1;
951 }
952 
953 /**
954  * check_vcpu_requests - check and handle pending vCPU requests
955  * @vcpu:	the VCPU pointer
956  *
957  * Return: 1 if we should enter the guest
958  *	   0 if we should exit to userspace
959  *	   < 0 if we should exit to userspace, where the return value indicates
960  *	   an error
961  */
962 static int check_vcpu_requests(struct kvm_vcpu *vcpu)
963 {
964 	if (kvm_request_pending(vcpu)) {
965 		if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
966 			kvm_vcpu_sleep(vcpu);
967 
968 		if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
969 			kvm_reset_vcpu(vcpu);
970 
971 		/*
972 		 * Clear IRQ_PENDING requests that were made to guarantee
973 		 * that a VCPU sees new virtual interrupts.
974 		 */
975 		kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
976 
977 		if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
978 			kvm_update_stolen_time(vcpu);
979 
980 		if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
981 			/* The distributor enable bits were changed */
982 			preempt_disable();
983 			vgic_v4_put(vcpu);
984 			vgic_v4_load(vcpu);
985 			preempt_enable();
986 		}
987 
988 		if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
989 			kvm_vcpu_reload_pmu(vcpu);
990 
991 		if (kvm_check_request(KVM_REQ_RESYNC_PMU_EL0, vcpu))
992 			kvm_vcpu_pmu_restore_guest(vcpu);
993 
994 		if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
995 			return kvm_vcpu_suspend(vcpu);
996 
997 		if (kvm_dirty_ring_check_request(vcpu))
998 			return 0;
999 	}
1000 
1001 	return 1;
1002 }
1003 
1004 static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu)
1005 {
1006 	if (likely(!vcpu_mode_is_32bit(vcpu)))
1007 		return false;
1008 
1009 	if (vcpu_has_nv(vcpu))
1010 		return true;
1011 
1012 	return !kvm_supports_32bit_el0();
1013 }
1014 
1015 /**
1016  * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest
1017  * @vcpu:	The VCPU pointer
1018  * @ret:	Pointer to write optional return code
1019  *
1020  * Returns: true if the VCPU needs to return to a preemptible + interruptible
1021  *	    and skip guest entry.
1022  *
1023  * This function disambiguates between two different types of exits: exits to a
1024  * preemptible + interruptible kernel context and exits to userspace. For an
1025  * exit to userspace, this function will write the return code to ret and return
1026  * true. For an exit to preemptible + interruptible kernel context (i.e. check
1027  * for pending work and re-enter), return true without writing to ret.
1028  */
1029 static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret)
1030 {
1031 	struct kvm_run *run = vcpu->run;
1032 
1033 	/*
1034 	 * If we're using a userspace irqchip, then check if we need
1035 	 * to tell a userspace irqchip about timer or PMU level
1036 	 * changes and if so, exit to userspace (the actual level
1037 	 * state gets updated in kvm_timer_update_run and
1038 	 * kvm_pmu_update_run below).
1039 	 */
1040 	if (static_branch_unlikely(&userspace_irqchip_in_use)) {
1041 		if (kvm_timer_should_notify_user(vcpu) ||
1042 		    kvm_pmu_should_notify_user(vcpu)) {
1043 			*ret = -EINTR;
1044 			run->exit_reason = KVM_EXIT_INTR;
1045 			return true;
1046 		}
1047 	}
1048 
1049 	if (unlikely(vcpu_on_unsupported_cpu(vcpu))) {
1050 		run->exit_reason = KVM_EXIT_FAIL_ENTRY;
1051 		run->fail_entry.hardware_entry_failure_reason = KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED;
1052 		run->fail_entry.cpu = smp_processor_id();
1053 		*ret = 0;
1054 		return true;
1055 	}
1056 
1057 	return kvm_request_pending(vcpu) ||
1058 			xfer_to_guest_mode_work_pending();
1059 }
1060 
1061 /*
1062  * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while
1063  * the vCPU is running.
1064  *
1065  * This must be noinstr as instrumentation may make use of RCU, and this is not
1066  * safe during the EQS.
1067  */
1068 static int noinstr kvm_arm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
1069 {
1070 	int ret;
1071 
1072 	guest_state_enter_irqoff();
1073 	ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu);
1074 	guest_state_exit_irqoff();
1075 
1076 	return ret;
1077 }
1078 
/**
 * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
 * @vcpu:	The VCPU pointer
 *
 * This function is called through the VCPU_RUN ioctl called from user space. It
 * will execute VM code in a loop until the time slice for the process is used
 * or some emulation is needed from user space in which case the function will
 * return with return value 0 and with the kvm_run structure filled in with the
 * required data for the requested emulation.
 *
 * Return: 0 when user space needs to act on the kvm_run structure, -EINTR when
 * run->immediate_exit was set, or otherwise the non-positive value that ended
 * the run loop (or that kvm_handle_mmio_return() reported before entering it).
 */
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
	struct kvm_run *run = vcpu->run;
	int ret;

	/*
	 * The previous exit was an MMIO access: let kvm_handle_mmio_return()
	 * process it first, and bail out if it reports a value <= 0.
	 */
	if (run->exit_reason == KVM_EXIT_MMIO) {
		ret = kvm_handle_mmio_return(vcpu);
		if (ret <= 0)
			return ret;
	}

	vcpu_load(vcpu);

	/*
	 * Jumping to 'out' here skips kvm_sigset_deactivate(), which is fine
	 * as kvm_sigset_activate() has not been called yet.
	 */
	if (run->immediate_exit) {
		ret = -EINTR;
		goto out;
	}

	kvm_sigset_activate(vcpu);

	/* Prime ret so that the loop below runs; any value <= 0 ends it. */
	ret = 1;
	run->exit_reason = KVM_EXIT_UNKNOWN;
	run->flags = 0;
	while (ret > 0) {
		/*
		 * Check conditions before entering the guest
		 */
		ret = xfer_to_guest_mode_handle_work(vcpu);
		if (!ret)
			ret = 1;

		if (ret > 0)
			ret = check_vcpu_requests(vcpu);

		/*
		 * Preparing the interrupts to be injected also
		 * involves poking the GIC, which must be done in a
		 * non-preemptible context.
		 */
		preempt_disable();

		/*
		 * The VMID allocator only tracks active VMIDs per
		 * physical CPU, and therefore the VMID allocated may not be
		 * preserved on VMID roll-over if the task was preempted,
		 * making a thread's VMID inactive. So we need to call
		 * kvm_arm_vmid_update() in non-preemptible context.
		 */
		if (kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid) &&
		    has_vhe())
			__load_stage2(vcpu->arch.hw_mmu,
				      vcpu->arch.hw_mmu->arch);

		kvm_pmu_flush_hwstate(vcpu);

		local_irq_disable();

		kvm_vgic_flush_hwstate(vcpu);

		kvm_pmu_update_vcpu_events(vcpu);

		/*
		 * Ensure we set mode to IN_GUEST_MODE after we disable
		 * interrupts and before the final VCPU requests check.
		 * See the comment in kvm_vcpu_exiting_guest_mode() and
		 * Documentation/virt/kvm/vcpu-requests.rst
		 */
		smp_store_mb(vcpu->mode, IN_GUEST_MODE);

		/*
		 * Either an earlier check already failed (ret <= 0) or an exit
		 * was requested at the last moment: undo the flush steps above
		 * and let the loop condition decide whether to try again.
		 */
		if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) {
			vcpu->mode = OUTSIDE_GUEST_MODE;
			isb(); /* Ensure work in x_flush_hwstate is committed */
			kvm_pmu_sync_hwstate(vcpu);
			if (static_branch_unlikely(&userspace_irqchip_in_use))
				kvm_timer_sync_user(vcpu);
			kvm_vgic_sync_hwstate(vcpu);
			local_irq_enable();
			preempt_enable();
			continue;
		}

		kvm_arm_setup_debug(vcpu);
		kvm_arch_vcpu_ctxflush_fp(vcpu);

		/**************************************************************
		 * Enter the guest
		 */
		trace_kvm_entry(*vcpu_pc(vcpu));
		guest_timing_enter_irqoff();

		ret = kvm_arm_vcpu_enter_exit(vcpu);

		vcpu->mode = OUTSIDE_GUEST_MODE;
		vcpu->stat.exits++;
		/*
		 * Back from guest
		 *************************************************************/

		kvm_arm_clear_debug(vcpu);

		/*
		 * We must sync the PMU state before the vgic state so
		 * that the vgic can properly sample the updated state of the
		 * interrupt line.
		 */
		kvm_pmu_sync_hwstate(vcpu);

		/*
		 * Sync the vgic state before syncing the timer state because
		 * the timer code needs to know if the virtual timer
		 * interrupts are active.
		 */
		kvm_vgic_sync_hwstate(vcpu);

		/*
		 * Sync the timer hardware state before enabling interrupts as
		 * we don't want vtimer interrupts to race with syncing the
		 * timer virtual interrupt state.
		 */
		if (static_branch_unlikely(&userspace_irqchip_in_use))
			kvm_timer_sync_user(vcpu);

		kvm_arch_vcpu_ctxsync_fp(vcpu);

		/*
		 * We must ensure that any pending interrupts are taken before
		 * we exit guest timing so that timer ticks are accounted as
		 * guest time. Transiently unmask interrupts so that any
		 * pending interrupts are taken.
		 *
		 * Per ARM DDI 0487G.b section D1.13.4, an ISB (or other
		 * context synchronization event) is necessary to ensure that
		 * pending interrupts are taken.
		 */
		if (ARM_EXCEPTION_CODE(ret) == ARM_EXCEPTION_IRQ) {
			local_irq_enable();
			isb();
			local_irq_disable();
		}

		guest_timing_exit_irqoff();

		local_irq_enable();

		trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));

		/* Exit types that need handling before we can be preempted */
		handle_exit_early(vcpu, ret);

		preempt_enable();

		/*
		 * The ARMv8 architecture doesn't give the hypervisor
		 * a mechanism to prevent a guest from dropping to AArch32 EL0
		 * if implemented by the CPU. If we spot the guest in such
		 * state and that we decided it wasn't supposed to do so (like
		 * with the asymmetric AArch32 case), return to userspace with
		 * a fatal error.
		 */
		if (vcpu_mode_is_bad_32bit(vcpu)) {
			/*
			 * As we have caught the guest red-handed, decide that
			 * it isn't fit for purpose anymore by making the vcpu
			 * invalid. The VMM can try and fix it by issuing a
			 * KVM_ARM_VCPU_INIT if it really wants to.
			 */
			vcpu_clear_flag(vcpu, VCPU_INITIALIZED);
			ret = ARM_EXCEPTION_IL;
		}

		/* The value returned here decides whether we loop again. */
		ret = handle_exit(vcpu, ret);
	}

	/* Tell userspace about in-kernel device output levels */
	if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
		kvm_timer_update_run(vcpu);
		kvm_pmu_update_run(vcpu);
	}

	kvm_sigset_deactivate(vcpu);

out:
	/*
	 * In the unlikely event that we are returning to userspace
	 * with pending exceptions or PC adjustment, commit these
	 * adjustments in order to give userspace a consistent view of
	 * the vcpu state. Note that this relies on __kvm_adjust_pc()
	 * being preempt-safe on VHE.
	 */
	if (unlikely(vcpu_get_flag(vcpu, PENDING_EXCEPTION) ||
		     vcpu_get_flag(vcpu, INCREMENT_PC)))
		kvm_call_hyp(__kvm_adjust_pc, vcpu);

	vcpu_put(vcpu);
	return ret;
}
1285 
1286 static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
1287 {
1288 	int bit_index;
1289 	bool set;
1290 	unsigned long *hcr;
1291 
1292 	if (number == KVM_ARM_IRQ_CPU_IRQ)
1293 		bit_index = __ffs(HCR_VI);
1294 	else /* KVM_ARM_IRQ_CPU_FIQ */
1295 		bit_index = __ffs(HCR_VF);
1296 
1297 	hcr = vcpu_hcr(vcpu);
1298 	if (level)
1299 		set = test_and_set_bit(bit_index, hcr);
1300 	else
1301 		set = test_and_clear_bit(bit_index, hcr);
1302 
1303 	/*
1304 	 * If we didn't change anything, no need to wake up or kick other CPUs
1305 	 */
1306 	if (set == level)
1307 		return 0;
1308 
1309 	/*
1310 	 * The vcpu irq_lines field was updated, wake up sleeping VCPUs and
1311 	 * trigger a world-switch round on the running physical CPU to set the
1312 	 * virtual IRQ/FIQ fields in the HCR appropriately.
1313 	 */
1314 	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
1315 	kvm_vcpu_kick(vcpu);
1316 
1317 	return 0;
1318 }
1319 
1320 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
1321 			  bool line_status)
1322 {
1323 	u32 irq = irq_level->irq;
1324 	unsigned int irq_type, vcpu_id, irq_num;
1325 	struct kvm_vcpu *vcpu = NULL;
1326 	bool level = irq_level->level;
1327 
1328 	irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
1329 	vcpu_id = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
1330 	vcpu_id += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1);
1331 	irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK;
1332 
1333 	trace_kvm_irq_line(irq_type, vcpu_id, irq_num, irq_level->level);
1334 
1335 	switch (irq_type) {
1336 	case KVM_ARM_IRQ_TYPE_CPU:
1337 		if (irqchip_in_kernel(kvm))
1338 			return -ENXIO;
1339 
1340 		vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
1341 		if (!vcpu)
1342 			return -EINVAL;
1343 
1344 		if (irq_num > KVM_ARM_IRQ_CPU_FIQ)
1345 			return -EINVAL;
1346 
1347 		return vcpu_interrupt_line(vcpu, irq_num, level);
1348 	case KVM_ARM_IRQ_TYPE_PPI:
1349 		if (!irqchip_in_kernel(kvm))
1350 			return -ENXIO;
1351 
1352 		vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
1353 		if (!vcpu)
1354 			return -EINVAL;
1355 
1356 		if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
1357 			return -EINVAL;
1358 
1359 		return kvm_vgic_inject_irq(kvm, vcpu, irq_num, level, NULL);
1360 	case KVM_ARM_IRQ_TYPE_SPI:
1361 		if (!irqchip_in_kernel(kvm))
1362 			return -ENXIO;
1363 
1364 		if (irq_num < VGIC_NR_PRIVATE_IRQS)
1365 			return -EINVAL;
1366 
1367 		return kvm_vgic_inject_irq(kvm, NULL, irq_num, level, NULL);
1368 	}
1369 
1370 	return -EINVAL;
1371 }
1372 
1373 static unsigned long system_supported_vcpu_features(void)
1374 {
1375 	unsigned long features = KVM_VCPU_VALID_FEATURES;
1376 
1377 	if (!cpus_have_final_cap(ARM64_HAS_32BIT_EL1))
1378 		clear_bit(KVM_ARM_VCPU_EL1_32BIT, &features);
1379 
1380 	if (!kvm_arm_support_pmu_v3())
1381 		clear_bit(KVM_ARM_VCPU_PMU_V3, &features);
1382 
1383 	if (!system_supports_sve())
1384 		clear_bit(KVM_ARM_VCPU_SVE, &features);
1385 
1386 	if (!kvm_has_full_ptr_auth()) {
1387 		clear_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features);
1388 		clear_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features);
1389 	}
1390 
1391 	if (!cpus_have_final_cap(ARM64_HAS_NESTED_VIRT))
1392 		clear_bit(KVM_ARM_VCPU_HAS_EL2, &features);
1393 
1394 	return features;
1395 }
1396 
1397 static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu,
1398 					const struct kvm_vcpu_init *init)
1399 {
1400 	unsigned long features = init->features[0];
1401 	int i;
1402 
1403 	if (features & ~KVM_VCPU_VALID_FEATURES)
1404 		return -ENOENT;
1405 
1406 	for (i = 1; i < ARRAY_SIZE(init->features); i++) {
1407 		if (init->features[i])
1408 			return -ENOENT;
1409 	}
1410 
1411 	if (features & ~system_supported_vcpu_features())
1412 		return -EINVAL;
1413 
1414 	/*
1415 	 * For now make sure that both address/generic pointer authentication
1416 	 * features are requested by the userspace together.
1417 	 */
1418 	if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features) !=
1419 	    test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features))
1420 		return -EINVAL;
1421 
1422 	/* Disallow NV+SVE for the time being */
1423 	if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features) &&
1424 	    test_bit(KVM_ARM_VCPU_SVE, &features))
1425 		return -EINVAL;
1426 
1427 	if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features))
1428 		return 0;
1429 
1430 	/* MTE is incompatible with AArch32 */
1431 	if (kvm_has_mte(vcpu->kvm))
1432 		return -EINVAL;
1433 
1434 	/* NV is incompatible with AArch32 */
1435 	if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features))
1436 		return -EINVAL;
1437 
1438 	return 0;
1439 }
1440 
1441 static bool kvm_vcpu_init_changed(struct kvm_vcpu *vcpu,
1442 				  const struct kvm_vcpu_init *init)
1443 {
1444 	unsigned long features = init->features[0];
1445 
1446 	return !bitmap_equal(vcpu->kvm->arch.vcpu_features, &features,
1447 			     KVM_VCPU_MAX_FEATURES);
1448 }
1449 
1450 static int kvm_setup_vcpu(struct kvm_vcpu *vcpu)
1451 {
1452 	struct kvm *kvm = vcpu->kvm;
1453 	int ret = 0;
1454 
1455 	/*
1456 	 * When the vCPU has a PMU, but no PMU is set for the guest
1457 	 * yet, set the default one.
1458 	 */
1459 	if (kvm_vcpu_has_pmu(vcpu) && !kvm->arch.arm_pmu)
1460 		ret = kvm_arm_set_default_pmu(kvm);
1461 
1462 	return ret;
1463 }
1464 
1465 static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
1466 				 const struct kvm_vcpu_init *init)
1467 {
1468 	unsigned long features = init->features[0];
1469 	struct kvm *kvm = vcpu->kvm;
1470 	int ret = -EINVAL;
1471 
1472 	mutex_lock(&kvm->arch.config_lock);
1473 
1474 	if (test_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags) &&
1475 	    kvm_vcpu_init_changed(vcpu, init))
1476 		goto out_unlock;
1477 
1478 	bitmap_copy(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES);
1479 
1480 	ret = kvm_setup_vcpu(vcpu);
1481 	if (ret)
1482 		goto out_unlock;
1483 
1484 	/* Now we know what it is, we can reset it. */
1485 	kvm_reset_vcpu(vcpu);
1486 
1487 	set_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags);
1488 	vcpu_set_flag(vcpu, VCPU_INITIALIZED);
1489 	ret = 0;
1490 out_unlock:
1491 	mutex_unlock(&kvm->arch.config_lock);
1492 	return ret;
1493 }
1494 
1495 static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
1496 			       const struct kvm_vcpu_init *init)
1497 {
1498 	int ret;
1499 
1500 	if (init->target != KVM_ARM_TARGET_GENERIC_V8 &&
1501 	    init->target != kvm_target_cpu())
1502 		return -EINVAL;
1503 
1504 	ret = kvm_vcpu_init_check_features(vcpu, init);
1505 	if (ret)
1506 		return ret;
1507 
1508 	if (!kvm_vcpu_initialized(vcpu))
1509 		return __kvm_vcpu_set_target(vcpu, init);
1510 
1511 	if (kvm_vcpu_init_changed(vcpu, init))
1512 		return -EINVAL;
1513 
1514 	kvm_reset_vcpu(vcpu);
1515 	return 0;
1516 }
1517 
1518 static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
1519 					 struct kvm_vcpu_init *init)
1520 {
1521 	bool power_off = false;
1522 	int ret;
1523 
1524 	/*
1525 	 * Treat the power-off vCPU feature as ephemeral. Clear the bit to avoid
1526 	 * reflecting it in the finalized feature set, thus limiting its scope
1527 	 * to a single KVM_ARM_VCPU_INIT call.
1528 	 */
1529 	if (init->features[0] & BIT(KVM_ARM_VCPU_POWER_OFF)) {
1530 		init->features[0] &= ~BIT(KVM_ARM_VCPU_POWER_OFF);
1531 		power_off = true;
1532 	}
1533 
1534 	ret = kvm_vcpu_set_target(vcpu, init);
1535 	if (ret)
1536 		return ret;
1537 
1538 	/*
1539 	 * Ensure a rebooted VM will fault in RAM pages and detect if the
1540 	 * guest MMU is turned off and flush the caches as needed.
1541 	 *
1542 	 * S2FWB enforces all memory accesses to RAM being cacheable,
1543 	 * ensuring that the data side is always coherent. We still
1544 	 * need to invalidate the I-cache though, as FWB does *not*
1545 	 * imply CTR_EL0.DIC.
1546 	 */
1547 	if (vcpu_has_run_once(vcpu)) {
1548 		if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
1549 			stage2_unmap_vm(vcpu->kvm);
1550 		else
1551 			icache_inval_all_pou();
1552 	}
1553 
1554 	vcpu_reset_hcr(vcpu);
1555 	vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu);
1556 
1557 	/*
1558 	 * Handle the "start in power-off" case.
1559 	 */
1560 	spin_lock(&vcpu->arch.mp_state_lock);
1561 
1562 	if (power_off)
1563 		__kvm_arm_vcpu_power_off(vcpu);
1564 	else
1565 		WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE);
1566 
1567 	spin_unlock(&vcpu->arch.mp_state_lock);
1568 
1569 	return 0;
1570 }
1571 
/*
 * Set a vCPU device attribute. No attribute group is handled at this level:
 * every request is forwarded to kvm_arm_vcpu_arch_set_attr().
 */
static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
				 struct kvm_device_attr *attr)
{
	return kvm_arm_vcpu_arch_set_attr(vcpu, attr);
}
1585 
/*
 * Get a vCPU device attribute. No attribute group is handled at this level:
 * every request is forwarded to kvm_arm_vcpu_arch_get_attr().
 */
static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
				 struct kvm_device_attr *attr)
{
	return kvm_arm_vcpu_arch_get_attr(vcpu, attr);
}
1599 
/*
 * Query a vCPU device attribute. No attribute group is handled at this level:
 * every request is forwarded to kvm_arm_vcpu_arch_has_attr().
 */
static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
				 struct kvm_device_attr *attr)
{
	return kvm_arm_vcpu_arch_has_attr(vcpu, attr);
}
1613 
/*
 * Fill @events for the KVM_GET_VCPU_EVENTS ioctl.
 *
 * Returns whatever __kvm_arm_vcpu_get_events() returns.
 */
static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
				   struct kvm_vcpu_events *events)
{
	/*
	 * Start from an all-zero structure: the ioctl handler copies the whole
	 * struct back to userspace, so nothing may be left uninitialised.
	 */
	memset(events, 0, sizeof(*events));

	return __kvm_arm_vcpu_get_events(vcpu, events);
}
1621 
1622 static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
1623 				   struct kvm_vcpu_events *events)
1624 {
1625 	int i;
1626 
1627 	/* check whether the reserved field is zero */
1628 	for (i = 0; i < ARRAY_SIZE(events->reserved); i++)
1629 		if (events->reserved[i])
1630 			return -EINVAL;
1631 
1632 	/* check whether the pad field is zero */
1633 	for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++)
1634 		if (events->exception.pad[i])
1635 			return -EINVAL;
1636 
1637 	return __kvm_arm_vcpu_set_events(vcpu, events);
1638 }
1639 
/*
 * Dispatch the arm64-specific vCPU ioctls.
 *
 * @filp->private_data is the vCPU and @arg is treated as a userspace pointer
 * to the ioctl payload. Returns 0 or a handler-specific value on success,
 * -EFAULT when copying to/from userspace fails, -ENOEXEC when an ioctl that
 * requires it is issued on a vCPU that has not been initialised, and -EINVAL
 * for an unknown ioctl.
 *
 * NOTE(review): the device-attr and vcpu-events cases do not check
 * kvm_vcpu_initialized() here, unlike ONE_REG, REG_LIST and FINALIZE -
 * confirm that their helpers are fine with an uninitialised vCPU.
 */
long kvm_arch_vcpu_ioctl(struct file *filp,
			 unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;
	struct kvm_device_attr attr;
	long r;

	switch (ioctl) {
	case KVM_ARM_VCPU_INIT: {
		struct kvm_vcpu_init init;

		r = -EFAULT;
		if (copy_from_user(&init, argp, sizeof(init)))
			break;

		r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
		break;
	}
	case KVM_SET_ONE_REG:
	case KVM_GET_ONE_REG: {
		struct kvm_one_reg reg;

		r = -ENOEXEC;
		if (unlikely(!kvm_vcpu_initialized(vcpu)))
			break;

		r = -EFAULT;
		if (copy_from_user(&reg, argp, sizeof(reg)))
			break;

		/*
		 * We could owe a reset due to PSCI. Handle the pending reset
		 * here to ensure userspace register accesses are ordered after
		 * the reset.
		 */
		if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
			kvm_reset_vcpu(vcpu);

		if (ioctl == KVM_SET_ONE_REG)
			r = kvm_arm_set_reg(vcpu, &reg);
		else
			r = kvm_arm_get_reg(vcpu, &reg);
		break;
	}
	case KVM_GET_REG_LIST: {
		struct kvm_reg_list __user *user_list = argp;
		struct kvm_reg_list reg_list;
		unsigned n;

		r = -ENOEXEC;
		if (unlikely(!kvm_vcpu_initialized(vcpu)))
			break;

		r = -EPERM;
		if (!kvm_arm_vcpu_is_finalized(vcpu))
			break;

		r = -EFAULT;
		if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
			break;
		/*
		 * Remember how many entries userspace has room for, then
		 * always report the real count back, so that a caller whose
		 * buffer is too small (-E2BIG below) learns the size it needs.
		 */
		n = reg_list.n;
		reg_list.n = kvm_arm_num_regs(vcpu);
		if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
			break;
		r = -E2BIG;
		if (n < reg_list.n)
			break;
		r = kvm_arm_copy_reg_indices(vcpu, user_list->reg);
		break;
	}
	case KVM_SET_DEVICE_ATTR: {
		r = -EFAULT;
		if (copy_from_user(&attr, argp, sizeof(attr)))
			break;
		r = kvm_arm_vcpu_set_attr(vcpu, &attr);
		break;
	}
	case KVM_GET_DEVICE_ATTR: {
		r = -EFAULT;
		if (copy_from_user(&attr, argp, sizeof(attr)))
			break;
		r = kvm_arm_vcpu_get_attr(vcpu, &attr);
		break;
	}
	case KVM_HAS_DEVICE_ATTR: {
		r = -EFAULT;
		if (copy_from_user(&attr, argp, sizeof(attr)))
			break;
		r = kvm_arm_vcpu_has_attr(vcpu, &attr);
		break;
	}
	case KVM_GET_VCPU_EVENTS: {
		struct kvm_vcpu_events events;

		/* Any failure of the helper is reported as -EINVAL. */
		if (kvm_arm_vcpu_get_events(vcpu, &events))
			return -EINVAL;

		if (copy_to_user(argp, &events, sizeof(events)))
			return -EFAULT;

		return 0;
	}
	case KVM_SET_VCPU_EVENTS: {
		struct kvm_vcpu_events events;

		if (copy_from_user(&events, argp, sizeof(events)))
			return -EFAULT;

		return kvm_arm_vcpu_set_events(vcpu, &events);
	}
	case KVM_ARM_VCPU_FINALIZE: {
		int what;

		if (!kvm_vcpu_initialized(vcpu))
			return -ENOEXEC;

		if (get_user(what, (const int __user *)argp))
			return -EFAULT;

		return kvm_arm_vcpu_finalize(vcpu, what);
	}
	default:
		r = -EINVAL;
	}

	return r;
}
1768 
/* Dirty-log sync hook: this implementation does nothing for @memslot. */
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{

}
1773 
1774 static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
1775 					struct kvm_arm_device_addr *dev_addr)
1776 {
1777 	switch (FIELD_GET(KVM_ARM_DEVICE_ID_MASK, dev_addr->id)) {
1778 	case KVM_ARM_DEVICE_VGIC_V2:
1779 		if (!vgic_present)
1780 			return -ENXIO;
1781 		return kvm_set_legacy_vgic_v2_addr(kvm, dev_addr);
1782 	default:
1783 		return -ENODEV;
1784 	}
1785 }
1786 
1787 static int kvm_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
1788 {
1789 	switch (attr->group) {
1790 	case KVM_ARM_VM_SMCCC_CTRL:
1791 		return kvm_vm_smccc_has_attr(kvm, attr);
1792 	default:
1793 		return -ENXIO;
1794 	}
1795 }
1796 
1797 static int kvm_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
1798 {
1799 	switch (attr->group) {
1800 	case KVM_ARM_VM_SMCCC_CTRL:
1801 		return kvm_vm_smccc_set_attr(kvm, attr);
1802 	default:
1803 		return -ENXIO;
1804 	}
1805 }
1806 
1807 int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
1808 {
1809 	struct kvm *kvm = filp->private_data;
1810 	void __user *argp = (void __user *)arg;
1811 	struct kvm_device_attr attr;
1812 
1813 	switch (ioctl) {
1814 	case KVM_CREATE_IRQCHIP: {
1815 		int ret;
1816 		if (!vgic_present)
1817 			return -ENXIO;
1818 		mutex_lock(&kvm->lock);
1819 		ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
1820 		mutex_unlock(&kvm->lock);
1821 		return ret;
1822 	}
1823 	case KVM_ARM_SET_DEVICE_ADDR: {
1824 		struct kvm_arm_device_addr dev_addr;
1825 
1826 		if (copy_from_user(&dev_addr, argp, sizeof(dev_addr)))
1827 			return -EFAULT;
1828 		return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
1829 	}
1830 	case KVM_ARM_PREFERRED_TARGET: {
1831 		struct kvm_vcpu_init init = {
1832 			.target = KVM_ARM_TARGET_GENERIC_V8,
1833 		};
1834 
1835 		if (copy_to_user(argp, &init, sizeof(init)))
1836 			return -EFAULT;
1837 
1838 		return 0;
1839 	}
1840 	case KVM_ARM_MTE_COPY_TAGS: {
1841 		struct kvm_arm_copy_mte_tags copy_tags;
1842 
1843 		if (copy_from_user(&copy_tags, argp, sizeof(copy_tags)))
1844 			return -EFAULT;
1845 		return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags);
1846 	}
1847 	case KVM_ARM_SET_COUNTER_OFFSET: {
1848 		struct kvm_arm_counter_offset offset;
1849 
1850 		if (copy_from_user(&offset, argp, sizeof(offset)))
1851 			return -EFAULT;
1852 		return kvm_vm_ioctl_set_counter_offset(kvm, &offset);
1853 	}
1854 	case KVM_HAS_DEVICE_ATTR: {
1855 		if (copy_from_user(&attr, argp, sizeof(attr)))
1856 			return -EFAULT;
1857 
1858 		return kvm_vm_has_attr(kvm, &attr);
1859 	}
1860 	case KVM_SET_DEVICE_ATTR: {
1861 		if (copy_from_user(&attr, argp, sizeof(attr)))
1862 			return -EFAULT;
1863 
1864 		return kvm_vm_set_attr(kvm, &attr);
1865 	}
1866 	case KVM_ARM_GET_REG_WRITABLE_MASKS: {
1867 		struct reg_mask_range range;
1868 
1869 		if (copy_from_user(&range, argp, sizeof(range)))
1870 			return -EFAULT;
1871 		return kvm_vm_ioctl_get_reg_writable_masks(kvm, &range);
1872 	}
1873 	default:
1874 		return -EINVAL;
1875 	}
1876 }
1877 
1878 /* unlocks vcpus from @vcpu_lock_idx and smaller */
1879 static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
1880 {
1881 	struct kvm_vcpu *tmp_vcpu;
1882 
1883 	for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
1884 		tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
1885 		mutex_unlock(&tmp_vcpu->mutex);
1886 	}
1887 }
1888 
1889 void unlock_all_vcpus(struct kvm *kvm)
1890 {
1891 	lockdep_assert_held(&kvm->lock);
1892 
1893 	unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
1894 }
1895 
1896 /* Returns true if all vcpus were locked, false otherwise */
1897 bool lock_all_vcpus(struct kvm *kvm)
1898 {
1899 	struct kvm_vcpu *tmp_vcpu;
1900 	unsigned long c;
1901 
1902 	lockdep_assert_held(&kvm->lock);
1903 
1904 	/*
1905 	 * Any time a vcpu is in an ioctl (including running), the
1906 	 * core KVM code tries to grab the vcpu->mutex.
1907 	 *
1908 	 * By grabbing the vcpu->mutex of all VCPUs we ensure that no
1909 	 * other VCPUs can fiddle with the state while we access it.
1910 	 */
1911 	kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
1912 		if (!mutex_trylock(&tmp_vcpu->mutex)) {
1913 			unlock_vcpus(kvm, c - 1);
1914 			return false;
1915 		}
1916 	}
1917 
1918 	return true;
1919 }
1920 
1921 static unsigned long nvhe_percpu_size(void)
1922 {
1923 	return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
1924 		(unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start);
1925 }
1926 
/*
 * Page allocation order needed to hold the nVHE per-CPU region; 0 when the
 * region is empty.
 */
static unsigned long nvhe_percpu_order(void)
{
	unsigned long bytes = nvhe_percpu_size();

	if (!bytes)
		return 0;

	return get_order(bytes);
}
1933 
1934 static size_t pkvm_host_sve_state_order(void)
1935 {
1936 	return get_order(pkvm_host_sve_state_size());
1937 }
1938 
1939 /* A lookup table holding the hypervisor VA for each vector slot */
1940 static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS];
1941 
1942 static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot)
1943 {
1944 	hyp_spectre_vector_selector[slot] = __kvm_vector_slot2addr(base, slot);
1945 }
1946 
1947 static int kvm_init_vector_slots(void)
1948 {
1949 	int err;
1950 	void *base;
1951 
1952 	base = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
1953 	kvm_init_vector_slot(base, HYP_VECTOR_DIRECT);
1954 
1955 	base = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs));
1956 	kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_DIRECT);
1957 
1958 	if (kvm_system_needs_idmapped_vectors() &&
1959 	    !is_protected_kvm_enabled()) {
1960 		err = create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs),
1961 					       __BP_HARDEN_HYP_VECS_SZ, &base);
1962 		if (err)
1963 			return err;
1964 	}
1965 
1966 	kvm_init_vector_slot(base, HYP_VECTOR_INDIRECT);
1967 	kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_INDIRECT);
1968 	return 0;
1969 }
1970 
1971 static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
1972 {
1973 	struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
1974 	u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
1975 	unsigned long tcr;
1976 
1977 	/*
1978 	 * Calculate the raw per-cpu offset without a translation from the
1979 	 * kernel's mapping to the linear mapping, and store it in tpidr_el2
1980 	 * so that we can use adr_l to access per-cpu variables in EL2.
1981 	 * Also drop the KASAN tag which gets in the way...
1982 	 */
1983 	params->tpidr_el2 = (unsigned long)kasan_reset_tag(per_cpu_ptr_nvhe_sym(__per_cpu_start, cpu)) -
1984 			    (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
1985 
1986 	params->mair_el2 = read_sysreg(mair_el1);
1987 
1988 	tcr = read_sysreg(tcr_el1);
1989 	if (cpus_have_final_cap(ARM64_KVM_HVHE)) {
1990 		tcr |= TCR_EPD1_MASK;
1991 	} else {
1992 		tcr &= TCR_EL2_MASK;
1993 		tcr |= TCR_EL2_RES1;
1994 	}
1995 	tcr &= ~TCR_T0SZ_MASK;
1996 	tcr |= TCR_T0SZ(hyp_va_bits);
1997 	tcr &= ~TCR_EL2_PS_MASK;
1998 	tcr |= FIELD_PREP(TCR_EL2_PS_MASK, kvm_get_parange(mmfr0));
1999 	if (kvm_lpa2_is_enabled())
2000 		tcr |= TCR_EL2_DS;
2001 	params->tcr_el2 = tcr;
2002 
2003 	params->pgd_pa = kvm_mmu_get_httbr();
2004 	if (is_protected_kvm_enabled())
2005 		params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
2006 	else
2007 		params->hcr_el2 = HCR_HOST_NVHE_FLAGS;
2008 	if (cpus_have_final_cap(ARM64_KVM_HVHE))
2009 		params->hcr_el2 |= HCR_E2H;
2010 	params->vttbr = params->vtcr = 0;
2011 
2012 	/*
2013 	 * Flush the init params from the data cache because the struct will
2014 	 * be read while the MMU is off.
2015 	 */
2016 	kvm_flush_dcache_to_poc(params, sizeof(*params));
2017 }
2018 
/*
 * Replace the HYP stub vectors on this CPU with KVM's init vector and run
 * __kvm_hyp_init through an HVC, passing the physical address of this CPU's
 * kvm_init_params. A non-success result from the call only triggers a WARN.
 */
static void hyp_install_host_vector(void)
{
	struct kvm_nvhe_init_params *params;
	struct arm_smccc_res res;

	/* Switch from the HYP stub to our own HYP init vector */
	__hyp_set_vectors(kvm_get_idmap_vector());

	/*
	 * Call initialization code, and switch to the full blown HYP code.
	 * If the cpucaps haven't been finalized yet, something has gone very
	 * wrong, and hyp will crash and burn when it uses any
	 * cpus_have_*_cap() wrapper.
	 */
	BUG_ON(!system_capabilities_finalized());
	params = this_cpu_ptr_nvhe_sym(kvm_init_params);
	arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res);
	WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
}
2038 
2039 static void cpu_init_hyp_mode(void)
2040 {
2041 	hyp_install_host_vector();
2042 
2043 	/*
2044 	 * Disabling SSBD on a non-VHE system requires us to enable SSBS
2045 	 * at EL2.
2046 	 */
2047 	if (this_cpu_has_cap(ARM64_SSBS) &&
2048 	    arm64_get_spectre_v4_state() == SPECTRE_VULNERABLE) {
2049 		kvm_call_hyp_nvhe(__kvm_enable_ssbs);
2050 	}
2051 }
2052 
/*
 * Reset the HYP vectors on this CPU. Nothing is done when the kernel itself
 * runs in hyp mode.
 */
static void cpu_hyp_reset(void)
{
	if (is_kernel_in_hyp_mode())
		return;

	__hyp_reset_vectors();
}
2058 
2059 /*
2060  * EL2 vectors can be mapped and rerouted in a number of ways,
2061  * depending on the kernel configuration and CPU present:
2062  *
2063  * - If the CPU is affected by Spectre-v2, the hardening sequence is
2064  *   placed in one of the vector slots, which is executed before jumping
2065  *   to the real vectors.
2066  *
2067  * - If the CPU also has the ARM64_SPECTRE_V3A cap, the slot
2068  *   containing the hardening sequence is mapped next to the idmap page,
2069  *   and executed before jumping to the real vectors.
2070  *
2071  * - If the CPU only has the ARM64_SPECTRE_V3A cap, then an
2072  *   empty slot is selected, mapped next to the idmap page, and
2073  *   executed before jumping to the real vectors.
2074  *
2075  * Note that ARM64_SPECTRE_V3A is somewhat incompatible with
2076  * VHE, as we don't have hypervisor-specific mappings. If the system
2077  * is VHE and yet selects this capability, it will be ignored.
2078  */
2079 static void cpu_set_hyp_vector(void)
2080 {
2081 	struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);
2082 	void *vector = hyp_spectre_vector_selector[data->slot];
2083 
2084 	if (!is_protected_kvm_enabled())
2085 		*this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector;
2086 	else
2087 		kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot);
2088 }
2089 
2090 static void cpu_hyp_init_context(void)
2091 {
2092 	kvm_init_host_cpu_context(host_data_ptr(host_ctxt));
2093 
2094 	if (!is_kernel_in_hyp_mode())
2095 		cpu_init_hyp_mode();
2096 }
2097 
/*
 * Per-CPU feature setup: select the hyp vector, initialise debug state, and
 * initialise the VHE timer and the VGIC CPU hardware where applicable.
 */
static void cpu_hyp_init_features(void)
{
	cpu_set_hyp_vector();
	kvm_arm_init_debug();

	/* Only done when the kernel itself runs in hyp mode. */
	if (is_kernel_in_hyp_mode())
		kvm_timer_init_vhe();

	if (vgic_present)
		kvm_vgic_init_cpu_hardware();
}
2109 
/*
 * (Re)initialise hyp on this CPU from scratch: reset the vectors first, then
 * set up the context and the features again.
 */
static void cpu_hyp_reinit(void)
{
	cpu_hyp_reset();
	cpu_hyp_init_context();
	cpu_hyp_init_features();
}
2116 
2117 static void cpu_hyp_init(void *discard)
2118 {
2119 	if (!__this_cpu_read(kvm_hyp_initialized)) {
2120 		cpu_hyp_reinit();
2121 		__this_cpu_write(kvm_hyp_initialized, 1);
2122 	}
2123 }
2124 
2125 static void cpu_hyp_uninit(void *discard)
2126 {
2127 	if (__this_cpu_read(kvm_hyp_initialized)) {
2128 		cpu_hyp_reset();
2129 		__this_cpu_write(kvm_hyp_initialized, 0);
2130 	}
2131 }
2132 
/*
 * Enable KVM on the current CPU: initialise hyp if needed and bring up the
 * per-CPU VGIC and timer state. Always returns 0.
 */
int kvm_arch_hardware_enable(void)
{
	/*
	 * Most calls to this function are made with migration
	 * disabled, but not with preemption disabled. The former is
	 * enough to ensure correctness, but most of the helpers
	 * expect the latter and will throw a tantrum otherwise.
	 */
	preempt_disable();

	cpu_hyp_init(NULL);

	kvm_vgic_cpu_up();
	kvm_timer_cpu_up();

	preempt_enable();

	return 0;
}
2152 
2153 void kvm_arch_hardware_disable(void)
2154 {
2155 	kvm_timer_cpu_down();
2156 	kvm_vgic_cpu_down();
2157 
2158 	if (!is_protected_kvm_enabled())
2159 		cpu_hyp_uninit(NULL);
2160 }
2161 
2162 #ifdef CONFIG_CPU_PM
2163 static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
2164 				    unsigned long cmd,
2165 				    void *v)
2166 {
2167 	/*
2168 	 * kvm_hyp_initialized is left with its old value over
2169 	 * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
2170 	 * re-enable hyp.
2171 	 */
2172 	switch (cmd) {
2173 	case CPU_PM_ENTER:
2174 		if (__this_cpu_read(kvm_hyp_initialized))
2175 			/*
2176 			 * don't update kvm_hyp_initialized here
2177 			 * so that the hyp will be re-enabled
2178 			 * when we resume. See below.
2179 			 */
2180 			cpu_hyp_reset();
2181 
2182 		return NOTIFY_OK;
2183 	case CPU_PM_ENTER_FAILED:
2184 	case CPU_PM_EXIT:
2185 		if (__this_cpu_read(kvm_hyp_initialized))
2186 			/* The hyp was enabled before suspend. */
2187 			cpu_hyp_reinit();
2188 
2189 		return NOTIFY_OK;
2190 
2191 	default:
2192 		return NOTIFY_DONE;
2193 	}
2194 }
2195 
/* Notifier block that routes CPU PM events to hyp_init_cpu_pm_notifier(). */
static struct notifier_block hyp_init_cpu_pm_nb = {
	.notifier_call = hyp_init_cpu_pm_notifier,
};
2199 
2200 static void __init hyp_cpu_pm_init(void)
2201 {
2202 	if (!is_protected_kvm_enabled())
2203 		cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
2204 }
2205 static void __init hyp_cpu_pm_exit(void)
2206 {
2207 	if (!is_protected_kvm_enabled())
2208 		cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
2209 }
2210 #else
/* Without CONFIG_CPU_PM there is no notifier to register. */
static inline void __init hyp_cpu_pm_init(void)
{
}
2214 static inline void __init hyp_cpu_pm_exit(void)
2215 {
2216 }
2217 #endif
2218 
2219 static void __init init_cpu_logical_map(void)
2220 {
2221 	unsigned int cpu;
2222 
2223 	/*
2224 	 * Copy the MPIDR <-> logical CPU ID mapping to hyp.
2225 	 * Only copy the set of online CPUs whose features have been checked
2226 	 * against the finalized system capabilities. The hypervisor will not
2227 	 * allow any other CPUs from the `possible` set to boot.
2228 	 */
2229 	for_each_online_cpu(cpu)
2230 		hyp_cpu_logical_map[cpu] = cpu_logical_map(cpu);
2231 }
2232 
/*
 * Store psci_ops.<what> into the psci_0_1_<what>_implemented member of
 * @config.
 */
#define init_psci_0_1_impl_state(config, what)	\
	(config).psci_0_1_ ## what ## _implemented = psci_ops.what
2235 
2236 static bool __init init_psci_relay(void)
2237 {
2238 	/*
2239 	 * If PSCI has not been initialized, protected KVM cannot install
2240 	 * itself on newly booted CPUs.
2241 	 */
2242 	if (!psci_ops.get_version) {
2243 		kvm_err("Cannot initialize protected mode without PSCI\n");
2244 		return false;
2245 	}
2246 
2247 	kvm_host_psci_config.version = psci_ops.get_version();
2248 	kvm_host_psci_config.smccc_version = arm_smccc_get_version();
2249 
2250 	if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) {
2251 		kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids();
2252 		init_psci_0_1_impl_state(kvm_host_psci_config, cpu_suspend);
2253 		init_psci_0_1_impl_state(kvm_host_psci_config, cpu_on);
2254 		init_psci_0_1_impl_state(kvm_host_psci_config, cpu_off);
2255 		init_psci_0_1_impl_state(kvm_host_psci_config, migrate);
2256 	}
2257 	return true;
2258 }
2259 
2260 static int __init init_subsystems(void)
2261 {
2262 	int err = 0;
2263 
2264 	/*
2265 	 * Enable hardware so that subsystem initialisation can access EL2.
2266 	 */
2267 	on_each_cpu(cpu_hyp_init, NULL, 1);
2268 
2269 	/*
2270 	 * Register CPU lower-power notifier
2271 	 */
2272 	hyp_cpu_pm_init();
2273 
2274 	/*
2275 	 * Init HYP view of VGIC
2276 	 */
2277 	err = kvm_vgic_hyp_init();
2278 	switch (err) {
2279 	case 0:
2280 		vgic_present = true;
2281 		break;
2282 	case -ENODEV:
2283 	case -ENXIO:
2284 		vgic_present = false;
2285 		err = 0;
2286 		break;
2287 	default:
2288 		goto out;
2289 	}
2290 
2291 	/*
2292 	 * Init HYP architected timer support
2293 	 */
2294 	err = kvm_timer_hyp_init(vgic_present);
2295 	if (err)
2296 		goto out;
2297 
2298 	kvm_register_perf_callbacks(NULL);
2299 
2300 out:
2301 	if (err)
2302 		hyp_cpu_pm_exit();
2303 
2304 	if (err || !is_protected_kvm_enabled())
2305 		on_each_cpu(cpu_hyp_uninit, NULL, 1);
2306 
2307 	return err;
2308 }
2309 
2310 static void __init teardown_subsystems(void)
2311 {
2312 	kvm_unregister_perf_callbacks();
2313 	hyp_cpu_pm_exit();
2314 }
2315 
2316 static void __init teardown_hyp_mode(void)
2317 {
2318 	bool free_sve = system_supports_sve() && is_protected_kvm_enabled();
2319 	int cpu;
2320 
2321 	free_hyp_pgds();
2322 	for_each_possible_cpu(cpu) {
2323 		free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
2324 		free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order());
2325 
2326 		if (free_sve) {
2327 			struct cpu_sve_state *sve_state;
2328 
2329 			sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state;
2330 			free_pages((unsigned long) sve_state, pkvm_host_sve_state_order());
2331 		}
2332 	}
2333 }
2334 
2335 static int __init do_pkvm_init(u32 hyp_va_bits)
2336 {
2337 	void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base));
2338 	int ret;
2339 
2340 	preempt_disable();
2341 	cpu_hyp_init_context();
2342 	ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size,
2343 				num_possible_cpus(), kern_hyp_va(per_cpu_base),
2344 				hyp_va_bits);
2345 	cpu_hyp_init_features();
2346 
2347 	/*
2348 	 * The stub hypercalls are now disabled, so set our local flag to
2349 	 * prevent a later re-init attempt in kvm_arch_hardware_enable().
2350 	 */
2351 	__this_cpu_write(kvm_hyp_initialized, 1);
2352 	preempt_enable();
2353 
2354 	return ret;
2355 }
2356 
2357 static u64 get_hyp_id_aa64pfr0_el1(void)
2358 {
2359 	/*
2360 	 * Track whether the system isn't affected by spectre/meltdown in the
2361 	 * hypervisor's view of id_aa64pfr0_el1, used for protected VMs.
2362 	 * Although this is per-CPU, we make it global for simplicity, e.g., not
2363 	 * to have to worry about vcpu migration.
2364 	 *
2365 	 * Unlike for non-protected VMs, userspace cannot override this for
2366 	 * protected VMs.
2367 	 */
2368 	u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
2369 
2370 	val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) |
2371 		 ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3));
2372 
2373 	val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2),
2374 			  arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED);
2375 	val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3),
2376 			  arm64_get_meltdown_state() == SPECTRE_UNAFFECTED);
2377 
2378 	return val;
2379 }
2380 
2381 static void kvm_hyp_init_symbols(void)
2382 {
2383 	kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = get_hyp_id_aa64pfr0_el1();
2384 	kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
2385 	kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1);
2386 	kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1);
2387 	kvm_nvhe_sym(id_aa64isar2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);
2388 	kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
2389 	kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
2390 	kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);
2391 	kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1);
2392 	kvm_nvhe_sym(__icache_flags) = __icache_flags;
2393 	kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;
2394 }
2395 
2396 static int __init kvm_hyp_init_protection(u32 hyp_va_bits)
2397 {
2398 	void *addr = phys_to_virt(hyp_mem_base);
2399 	int ret;
2400 
2401 	ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP);
2402 	if (ret)
2403 		return ret;
2404 
2405 	ret = do_pkvm_init(hyp_va_bits);
2406 	if (ret)
2407 		return ret;
2408 
2409 	free_hyp_pgds();
2410 
2411 	return 0;
2412 }
2413 
2414 static int init_pkvm_host_sve_state(void)
2415 {
2416 	int cpu;
2417 
2418 	if (!system_supports_sve())
2419 		return 0;
2420 
2421 	/* Allocate pages for host sve state in protected mode. */
2422 	for_each_possible_cpu(cpu) {
2423 		struct page *page = alloc_pages(GFP_KERNEL, pkvm_host_sve_state_order());
2424 
2425 		if (!page)
2426 			return -ENOMEM;
2427 
2428 		per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = page_address(page);
2429 	}
2430 
2431 	/*
2432 	 * Don't map the pages in hyp since these are only used in protected
2433 	 * mode, which will (re)create its own mapping when initialized.
2434 	 */
2435 
2436 	return 0;
2437 }
2438 
2439 /*
2440  * Finalizes the initialization of hyp mode, once everything else is initialized
2441  * and the initialziation process cannot fail.
2442  */
2443 static void finalize_init_hyp_mode(void)
2444 {
2445 	int cpu;
2446 
2447 	if (system_supports_sve() && is_protected_kvm_enabled()) {
2448 		for_each_possible_cpu(cpu) {
2449 			struct cpu_sve_state *sve_state;
2450 
2451 			sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state;
2452 			per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state =
2453 				kern_hyp_va(sve_state);
2454 		}
2455 	} else {
2456 		for_each_possible_cpu(cpu) {
2457 			struct user_fpsimd_state *fpsimd_state;
2458 
2459 			fpsimd_state = &per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->host_ctxt.fp_regs;
2460 			per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->fpsimd_state =
2461 				kern_hyp_va(fpsimd_state);
2462 		}
2463 	}
2464 }
2465 
2466 static void pkvm_hyp_init_ptrauth(void)
2467 {
2468 	struct kvm_cpu_context *hyp_ctxt;
2469 	int cpu;
2470 
2471 	for_each_possible_cpu(cpu) {
2472 		hyp_ctxt = per_cpu_ptr_nvhe_sym(kvm_hyp_ctxt, cpu);
2473 		hyp_ctxt->sys_regs[APIAKEYLO_EL1] = get_random_long();
2474 		hyp_ctxt->sys_regs[APIAKEYHI_EL1] = get_random_long();
2475 		hyp_ctxt->sys_regs[APIBKEYLO_EL1] = get_random_long();
2476 		hyp_ctxt->sys_regs[APIBKEYHI_EL1] = get_random_long();
2477 		hyp_ctxt->sys_regs[APDAKEYLO_EL1] = get_random_long();
2478 		hyp_ctxt->sys_regs[APDAKEYHI_EL1] = get_random_long();
2479 		hyp_ctxt->sys_regs[APDBKEYLO_EL1] = get_random_long();
2480 		hyp_ctxt->sys_regs[APDBKEYHI_EL1] = get_random_long();
2481 		hyp_ctxt->sys_regs[APGAKEYLO_EL1] = get_random_long();
2482 		hyp_ctxt->sys_regs[APGAKEYHI_EL1] = get_random_long();
2483 	}
2484 }
2485 
2486 /* Inits Hyp-mode on all online CPUs */
2487 static int __init init_hyp_mode(void)
2488 {
2489 	u32 hyp_va_bits;
2490 	int cpu;
2491 	int err = -ENOMEM;
2492 
2493 	/*
2494 	 * The protected Hyp-mode cannot be initialized if the memory pool
2495 	 * allocation has failed.
2496 	 */
2497 	if (is_protected_kvm_enabled() && !hyp_mem_base)
2498 		goto out_err;
2499 
2500 	/*
2501 	 * Allocate Hyp PGD and setup Hyp identity mapping
2502 	 */
2503 	err = kvm_mmu_init(&hyp_va_bits);
2504 	if (err)
2505 		goto out_err;
2506 
2507 	/*
2508 	 * Allocate stack pages for Hypervisor-mode
2509 	 */
2510 	for_each_possible_cpu(cpu) {
2511 		unsigned long stack_page;
2512 
2513 		stack_page = __get_free_page(GFP_KERNEL);
2514 		if (!stack_page) {
2515 			err = -ENOMEM;
2516 			goto out_err;
2517 		}
2518 
2519 		per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
2520 	}
2521 
2522 	/*
2523 	 * Allocate and initialize pages for Hypervisor-mode percpu regions.
2524 	 */
2525 	for_each_possible_cpu(cpu) {
2526 		struct page *page;
2527 		void *page_addr;
2528 
2529 		page = alloc_pages(GFP_KERNEL, nvhe_percpu_order());
2530 		if (!page) {
2531 			err = -ENOMEM;
2532 			goto out_err;
2533 		}
2534 
2535 		page_addr = page_address(page);
2536 		memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size());
2537 		kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr;
2538 	}
2539 
2540 	/*
2541 	 * Map the Hyp-code called directly from the host
2542 	 */
2543 	err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
2544 				  kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
2545 	if (err) {
2546 		kvm_err("Cannot map world-switch code\n");
2547 		goto out_err;
2548 	}
2549 
2550 	err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start),
2551 				  kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO);
2552 	if (err) {
2553 		kvm_err("Cannot map .hyp.rodata section\n");
2554 		goto out_err;
2555 	}
2556 
2557 	err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
2558 				  kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
2559 	if (err) {
2560 		kvm_err("Cannot map rodata section\n");
2561 		goto out_err;
2562 	}
2563 
2564 	/*
2565 	 * .hyp.bss is guaranteed to be placed at the beginning of the .bss
2566 	 * section thanks to an assertion in the linker script. Map it RW and
2567 	 * the rest of .bss RO.
2568 	 */
2569 	err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_start),
2570 				  kvm_ksym_ref(__hyp_bss_end), PAGE_HYP);
2571 	if (err) {
2572 		kvm_err("Cannot map hyp bss section: %d\n", err);
2573 		goto out_err;
2574 	}
2575 
2576 	err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_end),
2577 				  kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
2578 	if (err) {
2579 		kvm_err("Cannot map bss section\n");
2580 		goto out_err;
2581 	}
2582 
2583 	/*
2584 	 * Map the Hyp stack pages
2585 	 */
2586 	for_each_possible_cpu(cpu) {
2587 		struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
2588 		char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
2589 
2590 		err = create_hyp_stack(__pa(stack_page), &params->stack_hyp_va);
2591 		if (err) {
2592 			kvm_err("Cannot map hyp stack\n");
2593 			goto out_err;
2594 		}
2595 
2596 		/*
2597 		 * Save the stack PA in nvhe_init_params. This will be needed
2598 		 * to recreate the stack mapping in protected nVHE mode.
2599 		 * __hyp_pa() won't do the right thing there, since the stack
2600 		 * has been mapped in the flexible private VA space.
2601 		 */
2602 		params->stack_pa = __pa(stack_page);
2603 	}
2604 
2605 	for_each_possible_cpu(cpu) {
2606 		char *percpu_begin = (char *)kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu];
2607 		char *percpu_end = percpu_begin + nvhe_percpu_size();
2608 
2609 		/* Map Hyp percpu pages */
2610 		err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
2611 		if (err) {
2612 			kvm_err("Cannot map hyp percpu region\n");
2613 			goto out_err;
2614 		}
2615 
2616 		/* Prepare the CPU initialization parameters */
2617 		cpu_prepare_hyp_mode(cpu, hyp_va_bits);
2618 	}
2619 
2620 	kvm_hyp_init_symbols();
2621 
2622 	if (is_protected_kvm_enabled()) {
2623 		if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) &&
2624 		    cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH))
2625 			pkvm_hyp_init_ptrauth();
2626 
2627 		init_cpu_logical_map();
2628 
2629 		if (!init_psci_relay()) {
2630 			err = -ENODEV;
2631 			goto out_err;
2632 		}
2633 
2634 		err = init_pkvm_host_sve_state();
2635 		if (err)
2636 			goto out_err;
2637 
2638 		err = kvm_hyp_init_protection(hyp_va_bits);
2639 		if (err) {
2640 			kvm_err("Failed to init hyp memory protection\n");
2641 			goto out_err;
2642 		}
2643 	}
2644 
2645 	return 0;
2646 
2647 out_err:
2648 	teardown_hyp_mode();
2649 	kvm_err("error initializing Hyp mode: %d\n", err);
2650 	return err;
2651 }
2652 
2653 struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
2654 {
2655 	struct kvm_vcpu *vcpu = NULL;
2656 	struct kvm_mpidr_data *data;
2657 	unsigned long i;
2658 
2659 	mpidr &= MPIDR_HWID_BITMASK;
2660 
2661 	rcu_read_lock();
2662 	data = rcu_dereference(kvm->arch.mpidr_data);
2663 
2664 	if (data) {
2665 		u16 idx = kvm_mpidr_index(data, mpidr);
2666 
2667 		vcpu = kvm_get_vcpu(kvm, data->cmpidr_to_idx[idx]);
2668 		if (mpidr != kvm_vcpu_get_mpidr_aff(vcpu))
2669 			vcpu = NULL;
2670 	}
2671 
2672 	rcu_read_unlock();
2673 
2674 	if (vcpu)
2675 		return vcpu;
2676 
2677 	kvm_for_each_vcpu(i, vcpu, kvm) {
2678 		if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
2679 			return vcpu;
2680 	}
2681 	return NULL;
2682 }
2683 
2684 bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
2685 {
2686 	return irqchip_in_kernel(kvm);
2687 }
2688 
/* Arch hook: IRQ bypass is unconditionally reported as available. */
bool kvm_arch_has_irq_bypass(void)
{
	return true;
}
2693 
2694 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
2695 				      struct irq_bypass_producer *prod)
2696 {
2697 	struct kvm_kernel_irqfd *irqfd =
2698 		container_of(cons, struct kvm_kernel_irqfd, consumer);
2699 
2700 	return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq,
2701 					  &irqfd->irq_entry);
2702 }
2703 void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
2704 				      struct irq_bypass_producer *prod)
2705 {
2706 	struct kvm_kernel_irqfd *irqfd =
2707 		container_of(cons, struct kvm_kernel_irqfd, consumer);
2708 
2709 	kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq,
2710 				     &irqfd->irq_entry);
2711 }
2712 
2713 void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
2714 {
2715 	struct kvm_kernel_irqfd *irqfd =
2716 		container_of(cons, struct kvm_kernel_irqfd, consumer);
2717 
2718 	kvm_arm_halt_guest(irqfd->kvm);
2719 }
2720 
2721 void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
2722 {
2723 	struct kvm_kernel_irqfd *irqfd =
2724 		container_of(cons, struct kvm_kernel_irqfd, consumer);
2725 
2726 	kvm_arm_resume_guest(irqfd->kvm);
2727 }
2728 
2729 /* Initialize Hyp-mode and memory mappings on all CPUs */
2730 static __init int kvm_arm_init(void)
2731 {
2732 	int err;
2733 	bool in_hyp_mode;
2734 
2735 	if (!is_hyp_mode_available()) {
2736 		kvm_info("HYP mode not available\n");
2737 		return -ENODEV;
2738 	}
2739 
2740 	if (kvm_get_mode() == KVM_MODE_NONE) {
2741 		kvm_info("KVM disabled from command line\n");
2742 		return -ENODEV;
2743 	}
2744 
2745 	err = kvm_sys_reg_table_init();
2746 	if (err) {
2747 		kvm_info("Error initializing system register tables");
2748 		return err;
2749 	}
2750 
2751 	in_hyp_mode = is_kernel_in_hyp_mode();
2752 
2753 	if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) ||
2754 	    cpus_have_final_cap(ARM64_WORKAROUND_1508412))
2755 		kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \
2756 			 "Only trusted guests should be used on this system.\n");
2757 
2758 	err = kvm_set_ipa_limit();
2759 	if (err)
2760 		return err;
2761 
2762 	err = kvm_arm_init_sve();
2763 	if (err)
2764 		return err;
2765 
2766 	err = kvm_arm_vmid_alloc_init();
2767 	if (err) {
2768 		kvm_err("Failed to initialize VMID allocator.\n");
2769 		return err;
2770 	}
2771 
2772 	if (!in_hyp_mode) {
2773 		err = init_hyp_mode();
2774 		if (err)
2775 			goto out_err;
2776 	}
2777 
2778 	err = kvm_init_vector_slots();
2779 	if (err) {
2780 		kvm_err("Cannot initialise vector slots\n");
2781 		goto out_hyp;
2782 	}
2783 
2784 	err = init_subsystems();
2785 	if (err)
2786 		goto out_hyp;
2787 
2788 	kvm_info("%s%sVHE mode initialized successfully\n",
2789 		 in_hyp_mode ? "" : (is_protected_kvm_enabled() ?
2790 				     "Protected " : "Hyp "),
2791 		 in_hyp_mode ? "" : (cpus_have_final_cap(ARM64_KVM_HVHE) ?
2792 				     "h" : "n"));
2793 
2794 	/*
2795 	 * FIXME: Do something reasonable if kvm_init() fails after pKVM
2796 	 * hypervisor protection is finalized.
2797 	 */
2798 	err = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE);
2799 	if (err)
2800 		goto out_subs;
2801 
2802 	/*
2803 	 * This should be called after initialization is done and failure isn't
2804 	 * possible anymore.
2805 	 */
2806 	if (!in_hyp_mode)
2807 		finalize_init_hyp_mode();
2808 
2809 	kvm_arm_initialised = true;
2810 
2811 	return 0;
2812 
2813 out_subs:
2814 	teardown_subsystems();
2815 out_hyp:
2816 	if (!in_hyp_mode)
2817 		teardown_hyp_mode();
2818 out_err:
2819 	kvm_arm_vmid_alloc_free();
2820 	return err;
2821 }
2822 
2823 static int __init early_kvm_mode_cfg(char *arg)
2824 {
2825 	if (!arg)
2826 		return -EINVAL;
2827 
2828 	if (strcmp(arg, "none") == 0) {
2829 		kvm_mode = KVM_MODE_NONE;
2830 		return 0;
2831 	}
2832 
2833 	if (!is_hyp_mode_available()) {
2834 		pr_warn_once("KVM is not available. Ignoring kvm-arm.mode\n");
2835 		return 0;
2836 	}
2837 
2838 	if (strcmp(arg, "protected") == 0) {
2839 		if (!is_kernel_in_hyp_mode())
2840 			kvm_mode = KVM_MODE_PROTECTED;
2841 		else
2842 			pr_warn_once("Protected KVM not available with VHE\n");
2843 
2844 		return 0;
2845 	}
2846 
2847 	if (strcmp(arg, "nvhe") == 0 && !WARN_ON(is_kernel_in_hyp_mode())) {
2848 		kvm_mode = KVM_MODE_DEFAULT;
2849 		return 0;
2850 	}
2851 
2852 	if (strcmp(arg, "nested") == 0 && !WARN_ON(!is_kernel_in_hyp_mode())) {
2853 		kvm_mode = KVM_MODE_NV;
2854 		return 0;
2855 	}
2856 
2857 	return -EINVAL;
2858 }
2859 early_param("kvm-arm.mode", early_kvm_mode_cfg);
2860 
2861 enum kvm_mode kvm_get_mode(void)
2862 {
2863 	return kvm_mode;
2864 }
2865 
/* kvm_arm_init() is the module's init entry point. */
module_init(kvm_arm_init);
2867