xref: /linux/arch/arm64/kvm/hyp/nvhe/pkvm.c (revision 03313efed5e2ca55e862bf514b907a431ebf642a)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2021 Google LLC
4  * Author: Fuad Tabba <tabba@google.com>
5  */
6 
7 #include <kvm/arm_hypercalls.h>
8 
9 #include <linux/kvm_host.h>
10 #include <linux/mm.h>
11 
12 #include <asm/kvm_emulate.h>
13 
14 #include <nvhe/mem_protect.h>
15 #include <nvhe/memory.h>
16 #include <nvhe/pkvm.h>
17 #include <nvhe/trap_handler.h>
18 
/* Consumed by icache_is_aliasing(). */
unsigned long __icache_flags;

/* Consumed by kvm_get_vttbr(). */
unsigned int kvm_arm_vmid_bits;

/*
 * Upper bound applied to the SVE vector length of a guest vCPU; see its use
 * in pkvm_vcpu_init_sve().
 */
unsigned int kvm_host_sve_max_vl;
26 
27 /*
28  * The currently loaded hyp vCPU for each physical CPU. Used in protected mode
29  * for both protected and non-protected VMs.
30  */
31 static DEFINE_PER_CPU(struct pkvm_hyp_vcpu *, loaded_hyp_vcpu);
32 
33 static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu)
34 {
35 	vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
36 
37 	if (has_hvhe())
38 		vcpu->arch.hcr_el2 |= HCR_E2H;
39 
40 	if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) {
41 		/* route synchronous external abort exceptions to EL2 */
42 		vcpu->arch.hcr_el2 |= HCR_TEA;
43 		/* trap error record accesses */
44 		vcpu->arch.hcr_el2 |= HCR_TERR;
45 	}
46 
47 	if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
48 		vcpu->arch.hcr_el2 |= HCR_FWB;
49 
50 	if (cpus_have_final_cap(ARM64_HAS_EVT) &&
51 	    !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) &&
52 	    kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0) == read_cpuid(CTR_EL0))
53 		vcpu->arch.hcr_el2 |= HCR_TID4;
54 	else
55 		vcpu->arch.hcr_el2 |= HCR_TID2;
56 
57 	if (vcpu_has_ptrauth(vcpu))
58 		vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK);
59 
60 	if (kvm_has_mte(vcpu->kvm))
61 		vcpu->arch.hcr_el2 |= HCR_ATA;
62 }
63 
64 static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu)
65 {
66 	struct kvm *kvm = vcpu->kvm;
67 	u64 val = vcpu->arch.hcr_el2;
68 
69 	/* No support for AArch32. */
70 	val |= HCR_RW;
71 
72 	/*
73 	 * Always trap:
74 	 * - Feature id registers: to control features exposed to guests
75 	 * - Implementation-defined features
76 	 */
77 	val |= HCR_TACR | HCR_TIDCP | HCR_TID3 | HCR_TID1;
78 
79 	if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) {
80 		val |= HCR_TERR | HCR_TEA;
81 		val &= ~(HCR_FIEN);
82 	}
83 
84 	if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP))
85 		val &= ~(HCR_AMVOFFEN);
86 
87 	if (!kvm_has_mte(kvm)) {
88 		val |= HCR_TID5;
89 		val &= ~(HCR_DCT | HCR_ATA);
90 	}
91 
92 	if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP))
93 		val |= HCR_TLOR;
94 
95 	vcpu->arch.hcr_el2 = val;
96 }
97 
98 static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu)
99 {
100 	struct kvm *kvm = vcpu->kvm;
101 	u64 val = vcpu->arch.mdcr_el2;
102 
103 	if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP)) {
104 		val |= MDCR_EL2_TPM | MDCR_EL2_TPMCR;
105 		val &= ~(MDCR_EL2_HPME | MDCR_EL2_MTPME | MDCR_EL2_HPMN_MASK);
106 	}
107 
108 	if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DebugVer, IMP))
109 		val |= MDCR_EL2_TDRA | MDCR_EL2_TDA;
110 
111 	if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DoubleLock, IMP))
112 		val |= MDCR_EL2_TDOSA;
113 
114 	if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP)) {
115 		val |= MDCR_EL2_TPMS;
116 		val &= ~MDCR_EL2_E2PB_MASK;
117 	}
118 
119 	if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP))
120 		val |= MDCR_EL2_TTRF;
121 
122 	if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, IMP))
123 		val &= ~MDCR_EL2_E2TB_MASK;
124 
125 	/* Trap Debug Communications Channel registers */
126 	if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP))
127 		val |= MDCR_EL2_TDCC;
128 
129 	vcpu->arch.mdcr_el2 = val;
130 }
131 
132 /*
133  * Check that cpu features that are neither trapped nor supported are not
134  * enabled for protected VMs.
135  */
136 static int pkvm_check_pvm_cpu_features(struct kvm_vcpu *vcpu)
137 {
138 	struct kvm *kvm = vcpu->kvm;
139 
140 	/* No AArch32 support for protected guests. */
141 	if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL0, AARCH32) ||
142 	    kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL1, AARCH32))
143 		return -EINVAL;
144 
145 	/*
146 	 * Linux guests assume support for floating-point and Advanced SIMD. Do
147 	 * not change the trapping behavior for these from the KVM default.
148 	 */
149 	if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, FP, IMP) ||
150 	    !kvm_has_feat(kvm, ID_AA64PFR0_EL1, AdvSIMD, IMP))
151 		return -EINVAL;
152 
153 	/* No SME support in KVM right now. Check to catch if it changes. */
154 	if (kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP))
155 		return -EINVAL;
156 
157 	return 0;
158 }
159 
160 /*
161  * Initialize trap register values in protected mode.
162  */
163 static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu)
164 {
165 	struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu;
166 	int ret;
167 
168 	vcpu->arch.mdcr_el2 = 0;
169 
170 	pkvm_vcpu_reset_hcr(vcpu);
171 
172 	if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) {
173 		struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
174 
175 		/* Trust the host for non-protected vcpu features. */
176 		vcpu->arch.hcrx_el2 = host_vcpu->arch.hcrx_el2;
177 		return 0;
178 	}
179 
180 	ret = pkvm_check_pvm_cpu_features(vcpu);
181 	if (ret)
182 		return ret;
183 
184 	pvm_init_traps_hcr(vcpu);
185 	pvm_init_traps_mdcr(vcpu);
186 	vcpu_set_hcrx(vcpu);
187 
188 	return 0;
189 }
190 
191 /*
192  * Start the VM table handle at the offset defined instead of at 0.
193  * Mainly for sanity checking and debugging.
194  */
195 #define HANDLE_OFFSET 0x1000
196 
197 /*
198  * Marks a reserved but not yet used entry in the VM table.
199  */
200 #define RESERVED_ENTRY ((void *)0xa110ca7ed)
201 
202 static unsigned int vm_handle_to_idx(pkvm_handle_t handle)
203 {
204 	return handle - HANDLE_OFFSET;
205 }
206 
207 static pkvm_handle_t idx_to_vm_handle(unsigned int idx)
208 {
209 	return idx + HANDLE_OFFSET;
210 }
211 
/*
 * Spinlock guarding the state tied to the VM table: writes to 'vm_table'
 * and 'nr_table_entries', other per-vm state on initialization, and both
 * reads and writes of 'last_hyp_vcpu_lookup'.
 *
 * NOTE(review): 'nr_table_entries' and 'last_hyp_vcpu_lookup' are not
 * defined in this file; confirm whether this description is still current.
 */
DEFINE_HYP_SPINLOCK(vm_table_lock);

/*
 * Table tracking every VM in protected mode. Its backing memory is
 * allocated during hyp initialization and setup, and handed over through
 * pkvm_hyp_vm_table_init().
 */
static struct pkvm_hyp_vm **vm_table;
224 
225 void pkvm_hyp_vm_table_init(void *tbl)
226 {
227 	BUILD_BUG_ON((u64)HANDLE_OFFSET + KVM_MAX_PVMS > (pkvm_handle_t)-1);
228 	WARN_ON(vm_table);
229 	vm_table = tbl;
230 }
231 
232 /*
233  * Return the hyp vm structure corresponding to the handle.
234  */
235 struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle)
236 {
237 	unsigned int idx = vm_handle_to_idx(handle);
238 
239 	hyp_assert_lock_held(&vm_table_lock);
240 
241 	if (unlikely(idx >= KVM_MAX_PVMS))
242 		return NULL;
243 
244 	/* A reserved entry doesn't represent an initialized VM. */
245 	if (unlikely(vm_table[idx] == RESERVED_ENTRY))
246 		return NULL;
247 
248 	return vm_table[idx];
249 }
250 
251 struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
252 					 unsigned int vcpu_idx)
253 {
254 	struct pkvm_hyp_vcpu *hyp_vcpu = NULL;
255 	struct pkvm_hyp_vm *hyp_vm;
256 
257 	/* Cannot load a new vcpu without putting the old one first. */
258 	if (__this_cpu_read(loaded_hyp_vcpu))
259 		return NULL;
260 
261 	hyp_spin_lock(&vm_table_lock);
262 	hyp_vm = get_vm_by_handle(handle);
263 	if (!hyp_vm || hyp_vm->kvm.arch.pkvm.is_dying)
264 		goto unlock;
265 
266 	if (hyp_vm->kvm.created_vcpus <= vcpu_idx)
267 		goto unlock;
268 
269 	hyp_vcpu = hyp_vm->vcpus[vcpu_idx];
270 	if (!hyp_vcpu)
271 		goto unlock;
272 
273 	/* Ensure vcpu isn't loaded on more than one cpu simultaneously. */
274 	if (unlikely(hyp_vcpu->loaded_hyp_vcpu)) {
275 		hyp_vcpu = NULL;
276 		goto unlock;
277 	}
278 
279 	hyp_vcpu->loaded_hyp_vcpu = this_cpu_ptr(&loaded_hyp_vcpu);
280 	hyp_page_ref_inc(hyp_virt_to_page(hyp_vm));
281 unlock:
282 	hyp_spin_unlock(&vm_table_lock);
283 
284 	if (hyp_vcpu)
285 		__this_cpu_write(loaded_hyp_vcpu, hyp_vcpu);
286 	return hyp_vcpu;
287 }
288 
289 void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
290 {
291 	struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu);
292 
293 	hyp_spin_lock(&vm_table_lock);
294 	hyp_vcpu->loaded_hyp_vcpu = NULL;
295 	__this_cpu_write(loaded_hyp_vcpu, NULL);
296 	hyp_page_ref_dec(hyp_virt_to_page(hyp_vm));
297 	hyp_spin_unlock(&vm_table_lock);
298 }
299 
300 struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void)
301 {
302 	return __this_cpu_read(loaded_hyp_vcpu);
303 
304 }
305 
306 struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle)
307 {
308 	struct pkvm_hyp_vm *hyp_vm;
309 
310 	hyp_spin_lock(&vm_table_lock);
311 	hyp_vm = get_vm_by_handle(handle);
312 	if (!hyp_vm)
313 		goto unlock;
314 
315 	if (hyp_vm->kvm.arch.pkvm.is_dying)
316 		hyp_vm = NULL;
317 	else
318 		hyp_page_ref_inc(hyp_virt_to_page(hyp_vm));
319 unlock:
320 	hyp_spin_unlock(&vm_table_lock);
321 
322 	return hyp_vm;
323 }
324 
325 void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm)
326 {
327 	hyp_spin_lock(&vm_table_lock);
328 	hyp_page_ref_dec(hyp_virt_to_page(hyp_vm));
329 	hyp_spin_unlock(&vm_table_lock);
330 }
331 
332 struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle)
333 {
334 	struct pkvm_hyp_vm *hyp_vm = get_pkvm_hyp_vm(handle);
335 
336 	if (hyp_vm && pkvm_hyp_vm_is_protected(hyp_vm)) {
337 		put_pkvm_hyp_vm(hyp_vm);
338 		hyp_vm = NULL;
339 	}
340 
341 	return hyp_vm;
342 }
343 
344 static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struct kvm *host_kvm)
345 {
346 	struct kvm *kvm = &hyp_vm->kvm;
347 	unsigned long host_arch_flags = READ_ONCE(host_kvm->arch.flags);
348 	DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES);
349 
350 	/* CTR_EL0 is always under host control, even for protected VMs. */
351 	hyp_vm->kvm.arch.ctr_el0 = host_kvm->arch.ctr_el0;
352 
353 	/* Preserve the vgic model so that GICv3 emulation works */
354 	hyp_vm->kvm.arch.vgic.vgic_model = host_kvm->arch.vgic.vgic_model;
355 
356 	/* No restrictions for non-protected VMs. */
357 	if (!kvm_vm_is_protected(kvm)) {
358 		hyp_vm->kvm.arch.flags = host_arch_flags;
359 		hyp_vm->kvm.arch.flags &= ~BIT_ULL(KVM_ARCH_FLAG_ID_REGS_INITIALIZED);
360 
361 		bitmap_copy(kvm->arch.vcpu_features,
362 			    host_kvm->arch.vcpu_features,
363 			    KVM_VCPU_MAX_FEATURES);
364 
365 		if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &host_arch_flags))
366 			hyp_vm->kvm.arch.midr_el1 = host_kvm->arch.midr_el1;
367 
368 		return;
369 	}
370 
371 	if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_MTE))
372 		kvm->arch.flags |= host_arch_flags & BIT(KVM_ARCH_FLAG_MTE_ENABLED);
373 
374 	bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES);
375 
376 	set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features);
377 
378 	if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_PMU_V3))
379 		set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features);
380 
381 	if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_PTRAUTH_ADDRESS))
382 		set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features);
383 
384 	if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_PTRAUTH_GENERIC))
385 		set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features);
386 
387 	if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_SVE)) {
388 		set_bit(KVM_ARM_VCPU_SVE, allowed_features);
389 		kvm->arch.flags |= host_arch_flags & BIT(KVM_ARCH_FLAG_GUEST_HAS_SVE);
390 	}
391 
392 	bitmap_and(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features,
393 		   allowed_features, KVM_VCPU_MAX_FEATURES);
394 }
395 
396 static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu)
397 {
398 	if (host_vcpu)
399 		hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1);
400 }
401 
402 static void unpin_host_sve_state(struct pkvm_hyp_vcpu *hyp_vcpu)
403 {
404 	void *sve_state;
405 
406 	if (!vcpu_has_feature(&hyp_vcpu->vcpu, KVM_ARM_VCPU_SVE))
407 		return;
408 
409 	sve_state = hyp_vcpu->vcpu.arch.sve_state;
410 	hyp_unpin_shared_mem(sve_state,
411 			     sve_state + vcpu_sve_state_size(&hyp_vcpu->vcpu));
412 }
413 
414 static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[],
415 			     unsigned int nr_vcpus)
416 {
417 	int i;
418 
419 	for (i = 0; i < nr_vcpus; i++) {
420 		struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vcpus[i];
421 
422 		if (!hyp_vcpu)
423 			continue;
424 
425 		unpin_host_vcpu(hyp_vcpu->host_vcpu);
426 		unpin_host_sve_state(hyp_vcpu);
427 	}
428 }
429 
430 static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm,
431 			     unsigned int nr_vcpus, pkvm_handle_t handle)
432 {
433 	struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu;
434 	int idx = vm_handle_to_idx(handle);
435 
436 	hyp_vm->kvm.arch.pkvm.handle = handle;
437 
438 	hyp_vm->host_kvm = host_kvm;
439 	hyp_vm->kvm.created_vcpus = nr_vcpus;
440 	hyp_vm->kvm.arch.pkvm.is_protected = READ_ONCE(host_kvm->arch.pkvm.is_protected);
441 	hyp_vm->kvm.arch.pkvm.is_created = true;
442 	hyp_vm->kvm.arch.flags = 0;
443 	pkvm_init_features_from_host(hyp_vm, host_kvm);
444 
445 	/* VMID 0 is reserved for the host */
446 	atomic64_set(&mmu->vmid.id, idx + 1);
447 
448 	mmu->vtcr = host_mmu.arch.mmu.vtcr;
449 	mmu->arch = &hyp_vm->kvm.arch;
450 	mmu->pgt = &hyp_vm->pgt;
451 }
452 
453 static int pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu)
454 {
455 	struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu;
456 	unsigned int sve_max_vl;
457 	size_t sve_state_size;
458 	void *sve_state;
459 	int ret = 0;
460 
461 	if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) {
462 		vcpu_clear_flag(vcpu, VCPU_SVE_FINALIZED);
463 		return 0;
464 	}
465 
466 	/* Limit guest vector length to the maximum supported by the host. */
467 	sve_max_vl = min(READ_ONCE(host_vcpu->arch.sve_max_vl), kvm_host_sve_max_vl);
468 	sve_state_size = sve_state_size_from_vl(sve_max_vl);
469 	sve_state = kern_hyp_va(READ_ONCE(host_vcpu->arch.sve_state));
470 
471 	if (!sve_state || !sve_state_size) {
472 		ret = -EINVAL;
473 		goto err;
474 	}
475 
476 	ret = hyp_pin_shared_mem(sve_state, sve_state + sve_state_size);
477 	if (ret)
478 		goto err;
479 
480 	vcpu->arch.sve_state = sve_state;
481 	vcpu->arch.sve_max_vl = sve_max_vl;
482 
483 	return 0;
484 err:
485 	clear_bit(KVM_ARM_VCPU_SVE, vcpu->kvm->arch.vcpu_features);
486 	return ret;
487 }
488 
489 static int vm_copy_id_regs(struct pkvm_hyp_vcpu *hyp_vcpu)
490 {
491 	struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu);
492 	const struct kvm *host_kvm = hyp_vm->host_kvm;
493 	struct kvm *kvm = &hyp_vm->kvm;
494 
495 	if (!test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &host_kvm->arch.flags))
496 		return -EINVAL;
497 
498 	if (test_and_set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags))
499 		return 0;
500 
501 	memcpy(kvm->arch.id_regs, host_kvm->arch.id_regs, sizeof(kvm->arch.id_regs));
502 
503 	return 0;
504 }
505 
506 static int pkvm_vcpu_init_sysregs(struct pkvm_hyp_vcpu *hyp_vcpu)
507 {
508 	int ret = 0;
509 
510 	if (pkvm_hyp_vcpu_is_protected(hyp_vcpu))
511 		kvm_init_pvm_id_regs(&hyp_vcpu->vcpu);
512 	else
513 		ret = vm_copy_id_regs(hyp_vcpu);
514 
515 	return ret;
516 }
517 
518 static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu,
519 			      struct pkvm_hyp_vm *hyp_vm,
520 			      struct kvm_vcpu *host_vcpu)
521 {
522 	int ret = 0;
523 
524 	if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1))
525 		return -EBUSY;
526 
527 	hyp_vcpu->host_vcpu = host_vcpu;
528 
529 	hyp_vcpu->vcpu.kvm = &hyp_vm->kvm;
530 	hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id);
531 	hyp_vcpu->vcpu.vcpu_idx = READ_ONCE(host_vcpu->vcpu_idx);
532 
533 	hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu;
534 	hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags);
535 	hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED;
536 
537 	ret = pkvm_vcpu_init_sysregs(hyp_vcpu);
538 	if (ret)
539 		goto done;
540 
541 	ret = pkvm_vcpu_init_traps(hyp_vcpu);
542 	if (ret)
543 		goto done;
544 
545 	ret = pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu);
546 done:
547 	if (ret)
548 		unpin_host_vcpu(host_vcpu);
549 	return ret;
550 }
551 
552 static int find_free_vm_table_entry(void)
553 {
554 	int i;
555 
556 	for (i = 0; i < KVM_MAX_PVMS; ++i) {
557 		if (!vm_table[i])
558 			return i;
559 	}
560 
561 	return -ENOMEM;
562 }
563 
564 /*
565  * Reserve a VM table entry.
566  *
567  * Return a unique handle to the VM on success,
568  * negative error code on failure.
569  */
570 static int allocate_vm_table_entry(void)
571 {
572 	int idx;
573 
574 	hyp_assert_lock_held(&vm_table_lock);
575 
576 	/*
577 	 * Initializing protected state might have failed, yet a malicious
578 	 * host could trigger this function. Thus, ensure that 'vm_table'
579 	 * exists.
580 	 */
581 	if (unlikely(!vm_table))
582 		return -EINVAL;
583 
584 	idx = find_free_vm_table_entry();
585 	if (unlikely(idx < 0))
586 		return idx;
587 
588 	vm_table[idx] = RESERVED_ENTRY;
589 
590 	return idx;
591 }
592 
593 static int __insert_vm_table_entry(pkvm_handle_t handle,
594 				   struct pkvm_hyp_vm *hyp_vm)
595 {
596 	unsigned int idx;
597 
598 	hyp_assert_lock_held(&vm_table_lock);
599 
600 	/*
601 	 * Initializing protected state might have failed, yet a malicious
602 	 * host could trigger this function. Thus, ensure that 'vm_table'
603 	 * exists.
604 	 */
605 	if (unlikely(!vm_table))
606 		return -EINVAL;
607 
608 	idx = vm_handle_to_idx(handle);
609 	if (unlikely(idx >= KVM_MAX_PVMS))
610 		return -EINVAL;
611 
612 	if (unlikely(vm_table[idx] != RESERVED_ENTRY))
613 		return -EINVAL;
614 
615 	vm_table[idx] = hyp_vm;
616 
617 	return 0;
618 }
619 
620 /*
621  * Insert a pointer to the initialized VM into the VM table.
622  *
623  * Return 0 on success, or negative error code on failure.
624  */
625 static int insert_vm_table_entry(pkvm_handle_t handle,
626 				 struct pkvm_hyp_vm *hyp_vm)
627 {
628 	int ret;
629 
630 	hyp_spin_lock(&vm_table_lock);
631 	ret = __insert_vm_table_entry(handle, hyp_vm);
632 	hyp_spin_unlock(&vm_table_lock);
633 
634 	return ret;
635 }
636 
637 /*
638  * Deallocate and remove the VM table entry corresponding to the handle.
639  */
640 static void remove_vm_table_entry(pkvm_handle_t handle)
641 {
642 	hyp_assert_lock_held(&vm_table_lock);
643 	vm_table[vm_handle_to_idx(handle)] = NULL;
644 }
645 
646 static size_t pkvm_get_hyp_vm_size(unsigned int nr_vcpus)
647 {
648 	return size_add(sizeof(struct pkvm_hyp_vm),
649 		size_mul(sizeof(struct pkvm_hyp_vcpu *), nr_vcpus));
650 }
651 
652 static void *map_donated_memory_noclear(unsigned long host_va, size_t size)
653 {
654 	void *va = (void *)kern_hyp_va(host_va);
655 
656 	if (!PAGE_ALIGNED(va))
657 		return NULL;
658 
659 	if (__pkvm_host_donate_hyp(hyp_virt_to_pfn(va),
660 				   PAGE_ALIGN(size) >> PAGE_SHIFT))
661 		return NULL;
662 
663 	return va;
664 }
665 
666 static void *map_donated_memory(unsigned long host_va, size_t size)
667 {
668 	void *va = map_donated_memory_noclear(host_va, size);
669 
670 	if (va)
671 		memset(va, 0, size);
672 
673 	return va;
674 }
675 
676 static void __unmap_donated_memory(void *va, size_t size)
677 {
678 	kvm_flush_dcache_to_poc(va, size);
679 	WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(va),
680 				       PAGE_ALIGN(size) >> PAGE_SHIFT));
681 }
682 
683 static void unmap_donated_memory(void *va, size_t size)
684 {
685 	if (!va)
686 		return;
687 
688 	memset(va, 0, size);
689 	__unmap_donated_memory(va, size);
690 }
691 
692 static void unmap_donated_memory_noclear(void *va, size_t size)
693 {
694 	if (!va)
695 		return;
696 
697 	__unmap_donated_memory(va, size);
698 }
699 
700 /*
701  * Reserves an entry in the hypervisor for a new VM in protected mode.
702  *
703  * Return a unique handle to the VM on success, negative error code on failure.
704  */
705 int __pkvm_reserve_vm(void)
706 {
707 	int ret;
708 
709 	hyp_spin_lock(&vm_table_lock);
710 	ret = allocate_vm_table_entry();
711 	hyp_spin_unlock(&vm_table_lock);
712 
713 	if (ret < 0)
714 		return ret;
715 
716 	return idx_to_vm_handle(ret);
717 }
718 
719 /*
720  * Removes a reserved entry, but only if is hasn't been used yet.
721  * Otherwise, the VM needs to be destroyed.
722  */
723 void __pkvm_unreserve_vm(pkvm_handle_t handle)
724 {
725 	unsigned int idx = vm_handle_to_idx(handle);
726 
727 	if (unlikely(!vm_table))
728 		return;
729 
730 	hyp_spin_lock(&vm_table_lock);
731 	if (likely(idx < KVM_MAX_PVMS && vm_table[idx] == RESERVED_ENTRY))
732 		remove_vm_table_entry(handle);
733 	hyp_spin_unlock(&vm_table_lock);
734 }
735 
736 /*
737  * Initialize the hypervisor copy of the VM state using host-donated memory.
738  *
739  * Unmap the donated memory from the host at stage 2.
740  *
741  * host_kvm: A pointer to the host's struct kvm.
742  * vm_hva: The host va of the area being donated for the VM state.
743  *	   Must be page aligned.
744  * pgd_hva: The host va of the area being donated for the stage-2 PGD for
745  *	    the VM. Must be page aligned. Its size is implied by the VM's
746  *	    VTCR.
747  *
748  * Return 0 success, negative error code on failure.
749  */
750 int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
751 		   unsigned long pgd_hva)
752 {
753 	struct pkvm_hyp_vm *hyp_vm = NULL;
754 	size_t vm_size, pgd_size;
755 	unsigned int nr_vcpus;
756 	pkvm_handle_t handle;
757 	void *pgd = NULL;
758 	int ret;
759 
760 	ret = hyp_pin_shared_mem(host_kvm, host_kvm + 1);
761 	if (ret)
762 		return ret;
763 
764 	nr_vcpus = READ_ONCE(host_kvm->created_vcpus);
765 	if (nr_vcpus < 1) {
766 		ret = -EINVAL;
767 		goto err_unpin_kvm;
768 	}
769 
770 	handle = READ_ONCE(host_kvm->arch.pkvm.handle);
771 	if (unlikely(handle < HANDLE_OFFSET)) {
772 		ret = -EINVAL;
773 		goto err_unpin_kvm;
774 	}
775 
776 	vm_size = pkvm_get_hyp_vm_size(nr_vcpus);
777 	pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.mmu.vtcr);
778 
779 	ret = -ENOMEM;
780 
781 	hyp_vm = map_donated_memory(vm_hva, vm_size);
782 	if (!hyp_vm)
783 		goto err_remove_mappings;
784 
785 	pgd = map_donated_memory_noclear(pgd_hva, pgd_size);
786 	if (!pgd)
787 		goto err_remove_mappings;
788 
789 	init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus, handle);
790 
791 	ret = kvm_guest_prepare_stage2(hyp_vm, pgd);
792 	if (ret)
793 		goto err_remove_mappings;
794 
795 	/* Must be called last since this publishes the VM. */
796 	ret = insert_vm_table_entry(handle, hyp_vm);
797 	if (ret)
798 		goto err_remove_mappings;
799 
800 	return 0;
801 
802 err_remove_mappings:
803 	unmap_donated_memory(hyp_vm, vm_size);
804 	unmap_donated_memory(pgd, pgd_size);
805 err_unpin_kvm:
806 	hyp_unpin_shared_mem(host_kvm, host_kvm + 1);
807 	return ret;
808 }
809 
810 /*
811  * Initialize the hypervisor copy of the vCPU state using host-donated memory.
812  *
813  * handle: The hypervisor handle for the vm.
814  * host_vcpu: A pointer to the corresponding host vcpu.
815  * vcpu_hva: The host va of the area being donated for the vcpu state.
816  *	     Must be page aligned. The size of the area must be equal to
817  *	     the page-aligned size of 'struct pkvm_hyp_vcpu'.
818  * Return 0 on success, negative error code on failure.
819  */
820 int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
821 		     unsigned long vcpu_hva)
822 {
823 	struct pkvm_hyp_vcpu *hyp_vcpu;
824 	struct pkvm_hyp_vm *hyp_vm;
825 	unsigned int idx;
826 	int ret;
827 
828 	hyp_vcpu = map_donated_memory(vcpu_hva, sizeof(*hyp_vcpu));
829 	if (!hyp_vcpu)
830 		return -ENOMEM;
831 
832 	hyp_spin_lock(&vm_table_lock);
833 
834 	hyp_vm = get_vm_by_handle(handle);
835 	if (!hyp_vm) {
836 		ret = -ENOENT;
837 		goto unlock;
838 	}
839 
840 	ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu);
841 	if (ret)
842 		goto unlock;
843 
844 	idx = hyp_vcpu->vcpu.vcpu_idx;
845 	if (idx >= hyp_vm->kvm.created_vcpus) {
846 		ret = -EINVAL;
847 		goto unlock;
848 	}
849 
850 	if (hyp_vm->vcpus[idx]) {
851 		ret = -EINVAL;
852 		goto unlock;
853 	}
854 
855 	hyp_vm->vcpus[idx] = hyp_vcpu;
856 unlock:
857 	hyp_spin_unlock(&vm_table_lock);
858 
859 	if (ret)
860 		unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu));
861 	return ret;
862 }
863 
864 static void
865 teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size)
866 {
867 	size = PAGE_ALIGN(size);
868 	memset(addr, 0, size);
869 
870 	for (void *start = addr; start < addr + size; start += PAGE_SIZE)
871 		push_hyp_memcache(mc, start, hyp_virt_to_phys);
872 
873 	unmap_donated_memory_noclear(addr, size);
874 }
875 
876 int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 gfn)
877 {
878 	struct pkvm_hyp_vm *hyp_vm;
879 	int ret = -EINVAL;
880 
881 	hyp_spin_lock(&vm_table_lock);
882 	hyp_vm = get_vm_by_handle(handle);
883 	if (hyp_vm && hyp_vm->kvm.arch.pkvm.is_dying)
884 		ret = __pkvm_host_reclaim_page_guest(gfn, hyp_vm);
885 	hyp_spin_unlock(&vm_table_lock);
886 
887 	return ret;
888 }
889 
890 int __pkvm_start_teardown_vm(pkvm_handle_t handle)
891 {
892 	struct pkvm_hyp_vm *hyp_vm;
893 	int ret = 0;
894 
895 	hyp_spin_lock(&vm_table_lock);
896 	hyp_vm = get_vm_by_handle(handle);
897 	if (!hyp_vm) {
898 		ret = -ENOENT;
899 		goto unlock;
900 	} else if (WARN_ON(hyp_page_count(hyp_vm))) {
901 		ret = -EBUSY;
902 		goto unlock;
903 	} else if (hyp_vm->kvm.arch.pkvm.is_dying) {
904 		ret = -EINVAL;
905 		goto unlock;
906 	}
907 
908 	hyp_vm->kvm.arch.pkvm.is_dying = true;
909 unlock:
910 	hyp_spin_unlock(&vm_table_lock);
911 
912 	return ret;
913 }
914 
915 int __pkvm_finalize_teardown_vm(pkvm_handle_t handle)
916 {
917 	struct kvm_hyp_memcache *mc, *stage2_mc;
918 	struct pkvm_hyp_vm *hyp_vm;
919 	struct kvm *host_kvm;
920 	unsigned int idx;
921 	size_t vm_size;
922 	int err;
923 
924 	hyp_spin_lock(&vm_table_lock);
925 	hyp_vm = get_vm_by_handle(handle);
926 	if (!hyp_vm) {
927 		err = -ENOENT;
928 		goto err_unlock;
929 	} else if (!hyp_vm->kvm.arch.pkvm.is_dying) {
930 		err = -EBUSY;
931 		goto err_unlock;
932 	}
933 
934 	host_kvm = hyp_vm->host_kvm;
935 
936 	/* Ensure the VMID is clean before it can be reallocated */
937 	__kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu);
938 	remove_vm_table_entry(handle);
939 	hyp_spin_unlock(&vm_table_lock);
940 
941 	/* Reclaim guest pages (including page-table pages) */
942 	mc = &host_kvm->arch.pkvm.teardown_mc;
943 	stage2_mc = &host_kvm->arch.pkvm.stage2_teardown_mc;
944 	reclaim_pgtable_pages(hyp_vm, stage2_mc);
945 	unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->kvm.created_vcpus);
946 
947 	/* Push the metadata pages to the teardown memcache */
948 	for (idx = 0; idx < hyp_vm->kvm.created_vcpus; ++idx) {
949 		struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx];
950 		struct kvm_hyp_memcache *vcpu_mc;
951 
952 		if (!hyp_vcpu)
953 			continue;
954 
955 		vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache;
956 
957 		while (vcpu_mc->nr_pages) {
958 			void *addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt);
959 
960 			push_hyp_memcache(stage2_mc, addr, hyp_virt_to_phys);
961 			unmap_donated_memory_noclear(addr, PAGE_SIZE);
962 		}
963 
964 		teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu));
965 	}
966 
967 	vm_size = pkvm_get_hyp_vm_size(hyp_vm->kvm.created_vcpus);
968 	teardown_donated_memory(mc, hyp_vm, vm_size);
969 	hyp_unpin_shared_mem(host_kvm, host_kvm + 1);
970 	return 0;
971 
972 err_unlock:
973 	hyp_spin_unlock(&vm_table_lock);
974 	return err;
975 }
976 
977 static u64 __pkvm_memshare_page_req(struct kvm_vcpu *vcpu, u64 ipa)
978 {
979 	u64 elr;
980 
981 	/* Fake up a data abort (level 3 translation fault on write) */
982 	vcpu->arch.fault.esr_el2 = (ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT) |
983 				   ESR_ELx_WNR | ESR_ELx_FSC_FAULT |
984 				   FIELD_PREP(ESR_ELx_FSC_LEVEL, 3);
985 
986 	/* Shuffle the IPA around into the HPFAR */
987 	vcpu->arch.fault.hpfar_el2 = (HPFAR_EL2_NS | (ipa >> 8)) & HPFAR_MASK;
988 
989 	/* This is a virtual address. 0's good. Let's go with 0. */
990 	vcpu->arch.fault.far_el2 = 0;
991 
992 	/* Rewind the ELR so we return to the HVC once the IPA is mapped */
993 	elr = read_sysreg(elr_el2);
994 	elr -= 4;
995 	write_sysreg(elr, elr_el2);
996 
997 	return ARM_EXCEPTION_TRAP;
998 }
999 
1000 static bool pkvm_memshare_call(u64 *ret, struct kvm_vcpu *vcpu, u64 *exit_code)
1001 {
1002 	struct pkvm_hyp_vcpu *hyp_vcpu;
1003 	u64 ipa = smccc_get_arg1(vcpu);
1004 
1005 	if (!PAGE_ALIGNED(ipa))
1006 		goto out_guest;
1007 
1008 	hyp_vcpu = container_of(vcpu, struct pkvm_hyp_vcpu, vcpu);
1009 	switch (__pkvm_guest_share_host(hyp_vcpu, hyp_phys_to_pfn(ipa))) {
1010 	case 0:
1011 		ret[0] = SMCCC_RET_SUCCESS;
1012 		goto out_guest;
1013 	case -ENOENT:
1014 		/*
1015 		 * Convert the exception into a data abort so that the page
1016 		 * being shared is mapped into the guest next time.
1017 		 */
1018 		*exit_code = __pkvm_memshare_page_req(vcpu, ipa);
1019 		goto out_host;
1020 	}
1021 
1022 out_guest:
1023 	return true;
1024 out_host:
1025 	return false;
1026 }
1027 
1028 /*
1029  * Handler for protected VM HVC calls.
1030  *
1031  * Returns true if the hypervisor has handled the exit (and control
1032  * should return to the guest) or false if it hasn't (and the handling
1033  * should be performed by the host).
1034  */
1035 bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code)
1036 {
1037 	u64 val[4] = { SMCCC_RET_INVALID_PARAMETER };
1038 	bool handled = true;
1039 
1040 	switch (smccc_get_function(vcpu)) {
1041 	case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
1042 		val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
1043 		val[0] |= BIT(ARM_SMCCC_KVM_FUNC_HYP_MEMINFO);
1044 		val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_SHARE);
1045 		break;
1046 	case ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID:
1047 		if (smccc_get_arg1(vcpu) ||
1048 		    smccc_get_arg2(vcpu) ||
1049 		    smccc_get_arg3(vcpu)) {
1050 			break;
1051 		}
1052 
1053 		val[0] = PAGE_SIZE;
1054 		break;
1055 	case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID:
1056 		if (smccc_get_arg2(vcpu) ||
1057 		    smccc_get_arg3(vcpu)) {
1058 			break;
1059 		}
1060 
1061 		handled = pkvm_memshare_call(val, vcpu, exit_code);
1062 		break;
1063 	default:
1064 		/* Punt everything else back to the host, for now. */
1065 		handled = false;
1066 	}
1067 
1068 	if (handled)
1069 		smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]);
1070 	return handled;
1071 }
1072