xref: /linux/arch/arm64/kvm/pkvm.c (revision 72c181399b01bb4836d1fabaa9f5f6438c82178e)
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2020 - Google LLC
 * Author: Quentin Perret <qperret@google.com>
 */

#include <linux/init.h>
#include <linux/interval_tree_generic.h>
#include <linux/kmemleak.h>
#include <linux/kvm_host.h>
#include <asm/kvm_mmu.h>
#include <linux/memblock.h>
#include <linux/mutex.h>

#include <asm/kvm_pkvm.h>

#include "hyp_constants.h"

DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);

static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory);
static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr);

phys_addr_t hyp_mem_base;
phys_addr_t hyp_mem_size;

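/*
 * Snapshot the host's memblock regions into the hyp_memory array shared
 * with EL2, failing if there are more regions than HYP_MEMBLOCK_REGIONS.
 */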
static int __init register_memblock_regions(void)
{
	struct memblock_region *reg;

	for_each_mem_region(reg) {
		if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS)
			return -ENOMEM;

		hyp_memory[*hyp_memblock_nr_ptr] = *reg;
		(*hyp_memblock_nr_ptr)++;
	}

	return 0;
}

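/*
 * Carve out the memory the hypervisor will need once protected mode is
 * finalized: EL2 stage-1 tables, host stage-2 tables, the VM table, the hyp
 * vmemmap, selftest pages and the FF-A proxy pages. Only relevant when
 * booting the kernel at EL1 (nVHE) with kvm-arm.mode=protected.
 */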
void __init kvm_hyp_reserve(void)
{
	u64 hyp_mem_pages = 0;
	int ret;

	if (!is_hyp_mode_available() || is_kernel_in_hyp_mode())
		return;

	if (kvm_get_mode() != KVM_MODE_PROTECTED)
		return;

	ret = register_memblock_regions();
	if (ret) {
		*hyp_memblock_nr_ptr = 0;
		kvm_err("Failed to register hyp memblocks: %d\n", ret);
		return;
	}

	hyp_mem_pages += hyp_s1_pgtable_pages();
	hyp_mem_pages += host_s2_pgtable_pages();
	hyp_mem_pages += hyp_vm_table_pages();
	hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE);
	hyp_mem_pages += pkvm_selftest_pages();
	hyp_mem_pages += hyp_ffa_proxy_pages();

	/*
	 * Try to allocate a PMD-aligned region to reduce TLB pressure once
	 * this is unmapped from the host stage-2, and fall back to PAGE_SIZE.
	 */
	hyp_mem_size = hyp_mem_pages << PAGE_SHIFT;
	hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE),
					   PMD_SIZE);
	if (!hyp_mem_base)
		hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE);
	else
		hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE);

	if (!hyp_mem_base) {
		kvm_err("Failed to reserve hyp memory\n");
		return;
	}

	kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20,
		 hyp_mem_base);
}

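/*
 * Tear down the EL2 side of the VM, or simply unreserve the handle if hyp
 * initialization never completed, then free the teardown memcaches. Runs
 * under kvm->arch.config_lock (see pkvm_destroy_hyp_vm()).
 */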
static void __pkvm_destroy_hyp_vm(struct kvm *kvm)
{
	if (pkvm_hyp_vm_is_created(kvm)) {
		WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm,
					  kvm->arch.pkvm.handle));
	} else if (kvm->arch.pkvm.handle) {
		/*
		 * The VM may have been reserved, but hyp initialization
		 * failed. Make sure to unreserve it.
		 */
		kvm_call_hyp_nvhe(__pkvm_unreserve_vm, kvm->arch.pkvm.handle);
	}

	kvm->arch.pkvm.handle = 0;
	kvm->arch.pkvm.is_created = false;
	free_hyp_memcache(&kvm->arch.pkvm.teardown_mc);
	free_hyp_memcache(&kvm->arch.pkvm.stage2_teardown_mc);
}

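/*
 * Allocate a page-aligned area for the hyp view of this vCPU and donate it
 * to EL2 via __pkvm_init_vcpu(). On success the vCPU is marked finalized;
 * on failure the pages are returned to the kernel.
 */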
static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
{
	size_t hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE);
	pkvm_handle_t handle = vcpu->kvm->arch.pkvm.handle;
	void *hyp_vcpu;
	int ret;

	vcpu->arch.pkvm_memcache.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2;

	hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT);
	if (!hyp_vcpu)
		return -ENOMEM;

	ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, vcpu, hyp_vcpu);
	if (!ret)
		vcpu_set_flag(vcpu, VCPU_PKVM_FINALIZED);
	else
		free_pages_exact(hyp_vcpu, hyp_vcpu_sz);

	return ret;
}

/*
 * Allocates and donates memory for hypervisor VM structs at EL2.
 *
 * Allocates space for the VM state, which includes the hyp vm as well as
 * the hyp vcpus.
 *
 * Stores an opaque handle in the kvm struct for future reference.
 *
 * Return 0 on success, negative error code on failure.
 */
static int __pkvm_create_hyp_vm(struct kvm *kvm)
{
	size_t pgd_sz, hyp_vm_sz;
	void *pgd, *hyp_vm;
	int ret;

	if (kvm->created_vcpus < 1)
		return -EINVAL;

	pgd_sz = kvm_pgtable_stage2_pgd_size(kvm->arch.mmu.vtcr);

	/*
	 * The PGD pages will be reclaimed using a hyp_memcache which implies
	 * page granularity. So, use alloc_pages_exact() to get individual
	 * refcounts.
	 */
	pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT);
	if (!pgd)
		return -ENOMEM;

	/* Allocate memory to donate to hyp for vm and vcpu pointers. */
	hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE,
					size_mul(sizeof(void *),
						 kvm->created_vcpus)));
	hyp_vm = alloc_pages_exact(hyp_vm_sz, GFP_KERNEL_ACCOUNT);
	if (!hyp_vm) {
		ret = -ENOMEM;
		goto free_pgd;
	}

	/* Donate the VM memory to hyp and let hyp initialize it. */
	ret = kvm_call_hyp_nvhe(__pkvm_init_vm, kvm, hyp_vm, pgd);
	if (ret)
		goto free_vm;

	kvm->arch.pkvm.is_created = true;
	kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2;
	kvm_account_pgtable_pages(pgd, pgd_sz / PAGE_SIZE);

	return 0;
free_vm:
	free_pages_exact(hyp_vm, hyp_vm_sz);
free_pgd:
	free_pages_exact(pgd, pgd_sz);
	return ret;
}

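/* Has the EL2 counterpart of this VM been fully initialized? */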
bool pkvm_hyp_vm_is_created(struct kvm *kvm)
{
	return READ_ONCE(kvm->arch.pkvm.is_created);
}

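/*
 * Instantiate the hyp VM on first use; the VM's config_lock serializes this
 * against concurrent callers, and subsequent calls are a no-op.
 */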
int pkvm_create_hyp_vm(struct kvm *kvm)
{
	int ret = 0;

	mutex_lock(&kvm->arch.config_lock);
	if (!pkvm_hyp_vm_is_created(kvm))
		ret = __pkvm_create_hyp_vm(kvm);
	mutex_unlock(&kvm->arch.config_lock);

	return ret;
}

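/*
 * Likewise for vCPUs: create the hyp vCPU on first use under config_lock,
 * skipping vCPUs that have already been finalized.
 */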
int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
{
	int ret = 0;

	mutex_lock(&vcpu->kvm->arch.config_lock);
	if (!vcpu_get_flag(vcpu, VCPU_PKVM_FINALIZED))
		ret = __pkvm_create_hyp_vcpu(vcpu);
	mutex_unlock(&vcpu->kvm->arch.config_lock);

	return ret;
}

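/* Locked wrapper around __pkvm_destroy_hyp_vm(). */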
void pkvm_destroy_hyp_vm(struct kvm *kvm)
{
	mutex_lock(&kvm->arch.config_lock);
	__pkvm_destroy_hyp_vm(kvm);
	mutex_unlock(&kvm->arch.config_lock);
}

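/*
 * Reserve a handle for this VM at EL2 before any hyp state is created. The
 * hypercall returns a non-negative handle on success, which is stashed in
 * kvm->arch.pkvm.handle for use by later hypercalls.
 */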
int pkvm_init_host_vm(struct kvm *kvm)
{
	int ret;

	if (pkvm_hyp_vm_is_created(kvm))
		return -EINVAL;

	/* VM is already reserved, no need to proceed. */
	if (kvm->arch.pkvm.handle)
		return 0;

	/* Reserve the VM in hyp and obtain a hyp handle for the VM. */
	ret = kvm_call_hyp_nvhe(__pkvm_reserve_vm);
	if (ret < 0)
		return ret;

	kvm->arch.pkvm.handle = ret;

	return 0;
}

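/*
 * Per-CPU callback for the finalization below: ask EL2 to install the host
 * stage-2 on this CPU and report any failure through the shared error
 * pointer.
 */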
static void __init _kvm_host_prot_finalize(void *arg)
{
	int *err = arg;

	if (WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize)))
		WRITE_ONCE(*err, -EINVAL);
}

static int __init pkvm_drop_host_privileges(void)
{
	int ret = 0;

	/*
	 * Flip the static key upfront as that may no longer be possible
	 * once the host stage 2 is installed.
	 */
	static_branch_enable(&kvm_protected_mode_initialized);
	on_each_cpu(_kvm_host_prot_finalize, &ret, 1);
	return ret;
}

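/*
 * Run at device_initcall_sync time: hide the hyp sections and the reserved
 * carveout from kmemleak before they become inaccessible to the host, then
 * drop the host's privileges on every CPU.
 */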
static int __init finalize_pkvm(void)
{
	int ret;

	if (!is_protected_kvm_enabled() || !is_kvm_arm_initialised())
		return 0;

	/*
	 * Exclude HYP sections from kmemleak so that they don't get peeked
	 * at, which would end badly once inaccessible.
	 */
	kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
	kmemleak_free_part(__hyp_data_start, __hyp_data_end - __hyp_data_start);
	kmemleak_free_part(__hyp_rodata_start, __hyp_rodata_end - __hyp_rodata_start);
	kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size);

	ret = pkvm_drop_host_privileges();
	if (ret)
		pr_err("Failed to finalize Hyp protection: %d\n", ret);

	return ret;
}
device_initcall_sync(finalize_pkvm);

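/*
 * In protected mode the guest stage-2 page-tables are owned by EL2, so the
 * host instead tracks what it has shared with a guest as pkvm_mapping nodes
 * in an interval tree keyed by guest IPA. The two helpers below provide the
 * interval bounds for INTERVAL_TREE_DEFINE().
 */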
static u64 __pkvm_mapping_start(struct pkvm_mapping *m)
{
	return m->gfn * PAGE_SIZE;
}

static u64 __pkvm_mapping_end(struct pkvm_mapping *m)
{
	return (m->gfn + m->nr_pages) * PAGE_SIZE - 1;
}

INTERVAL_TREE_DEFINE(struct pkvm_mapping, node, u64, __subtree_last,
		     __pkvm_mapping_start, __pkvm_mapping_end, static,
		     pkvm_mapping);

/*
 * __tmp is advanced to the next mapping *before* entering the body of the
 * loop, which allows __map to be removed and freed inline.
 */
#define for_each_mapping_in_range_safe(__pgt, __start, __end, __map)				\
	for (struct pkvm_mapping *__tmp = pkvm_mapping_iter_first(&(__pgt)->pkvm_mappings,	\
								  __start, __end - 1);		\
	     __tmp && ({									\
				__map = __tmp;							\
				__tmp = pkvm_mapping_iter_next(__map, __start, __end - 1);	\
				true;								\
		       });									\
	    )

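/*
 * Host-side stage-2 "init" for a protected guest: no page-tables to set up
 * here, only the empty mapping tree and the MMU back-pointer.
 */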
int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
			     struct kvm_pgtable_mm_ops *mm_ops)
{
	pgt->pkvm_mappings	= RB_ROOT_CACHED;
	pgt->mmu		= mmu;

	return 0;
}

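/*
 * Unshare every mapping overlapping [start, end) from the guest and drop
 * the corresponding tree nodes. A zero handle means the VM was never
 * reserved at EL2, so there is nothing to undo.
 */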
static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 end)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
	pkvm_handle_t handle = kvm->arch.pkvm.handle;
	struct pkvm_mapping *mapping;
	int ret;

	if (!handle)
		return 0;

	for_each_mapping_in_range_safe(pgt, start, end, mapping) {
		ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn,
					mapping->nr_pages);
		if (WARN_ON(ret))
			return ret;
		pkvm_mapping_remove(mapping, &pgt->pkvm_mappings);
		kfree(mapping);
	}

	return 0;
}

void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
{
	__pkvm_pgtable_stage2_unmap(pgt, 0, ~(0ULL));
}

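/*
 * Share a PAGE_SIZE or PMD_SIZE chunk of memory with the guest. Any
 * pre-existing overlapping mapping of a different size is torn down first,
 * and the new pkvm_mapping node is taken from the memcache passed in via
 * @mc (note the swap() with cache->mapping).
 */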
int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
			    u64 phys, enum kvm_pgtable_prot prot,
			    void *mc, enum kvm_pgtable_walk_flags flags)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
	struct pkvm_mapping *mapping = NULL;
	struct kvm_hyp_memcache *cache = mc;
	u64 gfn = addr >> PAGE_SHIFT;
	u64 pfn = phys >> PAGE_SHIFT;
	int ret;

	if (size != PAGE_SIZE && size != PMD_SIZE)
		return -EINVAL;

	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * Calling stage2_map() on top of existing mappings is either happening because of a race
	 * with another vCPU, or because we're changing between page and block mappings. As per
	 * user_mem_abort(), same-size permission faults are handled in the relax_perms() path.
	 */
	mapping = pkvm_mapping_iter_first(&pgt->pkvm_mappings, addr, addr + size - 1);
	if (mapping) {
		if (size == (mapping->nr_pages * PAGE_SIZE))
			return -EAGAIN;

		/* Remove _any_ pkvm_mapping overlapping with the range, bigger or smaller. */
		ret = __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
		if (ret)
			return ret;
		mapping = NULL;
	}

	ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, size / PAGE_SIZE, prot);
	if (WARN_ON(ret))
		return ret;

	swap(mapping, cache->mapping);
	mapping->gfn = gfn;
	mapping->pfn = pfn;
	mapping->nr_pages = size / PAGE_SIZE;
	pkvm_mapping_insert(mapping, &pgt->pkvm_mappings);

	return ret;
}

int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(pgt->mmu)->mmu_lock);

	return __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
}

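/* Write-protect every tracked mapping in the range, one hypercall per mapping. */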
int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
	pkvm_handle_t handle = kvm->arch.pkvm.handle;
	struct pkvm_mapping *mapping;
	int ret = 0;

	lockdep_assert_held(&kvm->mmu_lock);
	for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) {
		ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn,
					mapping->nr_pages);
		if (WARN_ON(ret))
			break;
	}

	return ret;
}

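/*
 * CMOs are done on the host's linear-map alias of the shared pages, hence
 * the pfn_to_kaddr() rather than a hypercall.
 */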
int pkvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
	struct pkvm_mapping *mapping;

	lockdep_assert_held(&kvm->mmu_lock);
	for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping)
		__clean_dcache_guest_page(pfn_to_kaddr(mapping->pfn),
					  PAGE_SIZE * mapping->nr_pages);

	return 0;
}

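/* Test (and optionally clear) the access flag of each tracked range at EL2. */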
bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
	pkvm_handle_t handle = kvm->arch.pkvm.handle;
	struct pkvm_mapping *mapping;
	bool young = false;

	lockdep_assert_held(&kvm->mmu_lock);
	for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping)
		young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn,
					   mapping->nr_pages, mkold);

	return young;
}

int pkvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot,
				    enum kvm_pgtable_walk_flags flags)
{
	return kvm_call_hyp_nvhe(__pkvm_host_relax_perms_guest, addr >> PAGE_SHIFT, prot);
}

void pkvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr,
				 enum kvm_pgtable_walk_flags flags)
{
	WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_mkyoung_guest, addr >> PAGE_SHIFT));
}

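/*
 * The remaining kvm_pgtable stage-2 helpers have no pKVM implementation;
 * reaching any of them is a bug, hence the WARN_ON_ONCE().
 */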
void pkvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
{
	WARN_ON_ONCE(1);
}

kvm_pte_t *pkvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level,
					enum kvm_pgtable_prot prot, void *mc, bool force_pte)
{
	WARN_ON_ONCE(1);
	return NULL;
}

int pkvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
			      struct kvm_mmu_memory_cache *mc)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}