xref: /freebsd/sys/riscv/vmm/vmm.c (revision c76c2a19ae3763d17aa6a60a5831ed24cbc16e83)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2015 Mihai Carabas <mihai.carabas@gmail.com>
 * Copyright (c) 2024 Ruslan Bukin <br@bsdpad.com>
 *
 * This software was developed by the University of Cambridge Computer
 * Laboratory (Department of Computer Science and Technology) under Innovate
 * UK project 105694, "Digital Security by Design (DSbD) Technology Platform
 * Prototype".
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/riscvreg.h>
#include <machine/cpu.h>
#include <machine/fpe.h>
#include <machine/machdep.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <machine/vm.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>

#include <dev/pci/pcireg.h>

#include <dev/vmm/vmm_dev.h>
#include <dev/vmm/vmm_ktr.h>
#include <dev/vmm/vmm_mem.h>

#include "vmm_stat.h"
#include "riscv.h"

#include "vmm_aplic.h"

struct vcpu {
	int		flags;
	enum vcpu_state	state;
	struct mtx	mtx;
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	int		vcpuid;
	void		*stats;
	struct vm_exit	exitinfo;
	uint64_t	nextpc;		/* (x) next instruction to execute */
	struct vm	*vm;		/* (o) */
	void		*cookie;	/* (i) cpu-specific data */
	struct fpreg	*guestfpu;	/* (a,i) guest fpu state */
};

#define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock_destroy(v)	mtx_destroy(&((v)->mtx))
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)

struct vmm_mmio_region {
	uint64_t start;
	uint64_t end;
	mem_region_read_t read;
	mem_region_write_t write;
};
#define	VM_MAX_MMIO_REGIONS	4

/*
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;		/* (i) cpu-specific data */
	volatile cpuset_t active_cpus;		/* (i) active vcpus */
	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for debug */
	int		suspend;		/* (i) stop VM execution */
	bool		dying;			/* (o) is dying */
	volatile cpuset_t suspended_cpus;	/* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
	struct vmspace	*vmspace;		/* (o) guest's address space */
	struct vm_mem	mem;			/* (i) [m+v] guest memory */
	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
	struct vcpu	**vcpu;			/* (i) guest vcpus */
	struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS];
						/* (o) guest MMIO regions */
	/* The following describe the vm cpu topology */
	uint16_t	sockets;		/* (o) num of sockets */
	uint16_t	cores;			/* (o) num of cores/socket */
	uint16_t	threads;		/* (o) num of threads/core */
	uint16_t	maxcpus;		/* (o) max pluggable cpus */
	struct sx	vcpus_init_lock;	/* (o) */
};

static bool vmm_initialized = false;

static MALLOC_DEFINE(M_VMM, "vmm", "vmm");

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");

u_int vm_maxcpu;
SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &vm_maxcpu, 0, "Maximum number of vCPUs");

static void vcpu_notify_event_locked(struct vcpu *vcpu);

/* global statistics */
VMM_STAT(VMEXIT_COUNT, "total number of vm exits");
VMM_STAT(VMEXIT_IRQ, "number of vmexits for an irq");
VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception");

/*
 * Upper limit on vm_maxcpu. We could increase this to 28 bits, but this
 * is a safe value for now.
 */
#define	VM_MAXCPU	MIN(0xffff - 1, CPU_SETSIZE)

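/*
 * Tear down a vCPU's CPU-specific state; when 'destroy' is set the vCPU is
 * going away for good, so also release its stats, FPU save area and lock.
 */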
static void
vcpu_cleanup(struct vcpu *vcpu, bool destroy)
{
	vmmops_vcpu_cleanup(vcpu->cookie);
	vcpu->cookie = NULL;
	if (destroy) {
		vmm_stat_free(vcpu->stats);
		fpu_save_area_free(vcpu->guestfpu);
		vcpu_lock_destroy(vcpu);
	}
}

static struct vcpu *
vcpu_alloc(struct vm *vm, int vcpu_id)
{
	struct vcpu *vcpu;

	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
	    ("vcpu_alloc: invalid vcpu %d", vcpu_id));

	vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO);
	vcpu_lock_init(vcpu);
	vcpu->state = VCPU_IDLE;
	vcpu->hostcpu = NOCPU;
	vcpu->vcpuid = vcpu_id;
	vcpu->vm = vm;
	vcpu->guestfpu = fpu_save_area_alloc();
	vcpu->stats = vmm_stat_alloc();
	return (vcpu);
}

static void
vcpu_init(struct vcpu *vcpu)
{
	vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
	MPASS(vcpu->cookie != NULL);
	fpu_save_area_reset(vcpu->guestfpu);
	vmm_stat_init(vcpu->stats);
}

struct vm_exit *
vm_exitinfo(struct vcpu *vcpu)
{
	return (&vcpu->exitinfo);
}

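/*
 * Module-load initialization: size vm_maxcpu from the host CPU count,
 * honour the hw.vmm.maxcpu tunable, clamp to VM_MAXCPU, then hand off to
 * the machine-dependent backend.
 */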
static int
vmm_init(void)
{

	vm_maxcpu = mp_ncpus;

	TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);

	if (vm_maxcpu > VM_MAXCPU) {
		printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
		vm_maxcpu = VM_MAXCPU;
	}

	if (vm_maxcpu == 0)
		vm_maxcpu = 1;

	return (vmmops_modinit());
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		error = vmmdev_init();
		if (error != 0)
			break;
		error = vmm_init();
		if (error == 0)
			vmm_initialized = true;
		else
			(void)vmmdev_cleanup();
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0 && vmm_initialized) {
			error = vmmops_modcleanup();
			if (error) {
				/*
				 * Something bad happened - prevent new
				 * VMs from being created
				 */
				vmm_initialized = false;
			}
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - vmm device initialization requires an initialized devfs.
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_DEVFS + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

static void
vm_init(struct vm *vm, bool create)
{
	int i;

	vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace));
	MPASS(vm->cookie != NULL);

	CPU_ZERO(&vm->active_cpus);
	CPU_ZERO(&vm->debug_cpus);

	vm->suspend = 0;
	CPU_ZERO(&vm->suspended_cpus);

	memset(vm->mmio_region, 0, sizeof(vm->mmio_region));

	if (!create) {
		for (i = 0; i < vm->maxcpus; i++) {
			if (vm->vcpu[i] != NULL)
				vcpu_init(vm->vcpu[i]);
		}
	}
}

void
vm_disable_vcpu_creation(struct vm *vm)
{
	sx_xlock(&vm->vcpus_init_lock);
	vm->dying = true;
	sx_xunlock(&vm->vcpus_init_lock);
}

struct vcpu *
vm_alloc_vcpu(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
		return (NULL);

	/* Some interrupt controllers may have a CPU limit */
	if (vcpuid >= aplic_max_cpu_count(vm->cookie))
		return (NULL);

	vcpu = (struct vcpu *)
	    atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]);
	if (__predict_true(vcpu != NULL))
		return (vcpu);

	sx_xlock(&vm->vcpus_init_lock);
	vcpu = vm->vcpu[vcpuid];
	if (vcpu == NULL && !vm->dying) {
		vcpu = vcpu_alloc(vm, vcpuid);
		vcpu_init(vcpu);

		/*
		 * Ensure vCPU is fully created before updating pointer
		 * to permit unlocked reads above.
		 */
		atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
		    (uintptr_t)vcpu);
	}
	sx_xunlock(&vm->vcpus_init_lock);
	return (vcpu);
}

void
vm_slock_vcpus(struct vm *vm)
{
	sx_slock(&vm->vcpus_init_lock);
}

void
vm_unlock_vcpus(struct vm *vm)
{
	sx_unlock(&vm->vcpus_init_lock);
}

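/*
 * Create a new virtual machine with a 2^39-byte guest physical address
 * space and a default 1-socket topology.  Individual vCPUs are allocated
 * lazily in vm_alloc_vcpu().
 */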
int
vm_create(const char *name, struct vm **retvm)
{
	struct vm *vm;
	struct vmspace *vmspace;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vmspace = vmmops_vmspace_alloc(0, 1ul << 39);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	vm->vmspace = vmspace;
	vm_mem_init(&vm->mem);
	sx_init(&vm->vcpus_init_lock, "vm vcpus");

	vm->sockets = 1;
	vm->cores = 1;			/* XXX backwards compatibility */
	vm->threads = 1;		/* XXX backwards compatibility */
	vm->maxcpus = vm_maxcpu;

	vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM,
	    M_WAITOK | M_ZERO);

	vm_init(vm, true);

	*retvm = vm;
	return (0);
}

void
vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
    uint16_t *threads, uint16_t *maxcpus)
{
	*sockets = vm->sockets;
	*cores = vm->cores;
	*threads = vm->threads;
	*maxcpus = vm->maxcpus;
}

uint16_t
vm_get_maxcpus(struct vm *vm)
{
	return (vm->maxcpus);
}

int
vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
    uint16_t threads, uint16_t maxcpus)
{
	/* Ignore maxcpus. */
	if ((sockets * cores * threads) > vm->maxcpus)
		return (EINVAL);
	vm->sockets = sockets;
	vm->cores = cores;
	vm->threads = threads;
	return (0);
}

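/*
 * Common teardown for vm_reinit() and vm_destroy(): detach the APLIC,
 * clean up each vCPU and the backend state, and free the memory and locks
 * only when 'destroy' is set.
 */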
static void
vm_cleanup(struct vm *vm, bool destroy)
{
	int i;

	if (destroy)
		vm_xlock_memsegs(vm);
	else
		vm_assert_memseg_xlocked(vm);

	aplic_detach_from_vm(vm->cookie);

	for (i = 0; i < vm->maxcpus; i++) {
		if (vm->vcpu[i] != NULL)
			vcpu_cleanup(vm->vcpu[i], destroy);
	}

	vmmops_cleanup(vm->cookie);

	vm_mem_cleanup(vm);
	if (destroy) {
		vm_mem_destroy(vm);

		vmmops_vmspace_free(vm->vmspace);
		vm->vmspace = NULL;

		for (i = 0; i < vm->maxcpus; i++)
			free(vm->vcpu[i], M_VMM);
		free(vm->vcpu, M_VMM);
		sx_destroy(&vm->vcpus_init_lock);
	}
}

void
vm_destroy(struct vm *vm)
{

	vm_cleanup(vm, true);

	free(vm, M_VMM);
}

int
vm_reinit(struct vm *vm)
{
	int error;

	/*
	 * A virtual machine can be reset only if all vcpus are suspended.
	 */
	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
		vm_cleanup(vm, false);
		vm_init(vm, false);
		error = 0;
	} else {
		error = EBUSY;
	}

	return (error);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *is_fault)
{
	return (vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault));
}

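/*
 * Claim a free slot in the fixed-size MMIO emulation table; a slot is
 * considered free when both its start and end addresses are zero.
 */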
void
vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size,
    mem_region_read_t mmio_read, mem_region_write_t mmio_write)
{
	int i;

	for (i = 0; i < nitems(vm->mmio_region); i++) {
		if (vm->mmio_region[i].start == 0 &&
		    vm->mmio_region[i].end == 0) {
			vm->mmio_region[i].start = start;
			vm->mmio_region[i].end = start + size;
			vm->mmio_region[i].read = mmio_read;
			vm->mmio_region[i].write = mmio_write;
			return;
		}
	}

	panic("%s: No free MMIO region", __func__);
}

void
vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size)
{
	int i;

	for (i = 0; i < nitems(vm->mmio_region); i++) {
		if (vm->mmio_region[i].start == start &&
		    vm->mmio_region[i].end == start + size) {
			memset(&vm->mmio_region[i], 0,
			    sizeof(vm->mmio_region[i]));
			return;
		}
	}

	panic("%s: Invalid MMIO region: %lx - %lx", __func__, start,
	    start + size);
}

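/*
 * Emulate a trapped MMIO access: find the registered region covering the
 * faulting IPA and dispatch to its read/write handlers, punting to
 * userspace when the APLIC is not attached or no region matches.
 */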
static int
vm_handle_inst_emul(struct vcpu *vcpu, bool *retu)
{
	struct vm *vm;
	struct vm_exit *vme;
	struct vie *vie;
	struct hyp *hyp;
	uint64_t fault_ipa;
	struct vm_guest_paging *paging;
	struct vmm_mmio_region *vmr;
	int error, i;

	vm = vcpu->vm;
	hyp = vm->cookie;
	if (!hyp->aplic_attached)
		goto out_user;

	vme = &vcpu->exitinfo;
	vie = &vme->u.inst_emul.vie;
	paging = &vme->u.inst_emul.paging;

	fault_ipa = vme->u.inst_emul.gpa;

	vmr = NULL;
	for (i = 0; i < nitems(vm->mmio_region); i++) {
		if (vm->mmio_region[i].start <= fault_ipa &&
		    vm->mmio_region[i].end > fault_ipa) {
			vmr = &vm->mmio_region[i];
			break;
		}
	}
	if (vmr == NULL)
		goto out_user;

	error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging,
	    vmr->read, vmr->write, retu);
	return (error);

out_user:
	*retu = true;
	return (0);
}

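/*
 * Initiate an orderly VM suspend.  Only the first caller records 'how';
 * every active vCPU is then notified so it observes the pending suspend.
 */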
int
vm_suspend(struct vm *vm, enum vm_suspend_how how)
{
	int i;

	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
		return (EINVAL);

	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
		VM_CTR2(vm, "virtual machine already suspended %d/%d",
		    vm->suspend, how);
		return (EALREADY);
	}

	VM_CTR1(vm, "virtual machine successfully suspended %d", how);

	/*
	 * Notify all active vcpus that they are now suspended.
	 */
	for (i = 0; i < vm->maxcpus; i++) {
		if (CPU_ISSET(i, &vm->active_cpus))
			vcpu_notify_event(vm_vcpu(vm, i));
	}

	return (0);
}

void
vm_exit_suspended(struct vcpu *vcpu, uint64_t pc)
{
	struct vm *vm = vcpu->vm;
	struct vm_exit *vmexit;

	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));

	vmexit = vm_exitinfo(vcpu);
	vmexit->pc = pc;
	vmexit->inst_length = 4;
	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
	vmexit->u.suspended.how = vm->suspend;
}

void
vm_exit_debug(struct vcpu *vcpu, uint64_t pc)
{
	struct vm_exit *vmexit;

	vmexit = vm_exitinfo(vcpu);
	vmexit->pc = pc;
	vmexit->inst_length = 4;
	vmexit->exitcode = VM_EXITCODE_DEBUG;
}

int
vm_activate_cpu(struct vcpu *vcpu)
{
	struct vm *vm = vcpu->vm;

	if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
		return (EBUSY);

	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus);
	return (0);
}

int
vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu)
{
	if (vcpu == NULL) {
		vm->debug_cpus = vm->active_cpus;
		for (int i = 0; i < vm->maxcpus; i++) {
			if (CPU_ISSET(i, &vm->active_cpus))
				vcpu_notify_event(vm_vcpu(vm, i));
		}
	} else {
		if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
			return (EINVAL);

		CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
		vcpu_notify_event(vcpu);
	}
	return (0);
}

int
vm_resume_cpu(struct vm *vm, struct vcpu *vcpu)
{

	if (vcpu == NULL) {
		CPU_ZERO(&vm->debug_cpus);
	} else {
		if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus))
			return (EINVAL);

		CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
	}
	return (0);
}

int
vcpu_debugged(struct vcpu *vcpu)
{

	return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus));
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

cpuset_t
vm_debug_cpus(struct vm *vm)
{

	return (vm->debug_cpus);
}

cpuset_t
vm_suspended_cpus(struct vm *vm)
{

	return (vm->suspended_cpus);
}

void *
vcpu_stats(struct vcpu *vcpu)
{

	return (vcpu->stats);
}

/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be directed
 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
 */
static void
vcpu_notify_event_locked(struct vcpu *vcpu)
{
	int hostcpu;

	hostcpu = vcpu->hostcpu;
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
		if (hostcpu != curcpu) {
			ipi_cpu(hostcpu, vmm_ipinum);
		} else {
			/*
			 * If the 'vcpu' is running on 'curcpu' then it must
			 * be sending a notification to itself (e.g. SELF_IPI).
			 * The pending event will be picked up when the vcpu
			 * transitions back to guest context.
			 */
		}
	} else {
		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
		    "with hostcpu %d", vcpu->state, hostcpu));
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	}
}

void
vcpu_notify_event(struct vcpu *vcpu)
{
	vcpu_lock(vcpu);
	vcpu_notify_event_locked(vcpu);
	vcpu_unlock(vcpu);
}

struct vmspace *
vm_vmspace(struct vm *vm)
{
	return (vm->vmspace);
}

struct vm_mem *
vm_mem(struct vm *vm)
{
	return (&vm->mem);
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* Flush host state to the pcb. */
	fpe_state_save(curthread);

	/* Ensure the VFP state will be re-loaded when exiting the guest. */
	PCPU_SET(fpcurthread, NULL);

	/* restore guest FPU state */
	fpe_enable();
	fpe_restore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpe_disable();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	/* Save guest FPE state. */
	fpe_enable();
	fpe_store(vcpu->guestfpu);
	fpe_disable();

	KASSERT(PCPU_GET(fpcurthread) == NULL,
	    ("%s: fpcurthread set with guest registers", __func__));
}

static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
    bool from_idle)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE) {
			vcpu_notify_event_locked(vcpu);
			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
		}
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	if (newstate == VCPU_IDLE)
		wakeup(&vcpu->state);

	return (0);
}

static void
vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vcpu, newstate, false)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

int
vm_get_capability(struct vcpu *vcpu, int type, int *retval)
{

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (vmmops_getcap(vcpu->cookie, type, retval));
}

int
vm_set_capability(struct vcpu *vcpu, int type, int val)
{

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (vmmops_setcap(vcpu->cookie, type, val));
}

struct vm *
vcpu_vm(struct vcpu *vcpu)
{

	return (vcpu->vm);
}

int
vcpu_vcpuid(struct vcpu *vcpu)
{

	return (vcpu->vcpuid);
}

void *
vcpu_get_cookie(struct vcpu *vcpu)
{

	return (vcpu->cookie);
}

struct vcpu *
vm_vcpu(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid]);
}

int
vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle)
{
	int error;

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vcpu *vcpu, int *hostcpu)
{
	enum vcpu_state state;

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
{

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (vmmops_getreg(vcpu->cookie, reg, retval));
}

int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;

	if (reg >= VM_REG_LAST)
		return (EINVAL);
	error = vmmops_setreg(vcpu->cookie, reg, val);
	if (error || reg != VM_REG_GUEST_SEPC)
		return (error);

	vcpu->nextpc = val;

	return (0);
}

void *
vm_get_cookie(struct vm *vm)
{

	return (vm->cookie);
}

int
vm_inject_exception(struct vcpu *vcpu, uint64_t scause)
{

	return (vmmops_exception(vcpu->cookie, scause));
}

int
vm_attach_aplic(struct vm *vm, struct vm_aplic_descr *descr)
{

	return (aplic_attach_to_vm(vm->cookie, descr));
}

int
vm_assert_irq(struct vm *vm, uint32_t irq)
{

	return (aplic_inject_irq(vm->cookie, -1, irq, true));
}

int
vm_deassert_irq(struct vm *vm, uint32_t irq)
{

	return (aplic_inject_irq(vm->cookie, -1, irq, false));
}

int
vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot,
    int func)
{

	return (aplic_inject_msi(vm->cookie, msg, addr));
}

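/*
 * Idle a vCPU that executed WFI: sleep until an APLIC source, IPI or other
 * interrupt becomes pending, or until the vCPU should yield to the
 * scheduler.
 */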
static int
vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
{

	vcpu_lock(vcpu);

	while (1) {
		if (aplic_check_pending(vcpu->cookie))
			break;

		if (riscv_check_ipi(vcpu->cookie, false))
			break;

		if (riscv_check_interrupts_pending(vcpu->cookie))
			break;

		if (vcpu_should_yield(vcpu))
			break;

		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		/*
		 * XXX msleep_spin() cannot be interrupted by signals so
		 * wake up periodically to check pending signals.
		 */
		msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
	}
	vcpu_unlock(vcpu);

	*retu = false;

	return (0);
}

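/*
 * Handle a guest (stage 2) page fault.  The faulting guest physical
 * address is recovered from htval; try a quick pmap fixup first, then
 * fall back to a full vm_fault() on the guest's vmspace.
 */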
static int
vm_handle_paging(struct vcpu *vcpu, bool *retu)
{
	struct vm *vm;
	struct vm_exit *vme;
	struct vm_map *map;
	uint64_t addr;
	pmap_t pmap;
	int ftype, rv;

	vm = vcpu->vm;
	vme = &vcpu->exitinfo;

	pmap = vmspace_pmap(vm->vmspace);
	addr = (vme->htval << 2) & ~(PAGE_SIZE - 1);

	dprintf("%s: %lx\n", __func__, addr);

	switch (vme->scause) {
	case SCAUSE_STORE_GUEST_PAGE_FAULT:
		ftype = VM_PROT_WRITE;
		break;
	case SCAUSE_FETCH_GUEST_PAGE_FAULT:
		ftype = VM_PROT_EXECUTE;
		break;
	case SCAUSE_LOAD_GUEST_PAGE_FAULT:
		ftype = VM_PROT_READ;
		break;
	default:
		panic("unknown page trap: %lu", vme->scause);
	}

	/* The page exists, but the page table needs to be updated. */
	if (pmap_fault(pmap, addr, ftype))
		return (0);

	map = &vm->vmspace->vm_map;
	rv = vm_fault(map, addr, ftype, VM_FAULT_NORMAL, NULL);
	if (rv != KERN_SUCCESS) {
		printf("%s: vm_fault failed, addr %lx, ftype %d, err %d\n",
		    __func__, addr, ftype, rv);
		return (EFAULT);
	}

	return (0);
}

static int
vm_handle_suspend(struct vcpu *vcpu, bool *retu)
{
	struct vm *vm = vcpu->vm;
	int error, i;
	struct thread *td;

	error = 0;
	td = curthread;

	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 *
	 * Since a VM may be suspended at any time including when one or
	 * more vcpus are doing a rendezvous we need to call the rendezvous
	 * handler while we are waiting to prevent a deadlock.
	 */
	vcpu_lock(vcpu);
	while (error == 0) {
		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0)
			break;

		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		if (td_ast_pending(td, TDA_SUSPEND)) {
			vcpu_unlock(vcpu);
			error = thread_check_susp(td, false);
			vcpu_lock(vcpu);
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wakeup the other sleeping vcpus and return to userspace.
	 */
	for (i = 0; i < vm->maxcpus; i++) {
		if (CPU_ISSET(i, &vm->suspended_cpus)) {
			vcpu_notify_event(vm_vcpu(vm, i));
		}
	}

	*retu = true;
	return (error);
}

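/*
 * Main run loop for a vCPU: enter the guest via vmmops_run() and service
 * exits in the kernel where possible; 'retu' selects exits that must be
 * completed in userspace.
 */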
int
vm_run(struct vcpu *vcpu)
{
	struct vm_eventinfo evinfo;
	struct vm_exit *vme;
	struct vm *vm;
	pmap_t pmap;
	int error;
	int vcpuid;
	bool retu;

	vm = vcpu->vm;

	dprintf("%s\n", __func__);

	vcpuid = vcpu->vcpuid;

	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
		return (EINVAL);

	pmap = vmspace_pmap(vm->vmspace);
	vme = &vcpu->exitinfo;
	evinfo.rptr = NULL;
	evinfo.sptr = &vm->suspend;
	evinfo.iptr = NULL;
restart:
	critical_enter();

	restore_guest_fpustate(vcpu);

	vcpu_require_state(vcpu, VCPU_RUNNING);
	error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo);
	vcpu_require_state(vcpu, VCPU_FROZEN);

	save_guest_fpustate(vcpu);

	critical_exit();

	if (error == 0) {
		retu = false;
		switch (vme->exitcode) {
		case VM_EXITCODE_INST_EMUL:
			vcpu->nextpc = vme->pc + vme->inst_length;
			error = vm_handle_inst_emul(vcpu, &retu);
			break;
		case VM_EXITCODE_WFI:
			vcpu->nextpc = vme->pc + vme->inst_length;
			error = vm_handle_wfi(vcpu, vme, &retu);
			break;
		case VM_EXITCODE_ECALL:
			/* Handle in userland. */
			vcpu->nextpc = vme->pc + vme->inst_length;
			retu = true;
			break;
		case VM_EXITCODE_PAGING:
			vcpu->nextpc = vme->pc;
			error = vm_handle_paging(vcpu, &retu);
			break;
		case VM_EXITCODE_BOGUS:
			vcpu->nextpc = vme->pc;
			retu = false;
			error = 0;
			break;
		case VM_EXITCODE_SUSPENDED:
			vcpu->nextpc = vme->pc;
			error = vm_handle_suspend(vcpu, &retu);
			break;
		default:
			/* Handle in userland. */
			vcpu->nextpc = vme->pc;
			retu = true;
			break;
		}
	}

	if (error == 0 && retu == false)
		goto restart;

	return (error);
}
1248