/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2015 Mihai Carabas <mihai.carabas@gmail.com>
 * Copyright (c) 2024 Ruslan Bukin <br@bsdpad.com>
 *
 * This software was developed by the University of Cambridge Computer
 * Laboratory (Department of Computer Science and Technology) under Innovate
 * UK project 105694, "Digital Security by Design (DSbD) Technology Platform
 * Prototype".
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/riscvreg.h>
#include <machine/cpu.h>
#include <machine/fpe.h>
#include <machine/machdep.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <machine/vm.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>

#include <dev/pci/pcireg.h>

#include <dev/vmm/vmm_dev.h>
#include <dev/vmm/vmm_ktr.h>
#include <dev/vmm/vmm_mem.h>

#include "vmm_stat.h"
#include "riscv.h"

#include "vmm_aplic.h"

struct vcpu {
	int		flags;
	enum vcpu_state	state;
	struct mtx	mtx;
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	int		vcpuid;
	void		*stats;
	struct vm_exit	exitinfo;
	uint64_t	nextpc;		/* (x) next instruction to execute */
	struct vm	*vm;		/* (o) */
	void		*cookie;	/* (i) cpu-specific data */
	struct fpreg	*guestfpu;	/* (a,i) guest fpu state */
};

#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock_destroy(v)	mtx_destroy(&((v)->mtx))
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
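
/*
 * A guest physical address range whose loads and stores are emulated
 * in the kernel (e.g. by the APLIC emulation) instead of being bounced
 * out to userspace.
 */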
struct vmm_mmio_region {
	uint64_t start;
	uint64_t end;
	mem_region_read_t read;
	mem_region_write_t write;
};
#define	VM_MAX_MMIO_REGIONS	4

/*
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;		/* (i) cpu-specific data */
	volatile cpuset_t active_cpus;		/* (i) active vcpus */
	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for debug */
	int		suspend;		/* (i) stop VM execution */
	bool		dying;			/* (o) is dying */
	volatile cpuset_t suspended_cpus;	/* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
	struct vm_mem	mem;			/* (i) [m+v] guest memory */
	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
	struct vcpu	**vcpu;			/* (i) guest vcpus */
	struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS];
						/* (o) guest MMIO regions */
	/* The following describe the vm cpu topology */
	uint16_t	sockets;		/* (o) num of sockets */
	uint16_t	cores;			/* (o) num of cores/socket */
	uint16_t	threads;		/* (o) num of threads/core */
	uint16_t	maxcpus;		/* (o) max pluggable cpus */
	struct sx	vcpus_init_lock;	/* (o) */
};

static bool vmm_initialized = false;

static MALLOC_DEFINE(M_VMM, "vmm", "vmm");

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");

u_int vm_maxcpu;
SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &vm_maxcpu, 0, "Maximum number of vCPUs");

static void vcpu_notify_event_locked(struct vcpu *vcpu);

/* global statistics */
VMM_STAT(VMEXIT_COUNT, "total number of vm exits");
VMM_STAT(VMEXIT_IRQ, "number of vmexits for an irq");
VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception");

/*
 * Upper limit on vm_maxcpu.  We could increase this to 28 bits, but this
 * is a safe value for now.
 */
#define	VM_MAXCPU	MIN(0xffff - 1, CPU_SETSIZE)
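
/*
 * Release the backend state of a vcpu.  When 'destroy' is true the vcpu
 * itself (stats, FPU save area, lock and the structure) is freed as well;
 * otherwise the vcpu is left in place to be re-initialized by vcpu_init().
 */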
static void
vcpu_cleanup(struct vcpu *vcpu, bool destroy)
{
	vmmops_vcpu_cleanup(vcpu->cookie);
	vcpu->cookie = NULL;
	if (destroy) {
		vmm_stat_free(vcpu->stats);
		fpu_save_area_free(vcpu->guestfpu);
		vcpu_lock_destroy(vcpu);
		free(vcpu, M_VMM);
	}
}

static struct vcpu *
vcpu_alloc(struct vm *vm, int vcpu_id)
{
	struct vcpu *vcpu;

	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
	    ("vcpu_alloc: invalid vcpu %d", vcpu_id));

	vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO);
	vcpu_lock_init(vcpu);
	vcpu->state = VCPU_IDLE;
	vcpu->hostcpu = NOCPU;
	vcpu->vcpuid = vcpu_id;
	vcpu->vm = vm;
	vcpu->guestfpu = fpu_save_area_alloc();
	vcpu->stats = vmm_stat_alloc();
	return (vcpu);
}

static void
vcpu_init(struct vcpu *vcpu)
{
	vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
	MPASS(vcpu->cookie != NULL);
	fpu_save_area_reset(vcpu->guestfpu);
	vmm_stat_init(vcpu->stats);
}

struct vm_exit *
vm_exitinfo(struct vcpu *vcpu)
{
	return (&vcpu->exitinfo);
}

static int
vmm_init(void)
{

	vm_maxcpu = mp_ncpus;

	TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);

	if (vm_maxcpu > VM_MAXCPU) {
		printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
		vm_maxcpu = VM_MAXCPU;
	}

	if (vm_maxcpu == 0)
		vm_maxcpu = 1;

	return (vmmops_modinit());
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		error = vmmdev_init();
		if (error != 0)
			break;
		error = vmm_init();
		if (error == 0)
			vmm_initialized = true;
		else
			(void)vmmdev_cleanup();
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0 && vmm_initialized) {
			error = vmmops_modcleanup();
			if (error) {
				/*
				 * Something bad happened - prevent new
				 * VMs from being created
				 */
				vmm_initialized = false;
			}
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - vmm device initialization requires an initialized devfs.
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_DEVFS + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);
static void
vm_init(struct vm *vm, bool create)
{
	int i;

	vm->cookie = vmmops_init(vm, vmspace_pmap(vm_vmspace(vm)));
	MPASS(vm->cookie != NULL);

	CPU_ZERO(&vm->active_cpus);
	CPU_ZERO(&vm->debug_cpus);

	vm->suspend = 0;
	CPU_ZERO(&vm->suspended_cpus);

	memset(vm->mmio_region, 0, sizeof(vm->mmio_region));

	if (!create) {
		for (i = 0; i < vm->maxcpus; i++) {
			if (vm->vcpu[i] != NULL)
				vcpu_init(vm->vcpu[i]);
		}
	}
}

void
vm_disable_vcpu_creation(struct vm *vm)
{
	sx_xlock(&vm->vcpus_init_lock);
	vm->dying = true;
	sx_xunlock(&vm->vcpus_init_lock);
}

struct vcpu *
vm_alloc_vcpu(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
		return (NULL);

	vcpu = (struct vcpu *)
	    atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]);
	if (__predict_true(vcpu != NULL))
		return (vcpu);

	sx_xlock(&vm->vcpus_init_lock);
	vcpu = vm->vcpu[vcpuid];
	if (vcpu == NULL && !vm->dying) {
		vcpu = vcpu_alloc(vm, vcpuid);
		vcpu_init(vcpu);

		/*
		 * Ensure vCPU is fully created before updating pointer
		 * to permit unlocked reads above.
		 */
		atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
		    (uintptr_t)vcpu);
	}
	sx_xunlock(&vm->vcpus_init_lock);
	return (vcpu);
}

void
vm_lock_vcpus(struct vm *vm)
{
	sx_xlock(&vm->vcpus_init_lock);
}

void
vm_unlock_vcpus(struct vm *vm)
{
	sx_unlock(&vm->vcpus_init_lock);
}

int
vm_create(const char *name, struct vm **retvm)
{
	struct vm *vm;
	int error;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO);
	error = vm_mem_init(&vm->mem, 0, 1ul << 39);
	if (error != 0) {
		free(vm, M_VMM);
		return (error);
	}
	strcpy(vm->name, name);
	sx_init(&vm->vcpus_init_lock, "vm vcpus");

	vm->sockets = 1;
	vm->cores = 1;			/* XXX backwards compatibility */
	vm->threads = 1;		/* XXX backwards compatibility */
	vm->maxcpus = vm_maxcpu;

	vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM,
	    M_WAITOK | M_ZERO);

	vm_init(vm, true);

	*retvm = vm;
	return (0);
}

void
vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
    uint16_t *threads, uint16_t *maxcpus)
{
	*sockets = vm->sockets;
	*cores = vm->cores;
	*threads = vm->threads;
	*maxcpus = vm->maxcpus;
}

uint16_t
vm_get_maxcpus(struct vm *vm)
{
	return (vm->maxcpus);
}
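
/*
 * Set the virtual CPU topology.  The product of sockets, cores and threads
 * may not exceed maxcpus, which is fixed at VM creation time; the maxcpus
 * argument itself is ignored here.
 */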
int
vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
    uint16_t threads, uint16_t maxcpus)
{
	/* Ignore maxcpus. */
	if ((sockets * cores * threads) > vm->maxcpus)
		return (EINVAL);
	vm->sockets = sockets;
	vm->cores = cores;
	vm->threads = threads;
	return (0);
}

static void
vm_cleanup(struct vm *vm, bool destroy)
{
	int i;

	if (destroy)
		vm_xlock_memsegs(vm);
	else
		vm_assert_memseg_xlocked(vm);

	aplic_detach_from_vm(vm->cookie);

	for (i = 0; i < vm->maxcpus; i++) {
		if (vm->vcpu[i] != NULL)
			vcpu_cleanup(vm->vcpu[i], destroy);
	}

	vmmops_cleanup(vm->cookie);

	vm_mem_cleanup(vm);
	if (destroy) {
		vm_mem_destroy(vm);

		free(vm->vcpu, M_VMM);
		sx_destroy(&vm->vcpus_init_lock);
	}
}

void
vm_destroy(struct vm *vm)
{

	vm_cleanup(vm, true);

	free(vm, M_VMM);
}

int
vm_reinit(struct vm *vm)
{
	int error;

	/*
	 * A virtual machine can be reset only if all vcpus are suspended.
	 */
	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
		vm_cleanup(vm, false);
		vm_init(vm, false);
		error = 0;
	} else {
		error = EBUSY;
	}

	return (error);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *is_fault)
{
	return (vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault));
}

void
vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size,
    mem_region_read_t mmio_read, mem_region_write_t mmio_write)
{
	int i;

	for (i = 0; i < nitems(vm->mmio_region); i++) {
		if (vm->mmio_region[i].start == 0 &&
		    vm->mmio_region[i].end == 0) {
			vm->mmio_region[i].start = start;
			vm->mmio_region[i].end = start + size;
			vm->mmio_region[i].read = mmio_read;
			vm->mmio_region[i].write = mmio_write;
			return;
		}
	}

	panic("%s: No free MMIO region", __func__);
}

void
vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size)
{
	int i;

	for (i = 0; i < nitems(vm->mmio_region); i++) {
		if (vm->mmio_region[i].start == start &&
		    vm->mmio_region[i].end == start + size) {
			memset(&vm->mmio_region[i], 0,
			    sizeof(vm->mmio_region[i]));
			return;
		}
	}

	panic("%s: Invalid MMIO region: %lx - %lx", __func__, start,
	    start + size);
}

static int
vm_handle_inst_emul(struct vcpu *vcpu, bool *retu)
{
	struct vm *vm;
	struct vm_exit *vme;
	struct vie *vie;
	struct hyp *hyp;
	uint64_t fault_ipa;
	struct vm_guest_paging *paging;
	struct vmm_mmio_region *vmr;
	int error, i;

	vm = vcpu->vm;
	hyp = vm->cookie;
	if (!hyp->aplic_attached)
		goto out_user;

	vme = &vcpu->exitinfo;
	vie = &vme->u.inst_emul.vie;
	paging = &vme->u.inst_emul.paging;

	fault_ipa = vme->u.inst_emul.gpa;

	vmr = NULL;
	for (i = 0; i < nitems(vm->mmio_region); i++) {
		if (vm->mmio_region[i].start <= fault_ipa &&
		    vm->mmio_region[i].end > fault_ipa) {
			vmr = &vm->mmio_region[i];
			break;
		}
	}
	if (vmr == NULL)
		goto out_user;

	error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging,
	    vmr->read, vmr->write, retu);
	return (error);

out_user:
	*retu = true;
	return (0);
}
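
/*
 * Request suspension of the virtual machine.  The first caller records the
 * reason in vm->suspend and kicks every active vcpu so that it exits to
 * userspace with VM_EXITCODE_SUSPENDED; later callers get EALREADY.
 */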
int
vm_suspend(struct vm *vm, enum vm_suspend_how how)
{
	int i;

	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
		return (EINVAL);

	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
		VM_CTR2(vm, "virtual machine already suspended %d/%d",
		    vm->suspend, how);
		return (EALREADY);
	}

	VM_CTR1(vm, "virtual machine successfully suspended %d", how);

	/*
	 * Notify all active vcpus that they are now suspended.
	 */
	for (i = 0; i < vm->maxcpus; i++) {
		if (CPU_ISSET(i, &vm->active_cpus))
			vcpu_notify_event(vm_vcpu(vm, i));
	}

	return (0);
}

void
vm_exit_suspended(struct vcpu *vcpu, uint64_t pc)
{
	struct vm *vm = vcpu->vm;
	struct vm_exit *vmexit;

	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));

	vmexit = vm_exitinfo(vcpu);
	vmexit->pc = pc;
	vmexit->inst_length = 4;
	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
	vmexit->u.suspended.how = vm->suspend;
}

void
vm_exit_debug(struct vcpu *vcpu, uint64_t pc)
{
	struct vm_exit *vmexit;

	vmexit = vm_exitinfo(vcpu);
	vmexit->pc = pc;
	vmexit->inst_length = 4;
	vmexit->exitcode = VM_EXITCODE_DEBUG;
}

int
vm_activate_cpu(struct vcpu *vcpu)
{
	struct vm *vm = vcpu->vm;

	if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
		return (EBUSY);

	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus);
	return (0);
}

int
vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu)
{
	if (vcpu == NULL) {
		vm->debug_cpus = vm->active_cpus;
		for (int i = 0; i < vm->maxcpus; i++) {
			if (CPU_ISSET(i, &vm->active_cpus))
				vcpu_notify_event(vm_vcpu(vm, i));
		}
	} else {
		if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
			return (EINVAL);

		CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
		vcpu_notify_event(vcpu);
	}
	return (0);
}

int
vm_resume_cpu(struct vm *vm, struct vcpu *vcpu)
{

	if (vcpu == NULL) {
		CPU_ZERO(&vm->debug_cpus);
	} else {
		if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus))
			return (EINVAL);

		CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
	}
	return (0);
}

int
vcpu_debugged(struct vcpu *vcpu)
{

	return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus));
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

cpuset_t
vm_debug_cpus(struct vm *vm)
{

	return (vm->debug_cpus);
}

cpuset_t
vm_suspended_cpus(struct vm *vm)
{

	return (vm->suspended_cpus);
}

void *
vcpu_stats(struct vcpu *vcpu)
{

	return (vcpu->stats);
}
/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be
 *   directed to the host_cpu to cause the vcpu to trap into the hypervisor.
 */
static void
vcpu_notify_event_locked(struct vcpu *vcpu)
{
	int hostcpu;

	hostcpu = vcpu->hostcpu;
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
		if (hostcpu != curcpu) {
			ipi_cpu(hostcpu, vmm_ipinum);
		} else {
			/*
			 * If the 'vcpu' is running on 'curcpu' then it must
			 * be sending a notification to itself (e.g. SELF_IPI).
			 * The pending event will be picked up when the vcpu
			 * transitions back to guest context.
			 */
		}
	} else {
		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
		    "with hostcpu %d", vcpu->state, hostcpu));
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	}
}

void
vcpu_notify_event(struct vcpu *vcpu)
{
	vcpu_lock(vcpu);
	vcpu_notify_event_locked(vcpu);
	vcpu_unlock(vcpu);
}

struct vm_mem *
vm_mem(struct vm *vm)
{
	return (&vm->mem);
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* Flush host state to the pcb. */
	fpe_state_save(curthread);

	/* Ensure the host FPU state will be re-loaded when exiting the guest. */
	PCPU_SET(fpcurthread, NULL);

	/* restore guest FPU state */
	fpe_enable();
	fpe_restore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpe_disable();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	/* Save guest FPE state. */
	fpe_enable();
	fpe_store(vcpu->guestfpu);
	fpe_disable();

	KASSERT(PCPU_GET(fpcurthread) == NULL,
	    ("%s: fpcurthread set with guest registers", __func__));
}

static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
    bool from_idle)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE) {
			vcpu_notify_event_locked(vcpu);
			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
		}
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	if (newstate == VCPU_IDLE)
		wakeup(&vcpu->state);

	return (0);
}
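
/*
 * Wrappers around vcpu_set_state() / vcpu_set_state_locked() that panic on
 * failure; used for transitions that are expected to always succeed, e.g.
 * FROZEN <-> RUNNING around guest entry and FROZEN <-> SLEEPING while
 * waiting for an event.
 */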
static void
vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vcpu, newstate, false)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

int
vm_get_capability(struct vcpu *vcpu, int type, int *retval)
{

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (vmmops_getcap(vcpu->cookie, type, retval));
}

int
vm_set_capability(struct vcpu *vcpu, int type, int val)
{

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (vmmops_setcap(vcpu->cookie, type, val));
}

struct vm *
vcpu_vm(struct vcpu *vcpu)
{

	return (vcpu->vm);
}

int
vcpu_vcpuid(struct vcpu *vcpu)
{

	return (vcpu->vcpuid);
}

void *
vcpu_get_cookie(struct vcpu *vcpu)
{

	return (vcpu->cookie);
}

struct vcpu *
vm_vcpu(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid]);
}

int
vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle)
{
	int error;

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vcpu *vcpu, int *hostcpu)
{
	enum vcpu_state state;

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
{
	if (reg < 0 || reg >= VM_REG_LAST)
		return (EINVAL);

	return (vmmops_getreg(vcpu->cookie, reg, retval));
}

int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;

	if (reg < 0 || reg >= VM_REG_LAST)
		return (EINVAL);
	error = vmmops_setreg(vcpu->cookie, reg, val);
	if (error || reg != VM_REG_GUEST_SEPC)
		return (error);

	vcpu->nextpc = val;

	return (0);
}

void *
vm_get_cookie(struct vm *vm)
{

	return (vm->cookie);
}

int
vm_inject_exception(struct vcpu *vcpu, uint64_t scause)
{

	return (vmmops_exception(vcpu->cookie, scause));
}

int
vm_attach_aplic(struct vm *vm, struct vm_aplic_descr *descr)
{

	return (aplic_attach_to_vm(vm->cookie, descr));
}

int
vm_assert_irq(struct vm *vm, uint32_t irq)
{

	return (aplic_inject_irq(vm->cookie, -1, irq, true));
}

int
vm_deassert_irq(struct vm *vm, uint32_t irq)
{

	return (aplic_inject_irq(vm->cookie, -1, irq, false));
}

int
vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot,
    int func)
{

	return (aplic_inject_msi(vm->cookie, msg, addr));
}
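
/*
 * Handle a guest WFI exit: block the vcpu until an interrupt, IPI or
 * suspend request is pending, waking up once per tick since msleep_spin()
 * cannot be interrupted by signals.
 */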
static int
vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
{
	struct vm *vm;

	vm = vcpu->vm;
	vcpu_lock(vcpu);
	while (1) {
		if (vm->suspend)
			break;

		if (aplic_check_pending(vcpu->cookie))
			break;

		if (riscv_check_ipi(vcpu->cookie, false))
			break;

		if (riscv_check_interrupts_pending(vcpu->cookie))
			break;

		if (vcpu_should_yield(vcpu))
			break;

		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		/*
		 * XXX msleep_spin() cannot be interrupted by signals so
		 * wake up periodically to check pending signals.
		 */
		msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
	}
	vcpu_unlock(vcpu);

	*retu = false;

	return (0);
}

static int
vm_handle_paging(struct vcpu *vcpu, bool *retu)
{
	struct vm *vm;
	struct vm_exit *vme;
	struct vm_map *map;
	uint64_t addr;
	pmap_t pmap;
	int ftype, rv;

	vm = vcpu->vm;
	vme = &vcpu->exitinfo;

	pmap = vmspace_pmap(vm_vmspace(vm));
	addr = (vme->htval << 2) & ~(PAGE_SIZE - 1);

	dprintf("%s: %lx\n", __func__, addr);

	switch (vme->scause) {
	case SCAUSE_STORE_GUEST_PAGE_FAULT:
		ftype = VM_PROT_WRITE;
		break;
	case SCAUSE_FETCH_GUEST_PAGE_FAULT:
		ftype = VM_PROT_EXECUTE;
		break;
	case SCAUSE_LOAD_GUEST_PAGE_FAULT:
		ftype = VM_PROT_READ;
		break;
	default:
		panic("unknown page trap: %lu", vme->scause);
	}

	/* The page exists, but the page table needs to be updated. */
	if (pmap_fault(pmap, addr, ftype))
		return (0);

	map = &vm_vmspace(vm)->vm_map;
	rv = vm_fault(map, addr, ftype, VM_FAULT_NORMAL, NULL);
	if (rv != KERN_SUCCESS) {
		printf("%s: vm_fault failed, addr %lx, ftype %d, err %d\n",
		    __func__, addr, ftype, rv);
		return (EFAULT);
	}

	return (0);
}
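
/*
 * Handle a suspend exit: mark this vcpu as suspended, wait for the remaining
 * active vcpus to do the same, then return to userspace.
 */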
static int
vm_handle_suspend(struct vcpu *vcpu, bool *retu)
{
	struct vm *vm = vcpu->vm;
	int error, i;
	struct thread *td;

	error = 0;
	td = curthread;

	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 *
	 * Since a VM may be suspended at any time including when one or
	 * more vcpus are doing a rendezvous we need to call the rendezvous
	 * handler while we are waiting to prevent a deadlock.
	 */
	vcpu_lock(vcpu);
	while (error == 0) {
		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0)
			break;

		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		if (td_ast_pending(td, TDA_SUSPEND)) {
			vcpu_unlock(vcpu);
			error = thread_check_susp(td, false);
			vcpu_lock(vcpu);
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wakeup the other sleeping vcpus and return to userspace.
	 */
	for (i = 0; i < vm->maxcpus; i++) {
		if (CPU_ISSET(i, &vm->suspended_cpus)) {
			vcpu_notify_event(vm_vcpu(vm, i));
		}
	}

	*retu = true;
	return (error);
}

int
vm_run(struct vcpu *vcpu)
{
	struct vm_eventinfo evinfo;
	struct vm_exit *vme;
	struct vm *vm;
	pmap_t pmap;
	int error;
	int vcpuid;
	bool retu;

	vm = vcpu->vm;

	dprintf("%s\n", __func__);

	vcpuid = vcpu->vcpuid;

	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
		return (EINVAL);

	pmap = vmspace_pmap(vm_vmspace(vm));
	vme = &vcpu->exitinfo;
	evinfo.rptr = NULL;
	evinfo.sptr = &vm->suspend;
	evinfo.iptr = NULL;
restart:
	critical_enter();

	restore_guest_fpustate(vcpu);

	vcpu_require_state(vcpu, VCPU_RUNNING);
	error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo);
	vcpu_require_state(vcpu, VCPU_FROZEN);

	save_guest_fpustate(vcpu);

	critical_exit();

	if (error == 0) {
		retu = false;
		switch (vme->exitcode) {
		case VM_EXITCODE_INST_EMUL:
			vcpu->nextpc = vme->pc + vme->inst_length;
			error = vm_handle_inst_emul(vcpu, &retu);
			break;
		case VM_EXITCODE_WFI:
			vcpu->nextpc = vme->pc + vme->inst_length;
			error = vm_handle_wfi(vcpu, vme, &retu);
			break;
		case VM_EXITCODE_ECALL:
			/* Handle in userland. */
			vcpu->nextpc = vme->pc + vme->inst_length;
			retu = true;
			break;
		case VM_EXITCODE_PAGING:
			vcpu->nextpc = vme->pc;
			error = vm_handle_paging(vcpu, &retu);
			break;
		case VM_EXITCODE_BOGUS:
			vcpu->nextpc = vme->pc;
			retu = false;
			error = 0;
			break;
		case VM_EXITCODE_SUSPENDED:
			vcpu->nextpc = vme->pc;
			error = vm_handle_suspend(vcpu, &retu);
			break;
		default:
			/* Handle in userland. */
			vcpu->nextpc = vme->pc;
			retu = true;
			break;
		}
	}

	if (error == 0 && retu == false)
		goto restart;

	return (error);
}