// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024, Microsoft Corporation.
 *
 * The main part of the mshv_root module, providing APIs to create
 * and manage guest partitions.
 *
 * Authors: Microsoft Linux virtualization team
 */

#include <linux/entry-virt.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/cpuhotplug.h>
#include <linux/random.h>
#include <asm/mshyperv.h>
#include <linux/hyperv.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/kexec.h>
#include <linux/page-flags.h>
#include <linux/crash_dump.h>
#include <linux/panic_notifier.h>
#include <linux/vmalloc.h>
#include <linux/rseq.h>

#include "mshv_eventfd.h"
#include "mshv.h"
#include "mshv_root.h"

MODULE_AUTHOR("Microsoft");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");

/* HV_THREAD_COUNTER */
#if defined(CONFIG_X86_64)
#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 202
#elif defined(CONFIG_ARM64)
#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 95
#endif

struct mshv_root mshv_root;

enum hv_scheduler_type hv_scheduler_type;

/* Once we implement the fast extended hypercall ABI they can go away. */
static void * __percpu *root_scheduler_input;
static void * __percpu *root_scheduler_output;

static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_dev_open(struct inode *inode, struct file *filp);
static int mshv_dev_release(struct inode *inode, struct file *filp);
static int mshv_vp_release(struct inode *inode, struct file *filp);
static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_partition_release(struct inode *inode, struct file *filp);
static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma);
static vm_fault_t mshv_vp_fault(struct vm_fault *vmf);
static int mshv_init_async_handler(struct mshv_partition *partition);
static void mshv_async_hvcall_handler(void *data, u64 *status);

static const union hv_input_vtl input_vtl_zero;
static const union hv_input_vtl input_vtl_normal = {
	.target_vtl = HV_NORMAL_VTL,
	.use_target_vtl = 1,
};

static const struct vm_operations_struct mshv_vp_vm_ops = {
	.fault = mshv_vp_fault,
};

static const struct file_operations mshv_vp_fops = {
	.owner = THIS_MODULE,
	.release = mshv_vp_release,
	.unlocked_ioctl = mshv_vp_ioctl,
	.llseek = noop_llseek,
	.mmap = mshv_vp_mmap,
};

static const struct file_operations mshv_partition_fops = {
	.owner = THIS_MODULE,
	.release = mshv_partition_release,
	.unlocked_ioctl = mshv_partition_ioctl,
	.llseek = noop_llseek,
};

static const struct file_operations mshv_dev_fops = {
	.owner = THIS_MODULE,
	.open = mshv_dev_open,
	.release = mshv_dev_release,
	.unlocked_ioctl = mshv_dev_ioctl,
	.llseek = noop_llseek,
};

static struct miscdevice mshv_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "mshv",
	.fops = &mshv_dev_fops,
	.mode = 0600,
};

/*
 * Only allow hypercalls that have a u64 partition id as the first member of
 * the input structure.
 * These are sorted by value.
 */
static u16 mshv_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_GET_PARTITION_PROPERTY_EX,
	HVCALL_SET_PARTITION_PROPERTY,
	HVCALL_INSTALL_INTERCEPT,
	HVCALL_GET_VP_REGISTERS,
	HVCALL_SET_VP_REGISTERS,
	HVCALL_TRANSLATE_VIRTUAL_ADDRESS,
	HVCALL_CLEAR_VIRTUAL_INTERRUPT,
	HVCALL_REGISTER_INTERCEPT_RESULT,
	HVCALL_ASSERT_VIRTUAL_INTERRUPT,
	HVCALL_GET_GPA_PAGES_ACCESS_STATES,
	HVCALL_SIGNAL_EVENT_DIRECT,
	HVCALL_POST_MESSAGE_DIRECT,
	HVCALL_GET_VP_CPUID_VALUES,
};

/*
 * Only allow hypercalls that are safe to be called by the VMM with the host
 * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a
 * hypercall cannot be misused by the VMM before adding it to this list.
 */
static u16 mshv_self_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_GET_PARTITION_PROPERTY_EX,
};

static bool mshv_hvcall_is_async(u16 code)
{
	switch (code) {
	case HVCALL_SET_PARTITION_PROPERTY:
		return true;
	default:
		break;
	}
	return false;
}

static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id)
{
	int i;
	int n = ARRAY_SIZE(mshv_passthru_hvcalls);
	u16 *allowed_hvcalls = mshv_passthru_hvcalls;

	if (pt_id == HV_PARTITION_ID_SELF) {
		n = ARRAY_SIZE(mshv_self_passthru_hvcalls);
		allowed_hvcalls = mshv_self_passthru_hvcalls;
	}

	for (i = 0; i < n; ++i)
		if (allowed_hvcalls[i] == code)
			return true;

	return false;
}

static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
				      bool partition_locked,
				      void __user *user_args)
{
	u64 status;
	int ret = 0;
	bool is_async;
	struct mshv_root_hvcall args;
	struct page *page;
	unsigned int pages_order;
	void *input_pg = NULL;
	void *output_pg = NULL;
	u16 reps_completed;
	u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) ||
	    mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE)
		return -EINVAL;

	if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
		return -EINVAL;

	if (!mshv_passthru_hvcall_allowed(args.code, pt_id))
		return -EINVAL;

	is_async = mshv_hvcall_is_async(args.code);
	if (is_async) {
		/* async hypercalls can only be called from partition fd */
		if (!partition || !partition_locked)
			return -EINVAL;
		ret = mshv_init_async_handler(partition);
		if (ret)
			return ret;
	}

	pages_order = args.out_ptr ? 1 : 0;
	page = alloc_pages(GFP_KERNEL, pages_order);
	if (!page)
		return -ENOMEM;
	input_pg = page_address(page);

	if (args.out_ptr)
		output_pg = (char *)input_pg + PAGE_SIZE;
	else
		output_pg = NULL;

	if (copy_from_user(input_pg, (void __user *)args.in_ptr,
			   args.in_sz)) {
		ret = -EFAULT;
		goto free_pages_out;
	}

	/*
	 * NOTE: This only works because all the allowed hypercalls' input
	 * structs begin with a u64 partition_id field.
	 */
	*(u64 *)input_pg = pt_id;

	reps_completed = 0;
	do {
		if (args.reps) {
			status = hv_do_rep_hypercall_ex(args.code, args.reps,
							0, reps_completed,
							input_pg, output_pg);
			reps_completed = hv_repcomp(status);
		} else {
			status = hv_do_hypercall(args.code, input_pg, output_pg);
		}

		if (hv_result(status) == HV_STATUS_CALL_PENDING) {
			if (is_async) {
				mshv_async_hvcall_handler(partition, &status);
			} else { /* Paranoia check. This shouldn't happen! */
				ret = -EBADFD;
				goto free_pages_out;
			}
		}

		if (hv_result_success(status))
			break;

		if (!hv_result_needs_memory(status))
			ret = hv_result_to_errno(status);
		else
			ret = hv_deposit_memory(pt_id, status);
	} while (!ret);

	args.status = hv_result(status);
	args.reps = reps_completed;
	if (copy_to_user(user_args, &args, sizeof(args)))
		ret = -EFAULT;

	if (!ret && output_pg &&
	    copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
		ret = -EFAULT;

free_pages_out:
	free_pages((unsigned long)input_pg, pages_order);

	return ret;
}

static inline bool is_ghcb_mapping_available(void)
{
#if IS_ENABLED(CONFIG_X86_64)
	return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE;
#else
	return 0;
#endif
}

static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_get_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}

static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_set_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}

/*
 * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by
 * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend,
 * done by the hypervisor.
 * "Intercept" suspend leads to asynchronous message delivery to dom0 which
 * should be awaited to keep the VP loop consistent (i.e. no message pending
 * upon VP resume).
 * VP intercept suspend can't be done when the VP is explicitly suspended
 * already, and thus there can be only two possible race scenarios:
 *   1. implicit suspend bit set -> explicit suspend bit set -> message sent
 *   2. implicit suspend bit set -> message sent -> explicit suspend bit set
 * Checking for the implicit suspend bit after the explicit suspend request
 * has succeeded covers either case, and allows us to reliably identify
 * whether there is a message to receive and deliver to the VMM.
 */
static int
mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
{
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND
	};
	struct hv_register_assoc intercept_suspend = {
		.name = HV_REGISTER_INTERCEPT_SUSPEND
	};
	union hv_explicit_suspend_register *es =
		&explicit_suspend.value.explicit_suspend;
	union hv_intercept_suspend_register *is =
		&intercept_suspend.value.intercept_suspend;
	int ret;

	es->suspended = 1;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &explicit_suspend);
	if (ret) {
		vp_err(vp, "Failed to explicitly suspend vCPU\n");
		return ret;
	}

	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &intercept_suspend);
	if (ret) {
		vp_err(vp, "Failed to get intercept suspend state\n");
		return ret;
	}

	*message_in_flight = is->suspended;

	return 0;
}

/*
 * This function is used when VPs are scheduled by the hypervisor's
 * scheduler.
 *
 * Caller has to make sure the registers contain cleared
 * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers
 * exactly in this order (the hypervisor clears them sequentially) to avoid
 * potentially clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND after
 * the VP is released from HV_REGISTER_EXPLICIT_SUSPEND, which could happen
 * with the opposite order.
 */
static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
{
	long ret;
	struct hv_register_assoc suspend_regs[2] = {
		{ .name = HV_REGISTER_INTERCEPT_SUSPEND },
		{ .name = HV_REGISTER_EXPLICIT_SUSPEND }
	};
	size_t count = ARRAY_SIZE(suspend_regs);

	/* Resume VP execution */
	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    count, suspend_regs);
	if (ret) {
		vp_err(vp, "Failed to resume vp execution. %lx\n", ret);
		return ret;
	}

	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       vp->run.kicked_by_hv == 1);
	if (ret) {
		bool message_in_flight;

		/*
		 * Otherwise the waiting was interrupted by a signal: suspend
		 * the vCPU explicitly and copy message in flight (if any).
		 */
		ret = mshv_suspend_vp(vp, &message_in_flight);
		if (ret)
			return ret;

		/* Return if no message in flight */
		if (!message_in_flight)
			return -EINTR;

		/* Wait for the message in flight. */
		wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1);
	}

	/*
	 * Reset the flag to make the wait_event call above work
	 * next time.
	 */
	vp->run.kicked_by_hv = 0;

	return 0;
}

static int
mshv_vp_dispatch(struct mshv_vp *vp, u32 flags,
		 struct hv_output_dispatch_vp *res)
{
	struct hv_input_dispatch_vp *input;
	struct hv_output_dispatch_vp *output;
	u64 status;

	preempt_disable();
	input = *this_cpu_ptr(root_scheduler_input);
	output = *this_cpu_ptr(root_scheduler_output);

	memset(input, 0, sizeof(*input));
	memset(output, 0, sizeof(*output));

	input->partition_id = vp->vp_partition->pt_id;
	input->vp_index = vp->vp_index;
	input->time_slice = 0; /* Run forever until something happens */
	input->spec_ctrl = 0; /* TODO: set sensible flags */
	input->flags = flags;

	vp->run.flags.root_sched_dispatched = 1;
	status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output);
	vp->run.flags.root_sched_dispatched = 0;

	trace_mshv_hvcall_dispatch_vp(vp->vp_partition->pt_id,
				      vp->vp_index, flags,
				      output->dispatch_state,
				      output->dispatch_event,
#if defined(CONFIG_X86_64)
				      vp->vp_register_page->interrupt_vectors.as_uint64,
#else
				      0,
#endif
				      status);

	*res = *output;
	preempt_enable();

	if (!hv_result_success(status))
		vp_err(vp, "%s: status %s\n", __func__,
		       hv_result_to_string(status));

	return hv_result_to_errno(status);
}

static int
mshv_vp_clear_explicit_suspend(struct mshv_vp *vp)
{
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND,
		.value.explicit_suspend.suspended = 0,
	};
	int ret;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &explicit_suspend);

	trace_mshv_vp_clear_explicit_suspend(vp->vp_partition->pt_id,
					     vp->vp_index, ret);

	if (ret)
		vp_err(vp, "Failed to unsuspend\n");

	return ret;
}

#if IS_ENABLED(CONFIG_X86_64)
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	if (!vp->vp_register_page)
		return 0;
	return vp->vp_register_page->interrupt_vectors.as_uint64;
}
#else
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	return 0;
}
#endif

static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp)
{
	struct hv_stats_page **stats = vp->vp_stats_pages;
	u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->data;
	u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->data;

	return parent_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED] ||
	       self_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED];
}

static int
mshv_vp_wait_for_hv_kick(struct mshv_vp *vp)
{
	int ret;

	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       (vp->run.kicked_by_hv == 1 &&
					!mshv_vp_dispatch_thread_blocked(vp)) ||
				       mshv_vp_interrupt_pending(vp));
	if (ret)
		return -EINTR;

	trace_mshv_vp_wait_for_hv_kick(vp->vp_partition->pt_id,
				       vp->vp_index,
				       vp->run.kicked_by_hv,
				       mshv_vp_dispatch_thread_blocked(vp),
				       mshv_vp_interrupt_pending(vp));

	vp->run.flags.root_sched_blocked = 0;
	vp->run.kicked_by_hv = 0;

	return 0;
}

/* Must be called with interrupts enabled */
static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
{
	long ret;

	if (vp->run.flags.root_sched_blocked) {
		/*
		 * Dispatch state of this VP is blocked. Need to wait
		 * for the hypervisor to clear the blocked state before
		 * dispatching it.
		 */
		ret = mshv_vp_wait_for_hv_kick(vp);
		if (ret)
			return ret;
	}

	do {
		u32 flags = 0;
		struct hv_output_dispatch_vp output;

		if (__xfer_to_guest_mode_work_pending()) {
			ret = xfer_to_guest_mode_handle_work();

			trace_mshv_xfer_to_guest_mode_work(vp->vp_partition->pt_id,
							   vp->vp_index,
							   read_thread_flags(),
							   ret);

			if (ret)
				break;
		}

		if (vp->run.flags.intercept_suspend)
			flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND;

		if (mshv_vp_interrupt_pending(vp))
			flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION;

		ret = mshv_vp_dispatch(vp, flags, &output);
		if (ret)
			break;

		vp->run.flags.intercept_suspend = 0;

		if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) {
			if (output.dispatch_event ==
			    HV_VP_DISPATCH_EVENT_SUSPEND) {
				/*
				 * TODO: remove the warning once VP canceling
				 * is supported
				 */
				WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count),
					  "%s: vp#%d: unexpected explicit suspend\n",
					  __func__, vp->vp_index);
				/*
				 * Need to clear explicit suspend before
				 * dispatching.
				 * Explicit suspend is either:
				 * - set right after the first VP dispatch or
				 * - set explicitly via hypercall
				 * Since the latter case is not yet supported,
				 * simply clear it here.
				 */
				ret = mshv_vp_clear_explicit_suspend(vp);
				if (ret)
					break;

				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			} else {
				vp->run.flags.root_sched_blocked = 1;
				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			}
		} else {
			/* HV_VP_DISPATCH_STATE_READY */
			if (output.dispatch_event ==
			    HV_VP_DISPATCH_EVENT_INTERCEPT)
				vp->run.flags.intercept_suspend = 1;
		}
	} while (!vp->run.flags.intercept_suspend);

	rseq_virt_userspace_exit();

	return ret;
}

static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
	      "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");

static struct mshv_mem_region *
mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
{
	struct mshv_mem_region *region;

	hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
		if (gfn >= region->start_gfn &&
		    gfn < region->start_gfn + region->nr_pages)
			return region;
	}

	return NULL;
}

static struct mshv_mem_region *
mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
{
	struct mshv_mem_region *region;

	spin_lock(&p->pt_mem_regions_lock);
	region = mshv_partition_region_by_gfn(p, gfn);
	if (!region || !mshv_region_get(region)) {
		spin_unlock(&p->pt_mem_regions_lock);
		return NULL;
	}
	spin_unlock(&p->pt_mem_regions_lock);

	return region;
}

/**
 * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts.
 * @vp: Pointer to the virtual processor structure.
 *
 * This function processes GPA intercepts by identifying the memory region
 * corresponding to the intercepted GPA, aligning the page offset, and
 * mapping the required pages. It ensures that the region is valid and
 * handles faults efficiently by mapping multiple pages at once.
 *
 * Return: true if the intercept was handled successfully, false otherwise.
 */
static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
{
	struct mshv_partition *p = vp->vp_partition;
	struct mshv_mem_region *region;
	bool ret = false;
	u64 gfn;
#if defined(CONFIG_X86_64)
	struct hv_x64_memory_intercept_message *msg =
		(struct hv_x64_memory_intercept_message *)
			vp->vp_intercept_msg_page->u.payload;
#elif defined(CONFIG_ARM64)
	struct hv_arm64_memory_intercept_message *msg =
		(struct hv_arm64_memory_intercept_message *)
			vp->vp_intercept_msg_page->u.payload;
#endif
	enum hv_intercept_access_type access_type =
		msg->header.intercept_access_type;

	gfn = HVPFN_DOWN(msg->guest_physical_address);

	region = mshv_partition_region_by_gfn_get(p, gfn);
	if (!region)
		goto out;

	if (access_type == HV_INTERCEPT_ACCESS_WRITE &&
	    !(region->hv_map_flags & HV_MAP_GPA_WRITABLE))
		goto put_region;

	if (access_type == HV_INTERCEPT_ACCESS_EXECUTE &&
	    !(region->hv_map_flags & HV_MAP_GPA_EXECUTABLE))
		goto put_region;

	/* Only movable memory ranges are supported for GPA intercepts */
	if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE)
		ret = mshv_region_handle_gfn_fault(region, gfn);

put_region:
	mshv_region_put(region);
out:
	trace_mshv_handle_gpa_intercept(p->pt_id, vp->vp_index, gfn,
					access_type, ret);
	return ret;
}

static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
{
	switch (vp->vp_intercept_msg_page->header.message_type) {
	case HVMSG_GPA_INTERCEPT:
		return mshv_handle_gpa_intercept(vp);
	}
	return false;
}

static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
{
	long rc;

	trace_mshv_run_vp_entry(vp->vp_partition->pt_id, vp->vp_index);

	do {
		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
			rc = mshv_run_vp_with_root_scheduler(vp);
		else
			rc = mshv_run_vp_with_hyp_scheduler(vp);
	} while (rc == 0 && mshv_vp_handle_intercept(vp));

	trace_mshv_run_vp_exit(vp->vp_partition->pt_id, vp->vp_index,
			       vp->vp_intercept_msg_page->header.message_type,
			       rc);

	if (rc)
		return rc;

	if (copy_to_user(ret_msg, vp->vp_intercept_msg_page,
			 sizeof(struct hv_message)))
		rc = -EFAULT;

	return rc;
}

static int
mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp,
				struct hv_vp_state_data state_data,
				unsigned long user_pfn, size_t page_count,
				bool is_set)
{
	int completed, ret = 0;
	unsigned long check;
	struct page **pages;

	if (page_count > INT_MAX)
		return -EINVAL;
	/*
	 * Check the arithmetic for wraparound/overflow.
	 * The last page address in the buffer is:
	 * (user_pfn + (page_count - 1)) * PAGE_SIZE
	 */
	if (check_add_overflow(user_pfn, (page_count - 1), &check))
		return -EOVERFLOW;
	if (check_mul_overflow(check, PAGE_SIZE, &check))
		return -EOVERFLOW;

	/* Pin user pages so hypervisor can copy directly to them */
	pages = kzalloc_objs(struct page *, page_count);
	if (!pages)
		return -ENOMEM;

	for (completed = 0; completed < page_count; completed += ret) {
		unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE;
		int remaining = page_count - completed;

		ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE,
					  &pages[completed]);
		if (ret < 0) {
			vp_err(vp, "%s: Failed to pin user pages error %i\n",
			       __func__, ret);
			goto unpin_pages;
		}
	}

	if (is_set)
		ret = hv_call_set_vp_state(vp->vp_index,
					   vp->vp_partition->pt_id,
					   state_data, page_count, pages,
					   0, NULL);
	else
		ret = hv_call_get_vp_state(vp->vp_index,
					   vp->vp_partition->pt_id,
					   state_data, page_count, pages,
					   NULL);

unpin_pages:
	unpin_user_pages(pages, completed);
	kfree(pages);
	return ret;
}

static long
mshv_vp_ioctl_get_set_state(struct mshv_vp *vp,
			    struct mshv_get_set_vp_state __user *user_args,
			    bool is_set)
{
	struct mshv_get_set_vp_state args;
	long ret = 0;
	union hv_output_get_vp_state vp_state;
	u32 data_sz;
	struct hv_vp_state_data state_data = {};

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) ||
	    !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) ||
	    !PAGE_ALIGNED(args.buf_ptr))
		return -EINVAL;

	if (!access_ok((void __user *)args.buf_ptr, args.buf_sz))
		return -EFAULT;

	switch (args.type) {
	case MSHV_VP_STATE_LAPIC:
		state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_XSAVE:
	{
		u64 data_sz_64;

		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_XSAVE_STATES,
						     &state_data.xsave.states.as_uint64);
		if (ret)
			return ret;

		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE,
						     &data_sz_64);
		if (ret)
			return ret;

		data_sz = (u32)data_sz_64;
		state_data.xsave.flags = 0;
		/* Always request legacy states */
		state_data.xsave.states.legacy_x87 = 1;
		state_data.xsave.states.legacy_sse = 1;
		state_data.type = HV_GET_SET_VP_STATE_XSAVE;
		break;
	}
	case MSHV_VP_STATE_SIMP:
		state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SIEFP:
		state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SYNTHETIC_TIMERS:
		state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS;
		data_sz = sizeof(vp_state.synthetic_timers_state);
		break;
	default:
		return -EINVAL;
	}

	if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz)))
		return -EFAULT;

	if (data_sz > args.buf_sz)
		return -EINVAL;

	/* If the data is transmitted via pfns, delegate to helper */
	if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) {
		unsigned long user_pfn = PFN_DOWN(args.buf_ptr);
		size_t page_count = PFN_DOWN(args.buf_sz);

		return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn,
						       page_count, is_set);
	}
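
	/*
	 * For the remaining state types the data is small enough to be
	 * passed inline via the local vp_state buffer rather than through
	 * pinned user pages.
	 */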

	/* Paranoia check - this shouldn't happen! */
	if (data_sz > sizeof(vp_state)) {
		vp_err(vp, "Invalid vp state data size!\n");
		return -EINVAL;
	}

	if (is_set) {
		if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz))
			return -EFAULT;

		return hv_call_set_vp_state(vp->vp_index,
					    vp->vp_partition->pt_id,
					    state_data, 0, NULL,
					    sizeof(vp_state), (u8 *)&vp_state);
	}

	ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id,
				   state_data, 0, NULL, &vp_state);
	if (ret)
		return ret;

	if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz))
		return -EFAULT;

	return 0;
}

static long
mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct mshv_vp *vp = filp->private_data;
	long r = -ENOTTY;

	if (mutex_lock_killable(&vp->vp_mutex))
		return -EINTR;

	switch (ioctl) {
	case MSHV_RUN_VP:
		r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg);
		break;
	case MSHV_GET_VP_STATE:
		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false);
		break;
	case MSHV_SET_VP_STATE:
		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true);
		break;
	case MSHV_ROOT_HVCALL:
		r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false,
					       (void __user *)arg);
		break;
	default:
		vp_warn(vp, "Invalid ioctl: %#x\n", ioctl);
		break;
	}
	mutex_unlock(&vp->vp_mutex);

	return r;
}

static vm_fault_t mshv_vp_fault(struct vm_fault *vmf)
{
	struct mshv_vp *vp = vmf->vma->vm_file->private_data;

	switch (vmf->vma->vm_pgoff) {
	case MSHV_VP_MMAP_OFFSET_REGISTERS:
		vmf->page = virt_to_page(vp->vp_register_page);
		break;
	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
		vmf->page = virt_to_page(vp->vp_intercept_msg_page);
		break;
	case MSHV_VP_MMAP_OFFSET_GHCB:
		vmf->page = virt_to_page(vp->vp_ghcb_page);
		break;
	default:
		return VM_FAULT_SIGBUS;
	}

	get_page(vmf->page);

	return 0;
}

static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct mshv_vp *vp = file->private_data;

	switch (vma->vm_pgoff) {
	case MSHV_VP_MMAP_OFFSET_REGISTERS:
		if (!vp->vp_register_page)
			return -ENODEV;
		break;
	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
		if (!vp->vp_intercept_msg_page)
			return -ENODEV;
		break;
	case MSHV_VP_MMAP_OFFSET_GHCB:
		if (!vp->vp_ghcb_page)
			return -ENODEV;
		break;
	default:
		return -EINVAL;
	}

	vma->vm_ops = &mshv_vp_vm_ops;
	return 0;
}

static int
mshv_vp_release(struct inode *inode, struct file *filp)
{
	struct mshv_vp *vp = filp->private_data;

	trace_mshv_vp_release(vp->vp_partition->pt_id, vp->vp_index);

	/* Rest of VP cleanup happens in destroy_partition() */
	mshv_partition_put(vp->vp_partition);
	return 0;
}

void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
			 struct hv_stats_page *stats_pages[])
{
	union hv_stats_object_identity identity = {
		.vp.partition_id = partition_id,
		.vp.vp_index = vp_index,
	};
	int err;

	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
				  stats_pages[HV_STATS_AREA_SELF],
				  &identity);
	if (err)
		pr_err("%s: failed to unmap partition %llu vp %u self stats, err: %d\n",
		       __func__, partition_id, vp_index, err);
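
	/*
	 * The parent entry may alias the self page (e.g. for L1VH, see
	 * mshv_vp_stats_map()), so only unmap the parent area when it is a
	 * distinct mapping.
	 */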

	if (stats_pages[HV_STATS_AREA_PARENT] != stats_pages[HV_STATS_AREA_SELF]) {
		identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
		err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
					  stats_pages[HV_STATS_AREA_PARENT],
					  &identity);
		if (err)
			pr_err("%s: failed to unmap partition %llu vp %u parent stats, err: %d\n",
			       __func__, partition_id, vp_index, err);
	}
}

int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
		      struct hv_stats_page *stats_pages[])
{
	union hv_stats_object_identity identity = {
		.vp.partition_id = partition_id,
		.vp.vp_index = vp_index,
	};
	int err;

	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
				&stats_pages[HV_STATS_AREA_SELF]);
	if (err) {
		pr_err("%s: failed to map partition %llu vp %u self stats, err: %d\n",
		       __func__, partition_id, vp_index, err);
		return err;
	}

	/*
	 * L1VH partition cannot access its vp stats in parent area.
	 */
	if (is_l1vh_parent(partition_id)) {
		stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
	} else {
		identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
		err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
					&stats_pages[HV_STATS_AREA_PARENT]);
		if (err) {
			pr_err("%s: failed to map partition %llu vp %u parent stats, err: %d\n",
			       __func__, partition_id, vp_index, err);
			goto unmap_self;
		}
		if (!stats_pages[HV_STATS_AREA_PARENT])
			stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
	}

	return 0;

unmap_self:
	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	hv_unmap_stats_page(HV_STATS_OBJECT_VP,
			    stats_pages[HV_STATS_AREA_SELF],
			    &identity);
	return err;
}

static long
mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
			       void __user *arg)
{
	struct mshv_create_vp args;
	struct mshv_vp *vp;
	struct page *intercept_msg_page, *register_page, *ghcb_page;
	struct hv_stats_page *stats_pages[2];
	long ret;

	if (copy_from_user(&args, arg, sizeof(args)))
		return -EFAULT;

	if (args.vp_index >= MSHV_MAX_VPS)
		return -EINVAL;

	if (partition->pt_vp_array[args.vp_index])
		return -EEXIST;

	ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index,
				0 /* Only valid for root partition VPs */);
	if (ret)
		return ret;

	ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
				   HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
				   input_vtl_zero, &intercept_msg_page);
	if (ret)
		goto destroy_vp;

	if (!mshv_partition_encrypted(partition)) {
		ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
					   HV_VP_STATE_PAGE_REGISTERS,
					   input_vtl_zero, &register_page);
		if (ret)
			goto unmap_intercept_message_page;
	}

	if (mshv_partition_encrypted(partition) &&
	    is_ghcb_mapping_available()) {
		ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
					   HV_VP_STATE_PAGE_GHCB,
					   input_vtl_normal, &ghcb_page);
		if (ret)
			goto unmap_register_page;
	}

	ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
				stats_pages);
	if (ret)
		goto unmap_ghcb_page;

	vp = kzalloc_obj(*vp);
	if (!vp)
		goto unmap_stats_pages;
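
	/*
	 * The VP holds a reference on its partition for its whole lifetime;
	 * it is dropped in mshv_vp_release() or in the error path below.
	 */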

	vp->vp_partition = mshv_partition_get(partition);
	if (!vp->vp_partition) {
		ret = -EBADF;
		goto free_vp;
	}

	mutex_init(&vp->vp_mutex);
	init_waitqueue_head(&vp->run.vp_suspend_queue);
	atomic64_set(&vp->run.vp_signaled_count, 0);

	vp->vp_index = args.vp_index;
	vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page);
	if (!mshv_partition_encrypted(partition))
		vp->vp_register_page = page_to_virt(register_page);

	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
		vp->vp_ghcb_page = page_to_virt(ghcb_page);

	memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));

	ret = mshv_debugfs_vp_create(vp);
	if (ret)
		goto put_partition;

	/*
	 * Keep anon_inode_getfd last: it installs fd in the file struct and
	 * thus makes the state accessible in user space.
	 */
	ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp,
			       O_RDWR | O_CLOEXEC);
	if (ret < 0)
		goto remove_debugfs_vp;

	/* already exclusive with the partition mutex for all ioctls */
	partition->pt_vp_count++;
	partition->pt_vp_array[args.vp_index] = vp;

	goto out;

remove_debugfs_vp:
	mshv_debugfs_vp_remove(vp);
put_partition:
	mshv_partition_put(partition);
free_vp:
	kfree(vp);
unmap_stats_pages:
	mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages);
unmap_ghcb_page:
	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
		hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
				       HV_VP_STATE_PAGE_GHCB, ghcb_page,
				       input_vtl_normal);
unmap_register_page:
	if (!mshv_partition_encrypted(partition))
		hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
				       HV_VP_STATE_PAGE_REGISTERS,
				       register_page, input_vtl_zero);
unmap_intercept_message_page:
	hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
			       HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
			       intercept_msg_page, input_vtl_zero);
destroy_vp:
	hv_call_delete_vp(partition->pt_id, args.vp_index);
out:
	trace_mshv_create_vp(partition->pt_id, args.vp_index, ret);
	return ret;
}

static int mshv_init_async_handler(struct mshv_partition *partition)
{
	if (completion_done(&partition->async_hypercall)) {
		pt_err(partition,
		       "Cannot issue async hypercall while another one in progress!\n");
		return -EPERM;
	}

	reinit_completion(&partition->async_hypercall);
	return 0;
}

static void mshv_async_hvcall_handler(void *data, u64 *status)
{
	struct mshv_partition *partition = data;

	wait_for_completion(&partition->async_hypercall);
	pt_dbg(partition, "Async hypercall completed!\n");

	*status = partition->async_hypercall_status;
}

/*
 * NB: caller checks and makes sure mem->size is page aligned
 * Returns: 0 with regionpp updated on success, or -errno
 */
static int mshv_partition_create_region(struct mshv_partition *partition,
					struct mshv_user_mem_region *mem,
					struct mshv_mem_region **regionpp,
					bool is_mmio)
{
	struct mshv_mem_region *rg;
	u64 nr_pages = HVPFN_DOWN(mem->size);

	/* Reject overlapping regions */
	spin_lock(&partition->pt_mem_regions_lock);
	hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) {
		if (mem->guest_pfn + nr_pages <= rg->start_gfn ||
		    rg->start_gfn + rg->nr_pages <= mem->guest_pfn)
			continue;
		spin_unlock(&partition->pt_mem_regions_lock);
		return -EEXIST;
	}
	spin_unlock(&partition->pt_mem_regions_lock);

	rg = mshv_region_create(mem->guest_pfn, nr_pages,
				mem->userspace_addr, mem->flags);
	if (IS_ERR(rg))
		return PTR_ERR(rg);

	if (is_mmio)
		rg->mreg_type = MSHV_REGION_TYPE_MMIO;
	else if (mshv_partition_encrypted(partition) ||
		 !mshv_region_movable_init(rg))
		rg->mreg_type = MSHV_REGION_TYPE_MEM_PINNED;
	else
		rg->mreg_type = MSHV_REGION_TYPE_MEM_MOVABLE;

	rg->partition = partition;

	*regionpp = rg;

	return 0;
}

/**
 * mshv_prepare_pinned_region - Pin and map memory regions
 * @region: Pointer to the memory region structure
 *
 * This function processes memory regions that are explicitly marked as pinned.
 * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based
 * population. The function ensures the region is properly populated, handles
 * encryption requirements for SNP partitions if applicable, maps the region,
 * and performs necessary sharing or eviction operations based on the mapping
 * result.
 *
 * Return: 0 on success, negative error code on failure.
 */
static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
{
	struct mshv_partition *partition = region->partition;
	int ret;

	ret = mshv_region_pin(region);
	if (ret) {
		pt_err(partition, "Failed to pin memory region: %d\n",
		       ret);
		goto err_out;
	}

	/*
	 * For an SNP partition it is a requirement that for every memory region
	 * that we are going to map for this partition we should make sure that
	 * host access to that region is released. This is ensured by doing an
	 * additional hypercall which will update the SLAT to release host
	 * access to guest memory regions.
	 */
	if (mshv_partition_encrypted(partition)) {
		ret = mshv_region_unshare(region);
		if (ret) {
			pt_err(partition,
			       "Failed to unshare memory region (guest_pfn: %llu): %d\n",
			       region->start_gfn, ret);
			goto invalidate_region;
		}
	}

	ret = mshv_region_map(region);
	if (ret && mshv_partition_encrypted(partition)) {
		int shrc;

		shrc = mshv_region_share(region);
		if (!shrc)
			goto invalidate_region;

		pt_err(partition,
		       "Failed to share memory region (guest_pfn: %llu): %d\n",
		       region->start_gfn, shrc);
		/*
		 * Don't unpin if marking shared failed because pages are no
		 * longer mapped in the host, ie root, anymore.
		 */
		goto err_out;
	}

	return 0;

invalidate_region:
	mshv_region_invalidate(region);
err_out:
	return ret;
}

/*
 * This maps two things: guest RAM and, for PCI passthrough, MMIO space.
 *
 * mmio:
 * - vfio overloads vm_pgoff to store the mmio start pfn/spa.
 * - Two things need to happen for mapping mmio range:
 *   1. mapped in the uaddr so VMM can access it.
 *   2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it.
 *
 * This function takes care of the second. The first one is managed by vfio,
 * and hence is taken care of via vfio_pci_mmap_fault().
 */
static long
mshv_map_user_memory(struct mshv_partition *partition,
		     struct mshv_user_mem_region *mem)
{
	struct mshv_mem_region *region;
	struct vm_area_struct *vma;
	bool is_mmio;
	ulong mmio_pfn;
	long ret;

	if (mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
	    !access_ok((const void __user *)mem->userspace_addr, mem->size))
		return -EINVAL;

	mmap_read_lock(current->mm);
	vma = vma_lookup(current->mm, mem->userspace_addr);
	is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
	mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
	mmap_read_unlock(current->mm);

	if (!vma)
		return -EINVAL;

	ret = mshv_partition_create_region(partition, mem, &region,
					   is_mmio);
	if (ret)
		return ret;

	switch (region->mreg_type) {
	case MSHV_REGION_TYPE_MEM_PINNED:
		ret = mshv_prepare_pinned_region(region);
		break;
	case MSHV_REGION_TYPE_MEM_MOVABLE:
		/*
		 * For movable memory regions, remap with no access to let
		 * the hypervisor track dirty pages, enabling pre-copy live
		 * migration.
		 */
		ret = hv_call_map_gpa_pages(partition->pt_id,
					    region->start_gfn,
					    region->nr_pages,
					    HV_MAP_GPA_NO_ACCESS, NULL);
		break;
	case MSHV_REGION_TYPE_MMIO:
		ret = hv_call_map_mmio_pages(partition->pt_id,
					     region->start_gfn,
					     mmio_pfn,
					     region->nr_pages);
		break;
	}

	trace_mshv_map_user_memory(partition->pt_id, region->start_uaddr,
				   region->start_gfn, region->nr_pages,
				   region->hv_map_flags, ret);

	if (ret)
		goto errout;

	spin_lock(&partition->pt_mem_regions_lock);
	hlist_add_head(&region->hnode, &partition->pt_mem_regions);
	spin_unlock(&partition->pt_mem_regions_lock);

	return 0;

errout:
	mshv_region_put(region);
	return ret;
}

/* Called for unmapping both the guest ram and the mmio space */
static long
mshv_unmap_user_memory(struct mshv_partition *partition,
		       struct mshv_user_mem_region *mem)
{
	struct mshv_mem_region *region;

	if (!(mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
		return -EINVAL;

	spin_lock(&partition->pt_mem_regions_lock);

	region = mshv_partition_region_by_gfn(partition, mem->guest_pfn);
	if (!region) {
		spin_unlock(&partition->pt_mem_regions_lock);
		return -ENOENT;
	}

	/* Paranoia check */
	if (region->start_uaddr != mem->userspace_addr ||
	    region->start_gfn != mem->guest_pfn ||
	    region->nr_pages != HVPFN_DOWN(mem->size)) {
		spin_unlock(&partition->pt_mem_regions_lock);
		return -EINVAL;
	}

	hlist_del(&region->hnode);

	spin_unlock(&partition->pt_mem_regions_lock);

	mshv_region_put(region);

	return 0;
}

static long
mshv_partition_ioctl_set_memory(struct mshv_partition *partition,
				struct mshv_user_mem_region __user *user_mem)
{
	struct mshv_user_mem_region mem;

	if (copy_from_user(&mem, user_mem, sizeof(mem)))
		return -EFAULT;

	if (!mem.size ||
	    !PAGE_ALIGNED(mem.size) ||
	    !PAGE_ALIGNED(mem.userspace_addr) ||
	    (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) ||
	    mshv_field_nonzero(mem, rsvd))
		return -EINVAL;

	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))
		return mshv_unmap_user_memory(partition, &mem);

	return mshv_map_user_memory(partition, &mem);
}

static long
mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition,
			       void __user *user_args)
{
	struct mshv_user_ioeventfd args;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	return mshv_set_unset_ioeventfd(partition, &args);
}

static long
mshv_partition_ioctl_irqfd(struct mshv_partition *partition,
			   void __user *user_args)
{
	struct mshv_user_irqfd args;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	return mshv_set_unset_irqfd(partition, &args);
}

static long
mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition,
					     void __user *user_args)
{
	struct mshv_gpap_access_bitmap args;
	union hv_gpa_page_access_state *states;
	long ret, i;
	union hv_gpa_page_access_state_flags hv_flags = {};
	u8 hv_type_mask;
	ulong bitmap_buf_sz, states_buf_sz;
	int written = 0;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT ||
	    args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT ||
	    mshv_field_nonzero(args, rsvd) || !args.page_count ||
	    !args.bitmap_ptr)
		return -EINVAL;

	if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz))
		return -E2BIG;

	/* Num bytes needed to store bitmap; one bit per page rounded up */
	bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8);

	/* Sanity check */
	if (bitmap_buf_sz > states_buf_sz)
		return -EBADFD;

	switch (args.access_type) {
	case MSHV_GPAP_ACCESS_TYPE_ACCESSED:
		hv_type_mask = 1;
		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
			hv_flags.clear_accessed = 1;
			/* not accessed implies not dirty */
			hv_flags.clear_dirty = 1;
		} else { /* MSHV_GPAP_ACCESS_OP_SET */
			hv_flags.set_accessed = 1;
		}
		break;
	case MSHV_GPAP_ACCESS_TYPE_DIRTY:
		hv_type_mask = 2;
		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
			hv_flags.clear_dirty = 1;
		} else { /* MSHV_GPAP_ACCESS_OP_SET */
			hv_flags.set_dirty = 1;
			/* dirty implies accessed */
			hv_flags.set_accessed = 1;
		}
		break;
	}

	states = vzalloc(states_buf_sz);
	if (!states)
		return -ENOMEM;

	ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count,
					    args.gpap_base, hv_flags, &written,
					    states);
	if (ret)
		goto free_return;

	/*
	 * Overwrite states buffer with bitmap - the bits in hv_type_mask
	 * correspond to bitfields in hv_gpa_page_access_state
	 */
	for (i = 0; i < written; ++i)
		__assign_bit(i, (ulong *)states,
			     states[i].as_uint8 & hv_type_mask);

	/* zero the unused bits in the last byte(s) of the returned bitmap */
	for (i = written; i < bitmap_buf_sz * 8; ++i)
		__clear_bit(i, (ulong *)states);

	if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz))
		ret = -EFAULT;

free_return:
	vfree(states);
	return ret;
}

static long
mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition,
				     void __user *user_args)
{
	struct mshv_user_irq_entry *entries = NULL;
	struct mshv_user_irq_table args;
	long ret;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.nr > MSHV_MAX_GUEST_IRQS ||
	    mshv_field_nonzero(args, rsvd))
		return -EINVAL;

	if (args.nr) {
		struct mshv_user_irq_table __user *urouting = user_args;

		entries = vmemdup_user(urouting->entries,
				       array_size(sizeof(*entries),
						  args.nr));
		if (IS_ERR(entries))
			return PTR_ERR(entries);
	}
	ret = mshv_update_routing_table(partition, entries, args.nr);
	kvfree(entries);

	return ret;
}

static long
mshv_partition_ioctl_initialize(struct mshv_partition *partition)
{
	long ret;

	if (partition->pt_initialized)
		return 0;

	ret = hv_call_initialize_partition(partition->pt_id);
	if (ret)
		goto withdraw_mem;

	ret = mshv_debugfs_partition_create(partition);
	if (ret)
		goto finalize_partition;

	partition->pt_initialized = true;

	return 0;

finalize_partition:
	hv_call_finalize_partition(partition->pt_id);
withdraw_mem:
	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);

	return ret;
}

static long
mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct mshv_partition *partition = filp->private_data;
	long ret;
	void __user *uarg = (void __user *)arg;

	if (mutex_lock_killable(&partition->pt_mutex))
		return -EINTR;

	switch (ioctl) {
	case MSHV_INITIALIZE_PARTITION:
		ret = mshv_partition_ioctl_initialize(partition);
		break;
	case MSHV_SET_GUEST_MEMORY:
		ret = mshv_partition_ioctl_set_memory(partition, uarg);
		break;
	case MSHV_CREATE_VP:
		ret = mshv_partition_ioctl_create_vp(partition, uarg);
		break;
	case MSHV_IRQFD:
		ret = mshv_partition_ioctl_irqfd(partition, uarg);
		break;
	case MSHV_IOEVENTFD:
		ret = mshv_partition_ioctl_ioeventfd(partition, uarg);
		break;
	case MSHV_SET_MSI_ROUTING:
		ret = mshv_partition_ioctl_set_msi_routing(partition, uarg);
		break;
	case MSHV_GET_GPAP_ACCESS_BITMAP:
		ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition,
								  uarg);
		break;
	case MSHV_ROOT_HVCALL:
		ret = mshv_ioctl_passthru_hvcall(partition, true, uarg);
		break;
	default:
		ret = -ENOTTY;
	}

	mutex_unlock(&partition->pt_mutex);
	return ret;
}

static int
disable_vp_dispatch(struct mshv_vp *vp)
{
	int ret;
	struct hv_register_assoc dispatch_suspend = {
		.name = HV_REGISTER_DISPATCH_SUSPEND,
		.value.dispatch_suspend.suspended = 1,
	};

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &dispatch_suspend);
	if (ret)
		vp_err(vp, "failed to suspend\n");

	trace_mshv_disable_vp_dispatch(vp->vp_partition->pt_id,
				       vp->vp_index, ret);

	return ret;
}

static int
get_vp_signaled_count(struct mshv_vp *vp, u64 *count)
{
	int ret;
	struct hv_register_assoc root_signal_count = {
		.name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT,
	};

	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &root_signal_count);

	if (ret) {
		vp_err(vp, "Failed to get root signal count");
		*count = 0;
		return ret;
	}

	*count = root_signal_count.value.reg64;

	return ret;
}

static void
drain_vp_signals(struct mshv_vp *vp)
{
	u64 hv_signal_count;
	u64 vp_signal_count;

	get_vp_signaled_count(vp, &hv_signal_count);

	vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);

	/*
	 * There should be at most 1 outstanding notification, but be extra
	 * careful anyway.
	 */
	while (hv_signal_count != vp_signal_count) {
		WARN_ON(hv_signal_count - vp_signal_count != 1);

		if (wait_event_interruptible(vp->run.vp_suspend_queue,
					     vp->run.kicked_by_hv == 1))
			break;
		vp->run.kicked_by_hv = 0;
		vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
	}

	trace_mshv_drain_vp_signals(vp->vp_partition->pt_id, vp->vp_index);
}

static void drain_all_vps(const struct mshv_partition *partition)
{
	int i;
	struct mshv_vp *vp;

	/*
	 * VPs are reachable from ISR. It is safe to not take the partition
	 * lock because nobody else can enter this function and drop the
	 * partition from the list.
	 */
	for (i = 0; i < MSHV_MAX_VPS; i++) {
		vp = partition->pt_vp_array[i];
		if (!vp)
			continue;
		/*
		 * Disable dispatching of the VP in the hypervisor. After this
		 * the hypervisor guarantees it won't generate any signals for
		 * the VP and the hypervisor's VP signal count won't change.
		 */
		disable_vp_dispatch(vp);
		drain_vp_signals(vp);
	}
}

static void
remove_partition(struct mshv_partition *partition)
{
	spin_lock(&mshv_root.pt_ht_lock);
	hlist_del_rcu(&partition->pt_hnode);
	spin_unlock(&mshv_root.pt_ht_lock);

	synchronize_rcu();
}

/*
 * Tear down a partition and remove it from the list.
 * Partition's refcount must be 0
 */
static void destroy_partition(struct mshv_partition *partition)
{
	struct mshv_vp *vp;
	struct mshv_mem_region *region;
	struct hlist_node *n;
	int i;

	if (refcount_read(&partition->pt_ref_count)) {
		pt_err(partition,
		       "Attempt to destroy partition but refcount > 0\n");
		return;
	}

	trace_mshv_destroy_partition(partition->pt_id);

	if (partition->pt_initialized) {
		/*
		 * We only need to drain signals for root scheduler. This should be
		 * done before removing the partition from the partition list.
		 */
		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
			drain_all_vps(partition);

		/* Remove vps */
		for (i = 0; i < MSHV_MAX_VPS; ++i) {
			vp = partition->pt_vp_array[i];
			if (!vp)
				continue;

			mshv_debugfs_vp_remove(vp);
			mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
					    vp->vp_stats_pages);

			if (vp->vp_register_page) {
				(void)hv_unmap_vp_state_page(partition->pt_id,
							     vp->vp_index,
							     HV_VP_STATE_PAGE_REGISTERS,
							     virt_to_page(vp->vp_register_page),
							     input_vtl_zero);
				vp->vp_register_page = NULL;
			}

			(void)hv_unmap_vp_state_page(partition->pt_id,
						     vp->vp_index,
						     HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
						     virt_to_page(vp->vp_intercept_msg_page),
						     input_vtl_zero);
			vp->vp_intercept_msg_page = NULL;

			if (vp->vp_ghcb_page) {
				(void)hv_unmap_vp_state_page(partition->pt_id,
							     vp->vp_index,
							     HV_VP_STATE_PAGE_GHCB,
							     virt_to_page(vp->vp_ghcb_page),
							     input_vtl_normal);
				vp->vp_ghcb_page = NULL;
			}

			kfree(vp);

			partition->pt_vp_array[i] = NULL;
		}

		mshv_debugfs_partition_remove(partition);

		/* Deallocates and unmaps everything including vcpus, GPA mappings etc */
		hv_call_finalize_partition(partition->pt_id);

		partition->pt_initialized = false;
	}

	remove_partition(partition);

	hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
				  hnode) {
		hlist_del(&region->hnode);
		mshv_region_put(region);
	}

	/* Withdraw and free all pages we deposited */
	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
	hv_call_delete_partition(partition->pt_id);

	mshv_free_routing_table(partition);
	kfree(partition);
}

struct
mshv_partition *mshv_partition_get(struct mshv_partition *partition)
{
	if (refcount_inc_not_zero(&partition->pt_ref_count))
		return partition;
	return NULL;
}

struct
mshv_partition *mshv_partition_find(u64 partition_id)
	__must_hold(RCU)
{
	struct mshv_partition *p;

	hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode,
				   partition_id)
		if (p->pt_id == partition_id)
			return p;

	return NULL;
}

void
mshv_partition_put(struct mshv_partition *partition)
{
	if (refcount_dec_and_test(&partition->pt_ref_count))
		destroy_partition(partition);
}

static int
mshv_partition_release(struct inode *inode, struct file *filp)
{
	struct mshv_partition *partition = filp->private_data;

	trace_mshv_partition_release(partition->pt_id);

	mshv_eventfd_release(partition);

	cleanup_srcu_struct(&partition->pt_irq_srcu);

	mshv_partition_put(partition);

	return 0;
}

static int
add_partition(struct mshv_partition *partition)
{
	spin_lock(&mshv_root.pt_ht_lock);

	hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode,
		     partition->pt_id);

	spin_unlock(&mshv_root.pt_ht_lock);

	return 0;
}

static_assert(MSHV_NUM_CPU_FEATURES_BANKS ==
	      HV_PARTITION_PROCESSOR_FEATURES_BANKS);

static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags,
					struct hv_partition_creation_properties *cr_props,
					union hv_partition_isolation_properties *isol_props)
{
	int i;
	struct mshv_create_partition_v2 args;
hv_partition_processor_features *disabled_procs; 1934 union hv_partition_processor_xsave_features *disabled_xsave; 1935 1936 /* First, copy v1 struct in case user is on previous versions */ 1937 if (copy_from_user(&args, user_arg, 1938 sizeof(struct mshv_create_partition))) 1939 return -EFAULT; 1940 1941 if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || 1942 args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) 1943 return -EINVAL; 1944 1945 disabled_procs = &cr_props->disabled_processor_features; 1946 disabled_xsave = &cr_props->disabled_processor_xsave_features; 1947 1948 /* Check if user provided newer struct with feature fields */ 1949 if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) { 1950 if (copy_from_user(&args, user_arg, sizeof(args))) 1951 return -EFAULT; 1952 1953 /* Re-validate v1 fields after second copy_from_user() */ 1954 if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || 1955 args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) 1956 return -EINVAL; 1957 1958 if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS || 1959 mshv_field_nonzero(args, pt_rsvd) || 1960 mshv_field_nonzero(args, pt_rsvd1)) 1961 return -EINVAL; 1962 1963 /* 1964 * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never 1965 * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS 1966 * (i.e. 2). 1967 * 1968 * Further banks (index >= 2) will be modifiable as 'early' 1969 * properties via the set partition property hypercall. 1970 */ 1971 for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) 1972 disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i]; 1973 1974 #if IS_ENABLED(CONFIG_X86_64) 1975 disabled_xsave->as_uint64 = args.pt_disabled_xsave; 1976 #else 1977 /* 1978 * In practice this field is ignored on arm64, but safer to 1979 * zero it in case it is ever used. 1980 */ 1981 disabled_xsave->as_uint64 = 0; 1982 1983 if (mshv_field_nonzero(args, pt_rsvd2)) 1984 return -EINVAL; 1985 #endif 1986 } else { 1987 /* 1988 * v1 behavior: try to enable everything. The hypervisor will 1989 * disable features that are not supported. The banks can be 1990 * queried via the get partition property hypercall. 
         */
        for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
            disabled_procs->as_uint64[i] = 0;

        disabled_xsave->as_uint64 = 0;
    }

    /* Only support EXO partitions */
    *pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
                HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;

    if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC))
        *pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
    if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC))
        *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
    if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES))
        *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
    if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_NESTED_VIRTUALIZATION))
        *pt_flags |= HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE;
    if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_SMT_ENABLED_GUEST))
        *pt_flags |= HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST;

    isol_props->as_uint64 = 0;

    switch (args.pt_isolation) {
    case MSHV_PT_ISOLATION_NONE:
        isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE;
        break;
    }

    return 0;
}

static long
mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
{
    u64 creation_flags;
    struct hv_partition_creation_properties creation_properties;
    union hv_partition_isolation_properties isolation_properties;
    struct mshv_partition *partition;
    u64 pt_id = -1;
    long ret;

    ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags,
                                      &creation_properties,
                                      &isolation_properties);
    if (ret)
        return ret;

    partition = kzalloc(sizeof(*partition), GFP_KERNEL);
    if (!partition)
        return -ENOMEM;

    partition->pt_module_dev = module_dev;
    partition->isolation_type = isolation_properties.isolation_type;

    refcount_set(&partition->pt_ref_count, 1);

    mutex_init(&partition->pt_mutex);

    mutex_init(&partition->pt_irq_lock);

    init_completion(&partition->async_hypercall);

    INIT_HLIST_HEAD(&partition->irq_ack_notifier_list);

    INIT_HLIST_HEAD(&partition->pt_devices);

    spin_lock_init(&partition->pt_mem_regions_lock);
    INIT_HLIST_HEAD(&partition->pt_mem_regions);

    mshv_eventfd_init(partition);

    ret = init_srcu_struct(&partition->pt_irq_srcu);
    if (ret)
        goto free_partition;

    ret = hv_call_create_partition(creation_flags,
                                   creation_properties,
                                   isolation_properties,
                                   &pt_id);
    if (ret)
        goto cleanup_irq_srcu;

    partition->pt_id = pt_id;

    ret = add_partition(partition);
    if (ret)
        goto delete_partition;

    ret = mshv_init_async_handler(partition);
    if (ret)
        goto remove_partition;

    ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition",
                                               &mshv_partition_fops,
                                               partition, O_RDWR));
    if (ret < 0)
        goto remove_partition;

    goto out;

remove_partition:
    remove_partition(partition);
delete_partition:
    hv_call_delete_partition(partition->pt_id);
cleanup_irq_srcu:
    cleanup_srcu_struct(&partition->pt_irq_srcu);
free_partition:
    kfree(partition);
out:
    trace_mshv_create_partition(pt_id, ret);
    return ret;
}

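/*
 * Top-level ioctls on the /dev/mshv character device. A VMM typically opens
 * the device and issues MSHV_CREATE_PARTITION to obtain a partition fd,
 * roughly (illustrative userspace sketch only; see the mshv uapi header for
 * the exact ABI):
 *
 *	int mshv_fd = open("/dev/mshv", O_RDWR | O_CLOEXEC);
 *	struct mshv_create_partition args = { .pt_flags = 0 /* ... */ };
 *	int pt_fd = ioctl(mshv_fd, MSHV_CREATE_PARTITION, &args);
 *
 * All further partition and VP operations are then performed on the
 * returned file descriptors.
 */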
static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
                           unsigned long arg)
{
    struct miscdevice *misc = filp->private_data;

    switch (ioctl) {
    case MSHV_CREATE_PARTITION:
        return mshv_ioctl_create_partition((void __user *)arg,
                                           misc->this_device);
    case MSHV_ROOT_HVCALL:
        return mshv_ioctl_passthru_hvcall(NULL, false,
                                          (void __user *)arg);
    }

    return -ENOTTY;
}

static int
mshv_dev_open(struct inode *inode, struct file *filp)
{
    return 0;
}

static int
mshv_dev_release(struct inode *inode, struct file *filp)
{
    return 0;
}

static int mshv_root_sched_online;

static const char *scheduler_type_to_string(enum hv_scheduler_type type)
{
    switch (type) {
    case HV_SCHEDULER_TYPE_LP:
        return "classic scheduler without SMT";
    case HV_SCHEDULER_TYPE_LP_SMT:
        return "classic scheduler with SMT";
    case HV_SCHEDULER_TYPE_CORE_SMT:
        return "core scheduler";
    case HV_SCHEDULER_TYPE_ROOT:
        return "root scheduler";
    default:
        return "unknown scheduler";
    }
}

static int __init l1vh_retrieve_scheduler_type(enum hv_scheduler_type *out)
{
    u64 integrated_sched_enabled;
    int ret;

    *out = HV_SCHEDULER_TYPE_CORE_SMT;

    if (!mshv_root.vmm_caps.vmm_enable_integrated_scheduler)
        return 0;

    ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
                                            HV_PARTITION_PROPERTY_INTEGRATED_SCHEDULER_ENABLED,
                                            0, &integrated_sched_enabled,
                                            sizeof(integrated_sched_enabled));
    if (ret)
        return ret;

    if (integrated_sched_enabled)
        *out = HV_SCHEDULER_TYPE_ROOT;

    return 0;
}

/* TODO move this to hv_common.c when needed outside */
static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out)
{
    struct hv_input_get_system_property *input;
    struct hv_output_get_system_property *output;
    unsigned long flags;
    u64 status;

    local_irq_save(flags);
    input = *this_cpu_ptr(hyperv_pcpu_input_arg);
    output = *this_cpu_ptr(hyperv_pcpu_output_arg);

    memset(input, 0, sizeof(*input));
    memset(output, 0, sizeof(*output));
    input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE;

    status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
    if (!hv_result_success(status)) {
        local_irq_restore(flags);
        pr_err("%s: %s\n", __func__, hv_result_to_string(status));
        return hv_result_to_errno(status);
    }

    *out = output->scheduler_type;
    local_irq_restore(flags);

    return 0;
}

/* Retrieve and stash the supported scheduler type */
static int __init mshv_retrieve_scheduler_type(struct device *dev)
{
    int ret;

    if (hv_l1vh_partition())
        ret = l1vh_retrieve_scheduler_type(&hv_scheduler_type);
    else
        ret = hv_retrieve_scheduler_type(&hv_scheduler_type);
    if (ret)
        return ret;

    dev_info(dev, "Hypervisor using %s\n",
             scheduler_type_to_string(hv_scheduler_type));

    switch (hv_scheduler_type) {
    case HV_SCHEDULER_TYPE_CORE_SMT:
    case HV_SCHEDULER_TYPE_LP_SMT:
    case HV_SCHEDULER_TYPE_ROOT:
    case HV_SCHEDULER_TYPE_LP:
        /* Supported scheduler, nothing to do */
        break;
    default:
        dev_err(dev, "unsupported scheduler 0x%x, bailing.\n",
                hv_scheduler_type);
        return -EOPNOTSUPP;
    }

    return 0;
}

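/*
 * Per-cpu hypercall input/output buffers used when the hypervisor runs the
 * root scheduler. Each online CPU gets two consecutive HV_HYP_PAGE_SIZE
 * buffers (input followed by output), allocated and freed through the CPU
 * hotplug callbacks below.
 */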
static int mshv_root_scheduler_init(unsigned int cpu)
{
    void **inputarg, **outputarg, *p;

    inputarg = (void **)this_cpu_ptr(root_scheduler_input);
    outputarg = (void **)this_cpu_ptr(root_scheduler_output);

    /* Allocate two consecutive pages. One for input, one for output. */
    p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL);
    if (!p)
        return -ENOMEM;

    *inputarg = p;
    *outputarg = (char *)p + HV_HYP_PAGE_SIZE;

    return 0;
}

static int mshv_root_scheduler_cleanup(unsigned int cpu)
{
    void *p, **inputarg, **outputarg;

    inputarg = (void **)this_cpu_ptr(root_scheduler_input);
    outputarg = (void **)this_cpu_ptr(root_scheduler_output);

    p = *inputarg;

    *inputarg = NULL;
    *outputarg = NULL;

    kfree(p);

    return 0;
}

/* Must be called after retrieving the scheduler type */
static int
root_scheduler_init(struct device *dev)
{
    int ret;

    if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
        return 0;

    root_scheduler_input = alloc_percpu(void *);
    root_scheduler_output = alloc_percpu(void *);

    if (!root_scheduler_input || !root_scheduler_output) {
        dev_err(dev, "Failed to allocate root scheduler buffers\n");
        ret = -ENOMEM;
        goto out;
    }

    ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched",
                            mshv_root_scheduler_init,
                            mshv_root_scheduler_cleanup);
    if (ret < 0) {
        dev_err(dev, "Failed to setup root scheduler state: %i\n", ret);
        goto out;
    }

    mshv_root_sched_online = ret;

    return 0;

out:
    free_percpu(root_scheduler_input);
    free_percpu(root_scheduler_output);
    return ret;
}

static void
root_scheduler_deinit(void)
{
    if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
        return;

    cpuhp_remove_state(mshv_root_sched_online);
    free_percpu(root_scheduler_input);
    free_percpu(root_scheduler_output);
}

static int __init mshv_init_vmm_caps(struct device *dev)
{
    int ret;

    ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
                                            HV_PARTITION_PROPERTY_VMM_CAPABILITIES,
                                            0, &mshv_root.vmm_caps,
                                            sizeof(mshv_root.vmm_caps));
    if (ret && hv_l1vh_partition()) {
        dev_err(dev, "Failed to get VMM capabilities: %d\n", ret);
        return ret;
    }

    dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]);

    return 0;
}

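/*
 * Module init: runs only in the root/parent partition and not in a kdump
 * kernel. Registers /dev/mshv first so subsequent setup has a device to log
 * against, then brings up the SynIC, queries VMM capabilities and the
 * scheduler type, sets up the root scheduler buffers, debugfs and the irqfd
 * workqueue, and finally installs mshv_isr as the hypervisor message
 * handler. Failures unwind in reverse order via the labels at the end.
 */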
static int __init mshv_parent_partition_init(void)
{
    int ret;
    struct device *dev;
    union hv_hypervisor_version_info version_info;

    if (!hv_parent_partition() || is_kdump_kernel())
        return -ENODEV;

    if (hv_get_hypervisor_version(&version_info))
        return -ENODEV;

    ret = misc_register(&mshv_dev);
    if (ret)
        return ret;

    dev = mshv_dev.this_device;

    if (version_info.build_number < MSHV_HV_MIN_VERSION ||
        version_info.build_number > MSHV_HV_MAX_VERSION) {
        dev_err(dev, "Running on unvalidated Hyper-V version\n");
        dev_err(dev, "Versions: current: %u min: %u max: %u\n",
                version_info.build_number, MSHV_HV_MIN_VERSION,
                MSHV_HV_MAX_VERSION);
    }

    ret = mshv_synic_init(dev);
    if (ret)
        goto device_deregister;

    ret = mshv_init_vmm_caps(dev);
    if (ret)
        goto synic_cleanup;

    ret = mshv_retrieve_scheduler_type(dev);
    if (ret)
        goto synic_cleanup;

    ret = root_scheduler_init(dev);
    if (ret)
        goto synic_cleanup;

    ret = mshv_debugfs_init();
    if (ret)
        goto deinit_root_scheduler;

    ret = mshv_irqfd_wq_init();
    if (ret)
        goto exit_debugfs;

    spin_lock_init(&mshv_root.pt_ht_lock);
    hash_init(mshv_root.pt_htable);

    hv_setup_mshv_handler(mshv_isr);

    return 0;

exit_debugfs:
    mshv_debugfs_exit();
deinit_root_scheduler:
    root_scheduler_deinit();
synic_cleanup:
    mshv_synic_exit();
device_deregister:
    misc_deregister(&mshv_dev);
    return ret;
}

static void __exit mshv_parent_partition_exit(void)
{
    hv_setup_mshv_handler(NULL);
    mshv_port_table_fini();
    mshv_debugfs_exit();
    misc_deregister(&mshv_dev);
    mshv_irqfd_wq_cleanup();
    root_scheduler_deinit();
    mshv_synic_exit();
}

module_init(mshv_parent_partition_init);
module_exit(mshv_parent_partition_exit);