// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024, Microsoft Corporation.
 *
 * The main part of the mshv_root module, providing APIs to create
 * and manage guest partitions.
 *
 * Authors: Microsoft Linux virtualization team
 */

#include <linux/entry-virt.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/cpuhotplug.h>
#include <linux/random.h>
#include <asm/mshyperv.h>
#include <linux/hyperv.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/kexec.h>
#include <linux/page-flags.h>
#include <linux/crash_dump.h>
#include <linux/panic_notifier.h>
#include <linux/vmalloc.h>
#include <linux/rseq.h>

#include "mshv_eventfd.h"
#include "mshv.h"
#include "mshv_root.h"

MODULE_AUTHOR("Microsoft");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");

/* TODO move this to another file when debugfs code is added */
enum hv_stats_vp_counters { /* HV_THREAD_COUNTER */
#if defined(CONFIG_X86)
	VpRootDispatchThreadBlocked = 202,
#elif defined(CONFIG_ARM64)
	VpRootDispatchThreadBlocked = 94,
#endif
	VpStatsMaxCounter
};

struct hv_stats_page {
	union {
		u64 vp_cntrs[VpStatsMaxCounter];	/* VP counters */
		u8 data[HV_HYP_PAGE_SIZE];
	};
} __packed;

struct mshv_root mshv_root;

enum hv_scheduler_type hv_scheduler_type;

/* Once we implement the fast extended hypercall ABI they can go away. */
static void * __percpu *root_scheduler_input;
static void * __percpu *root_scheduler_output;

static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_dev_open(struct inode *inode, struct file *filp);
static int mshv_dev_release(struct inode *inode, struct file *filp);
static int mshv_vp_release(struct inode *inode, struct file *filp);
static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_partition_release(struct inode *inode, struct file *filp);
static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma);
static vm_fault_t mshv_vp_fault(struct vm_fault *vmf);
static int mshv_init_async_handler(struct mshv_partition *partition);
static void mshv_async_hvcall_handler(void *data, u64 *status);

static const union hv_input_vtl input_vtl_zero;
static const union hv_input_vtl input_vtl_normal = {
	.target_vtl = HV_NORMAL_VTL,
	.use_target_vtl = 1,
};

static const struct vm_operations_struct mshv_vp_vm_ops = {
	.fault = mshv_vp_fault,
};

static const struct file_operations mshv_vp_fops = {
	.owner = THIS_MODULE,
	.release = mshv_vp_release,
	.unlocked_ioctl = mshv_vp_ioctl,
	.llseek = noop_llseek,
	.mmap = mshv_vp_mmap,
};

static const struct file_operations mshv_partition_fops = {
	.owner = THIS_MODULE,
	.release = mshv_partition_release,
	.unlocked_ioctl = mshv_partition_ioctl,
	.llseek = noop_llseek,
};

static const struct file_operations mshv_dev_fops = {
	.owner = THIS_MODULE,
	.open = mshv_dev_open,
	.release = mshv_dev_release,
	.unlocked_ioctl = mshv_dev_ioctl,
	.llseek = noop_llseek,
};

static struct miscdevice mshv_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "mshv",
	.fops = &mshv_dev_fops,
	.mode = 0600,
};

/*
 * Only allow hypercalls that have a u64 partition id as the first member of
 * the input structure.
 * These are sorted by value.
 */
static u16 mshv_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_GET_PARTITION_PROPERTY_EX,
	HVCALL_SET_PARTITION_PROPERTY,
	HVCALL_INSTALL_INTERCEPT,
	HVCALL_GET_VP_REGISTERS,
	HVCALL_SET_VP_REGISTERS,
	HVCALL_TRANSLATE_VIRTUAL_ADDRESS,
	HVCALL_CLEAR_VIRTUAL_INTERRUPT,
	HVCALL_REGISTER_INTERCEPT_RESULT,
	HVCALL_ASSERT_VIRTUAL_INTERRUPT,
	HVCALL_GET_GPA_PAGES_ACCESS_STATES,
	HVCALL_SIGNAL_EVENT_DIRECT,
	HVCALL_POST_MESSAGE_DIRECT,
	HVCALL_GET_VP_CPUID_VALUES,
};

/*
 * Only allow hypercalls that are safe to be called by the VMM with the host
 * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a
 * hypercall cannot be misused by the VMM before adding it to this list.
 */
static u16 mshv_self_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_GET_PARTITION_PROPERTY_EX,
};

static bool mshv_hvcall_is_async(u16 code)
{
	switch (code) {
	case HVCALL_SET_PARTITION_PROPERTY:
		return true;
	default:
		break;
	}
	return false;
}

static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id)
{
	int i;
	int n = ARRAY_SIZE(mshv_passthru_hvcalls);
	u16 *allowed_hvcalls = mshv_passthru_hvcalls;

	if (pt_id == HV_PARTITION_ID_SELF) {
		n = ARRAY_SIZE(mshv_self_passthru_hvcalls);
		allowed_hvcalls = mshv_self_passthru_hvcalls;
	}

	for (i = 0; i < n; ++i)
		if (allowed_hvcalls[i] == code)
			return true;

	return false;
}

static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
				      bool partition_locked,
				      void __user *user_args)
{
	u64 status;
	int ret = 0;
	bool is_async;
	struct mshv_root_hvcall args;
	struct page *page;
	unsigned int pages_order;
	void *input_pg = NULL;
	void *output_pg = NULL;
	u16 reps_completed;
	u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) ||
	    mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE)
		return -EINVAL;

	if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
		return -EINVAL;

	if (!mshv_passthru_hvcall_allowed(args.code, pt_id))
		return -EINVAL;

	is_async = mshv_hvcall_is_async(args.code);
	if (is_async) {
		/* async hypercalls can only be called from partition fd */
		if (!partition || !partition_locked)
			return -EINVAL;
		ret = mshv_init_async_handler(partition);
		if (ret)
			return ret;
	}

	pages_order = args.out_ptr ? 1 : 0;
	page = alloc_pages(GFP_KERNEL, pages_order);
	if (!page)
		return -ENOMEM;
	input_pg = page_address(page);

	if (args.out_ptr)
		output_pg = (char *)input_pg + PAGE_SIZE;
	else
		output_pg = NULL;

	if (copy_from_user(input_pg, (void __user *)args.in_ptr,
			   args.in_sz)) {
		ret = -EFAULT;
		goto free_pages_out;
	}

	/*
	 * NOTE: This only works because all the allowed hypercalls' input
	 * structs begin with a u64 partition_id field.
	 */
	*(u64 *)input_pg = pt_id;

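	/*
	 * Issue the hypercall, retrying for as long as the hypervisor asks
	 * for more memory: HV_STATUS_INSUFFICIENT_MEMORY is handled by
	 * depositing pages to the target partition and trying again.  Rep
	 * hypercalls resume from the count already completed.
	 */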
	reps_completed = 0;
	do {
		if (args.reps) {
			status = hv_do_rep_hypercall_ex(args.code, args.reps,
							0, reps_completed,
							input_pg, output_pg);
			reps_completed = hv_repcomp(status);
		} else {
			status = hv_do_hypercall(args.code, input_pg, output_pg);
		}

		if (hv_result(status) == HV_STATUS_CALL_PENDING) {
			if (is_async) {
				mshv_async_hvcall_handler(partition, &status);
			} else { /* Paranoia check. This shouldn't happen! */
				ret = -EBADFD;
				goto free_pages_out;
			}
		}

		if (hv_result_success(status))
			break;

		if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY)
			ret = hv_result_to_errno(status);
		else
			ret = hv_call_deposit_pages(NUMA_NO_NODE,
						    pt_id, 1);
	} while (!ret);

	args.status = hv_result(status);
	args.reps = reps_completed;
	if (copy_to_user(user_args, &args, sizeof(args)))
		ret = -EFAULT;

	if (!ret && output_pg &&
	    copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
		ret = -EFAULT;

free_pages_out:
	free_pages((unsigned long)input_pg, pages_order);

	return ret;
}

static inline bool is_ghcb_mapping_available(void)
{
#if IS_ENABLED(CONFIG_X86_64)
	return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE;
#else
	return 0;
#endif
}

static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_get_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}

static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_set_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}

/*
 * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by
 * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend,
 * done by the hypervisor.
 * "Intercept" suspend leads to asynchronous message delivery to dom0 which
 * should be awaited to keep the VP loop consistent (i.e. no message pending
 * upon VP resume).
 * VP intercept suspend can't be done when the VP is explicitly suspended
 * already, and thus there are only two possible race scenarios:
 *   1. implicit suspend bit set -> explicit suspend bit set -> message sent
 *   2. implicit suspend bit set -> message sent -> explicit suspend bit set
 * Checking for the implicit suspend bit after the explicit suspend request
 * has succeeded covers both cases and lets us reliably identify whether
 * there is a message to receive and deliver to the VMM.
 */
static int
mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
{
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND
	};
	struct hv_register_assoc intercept_suspend = {
		.name = HV_REGISTER_INTERCEPT_SUSPEND
	};
	union hv_explicit_suspend_register *es =
		&explicit_suspend.value.explicit_suspend;
	union hv_intercept_suspend_register *is =
		&intercept_suspend.value.intercept_suspend;
	int ret;

	es->suspended = 1;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &explicit_suspend);
	if (ret) {
		vp_err(vp, "Failed to explicitly suspend vCPU\n");
		return ret;
	}

	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &intercept_suspend);
	if (ret) {
		vp_err(vp, "Failed to get intercept suspend state\n");
		return ret;
	}

	*message_in_flight = is->suspended;

	return 0;
}

/*
 * This function is used when VPs are scheduled by the hypervisor's
 * scheduler.
 *
 * Caller has to make sure the registers contain cleared
 * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers
 * exactly in this order (the hypervisor clears them sequentially) to avoid
 * potentially invalid clearing of a newly arrived HV_REGISTER_INTERCEPT_SUSPEND
 * after the VP is released from HV_REGISTER_EXPLICIT_SUSPEND in case of the
 * opposite order.
 */
static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
{
	long ret;
	struct hv_register_assoc suspend_regs[2] = {
		{ .name = HV_REGISTER_INTERCEPT_SUSPEND },
		{ .name = HV_REGISTER_EXPLICIT_SUSPEND }
	};
	size_t count = ARRAY_SIZE(suspend_regs);

	/* Resume VP execution */
	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    count, suspend_regs);
	if (ret) {
		vp_err(vp, "Failed to resume vp execution. %lx\n", ret);
		return ret;
	}

	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       vp->run.kicked_by_hv == 1);
	if (ret) {
		bool message_in_flight;

		/*
		 * The waiting was interrupted by a signal: suspend
		 * the vCPU explicitly and copy message in flight (if any).
		 */
		ret = mshv_suspend_vp(vp, &message_in_flight);
		if (ret)
			return ret;

		/* Return if no message in flight */
		if (!message_in_flight)
			return -EINTR;

		/* Wait for the message in flight. */
		wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1);
	}

	/*
	 * Reset the flag to make the wait_event call above work
	 * next time.
	 */
	vp->run.kicked_by_hv = 0;

	return 0;
}

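/*
 * Dispatch the VP on the current CPU via the HVCALL_DISPATCH_VP hypercall.
 * The hypercall input/output live in the preallocated per-cpu pages
 * (root_scheduler_input/output), so preemption must stay disabled while
 * they are in use.  Only used with the root scheduler.
 */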
static int
mshv_vp_dispatch(struct mshv_vp *vp, u32 flags,
		 struct hv_output_dispatch_vp *res)
{
	struct hv_input_dispatch_vp *input;
	struct hv_output_dispatch_vp *output;
	u64 status;

	preempt_disable();
	input = *this_cpu_ptr(root_scheduler_input);
	output = *this_cpu_ptr(root_scheduler_output);

	memset(input, 0, sizeof(*input));
	memset(output, 0, sizeof(*output));

	input->partition_id = vp->vp_partition->pt_id;
	input->vp_index = vp->vp_index;
	input->time_slice = 0; /* Run forever until something happens */
	input->spec_ctrl = 0; /* TODO: set sensible flags */
	input->flags = flags;

	vp->run.flags.root_sched_dispatched = 1;
	status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output);
	vp->run.flags.root_sched_dispatched = 0;

	*res = *output;
	preempt_enable();

	if (!hv_result_success(status))
		vp_err(vp, "%s: status %s\n", __func__,
		       hv_result_to_string(status));

	return hv_result_to_errno(status);
}

static int
mshv_vp_clear_explicit_suspend(struct mshv_vp *vp)
{
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND,
		.value.explicit_suspend.suspended = 0,
	};
	int ret;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &explicit_suspend);

	if (ret)
		vp_err(vp, "Failed to unsuspend\n");

	return ret;
}

#if IS_ENABLED(CONFIG_X86_64)
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	if (!vp->vp_register_page)
		return 0;
	return vp->vp_register_page->interrupt_vectors.as_uint64;
}
#else
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	return 0;
}
#endif

static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp)
{
	struct hv_stats_page **stats = vp->vp_stats_pages;
	u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->vp_cntrs;
	u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->vp_cntrs;

	if (self_vp_cntrs[VpRootDispatchThreadBlocked])
		return self_vp_cntrs[VpRootDispatchThreadBlocked];
	return parent_vp_cntrs[VpRootDispatchThreadBlocked];
}

static int
mshv_vp_wait_for_hv_kick(struct mshv_vp *vp)
{
	int ret;

	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       (vp->run.kicked_by_hv == 1 &&
					!mshv_vp_dispatch_thread_blocked(vp)) ||
				       mshv_vp_interrupt_pending(vp));
	if (ret)
		return -EINTR;

	vp->run.flags.root_sched_blocked = 0;
	vp->run.kicked_by_hv = 0;

	return 0;
}

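/*
 * Run loop for the root scheduler case: keep dispatching the VP until it is
 * suspended with an intercept that must be delivered to the VMM.  A blocked
 * dispatch state is handled by waiting for a kick from the hypervisor
 * (mshv_vp_wait_for_hv_kick) before dispatching again.
 */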
/* Must be called with interrupts enabled */
static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
{
	long ret;

	if (vp->run.flags.root_sched_blocked) {
		/*
		 * Dispatch state of this VP is blocked. Need to wait
		 * for the hypervisor to clear the blocked state before
		 * dispatching it.
		 */
		ret = mshv_vp_wait_for_hv_kick(vp);
		if (ret)
			return ret;
	}

	do {
		u32 flags = 0;
		struct hv_output_dispatch_vp output;

		if (__xfer_to_guest_mode_work_pending()) {
			ret = xfer_to_guest_mode_handle_work();
			if (ret)
				break;
		}

		if (vp->run.flags.intercept_suspend)
			flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND;

		if (mshv_vp_interrupt_pending(vp))
			flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION;

		ret = mshv_vp_dispatch(vp, flags, &output);
		if (ret)
			break;

		vp->run.flags.intercept_suspend = 0;

		if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) {
			if (output.dispatch_event ==
			    HV_VP_DISPATCH_EVENT_SUSPEND) {
				/*
				 * TODO: remove the warning once VP canceling
				 * is supported
				 */
				WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count),
					  "%s: vp#%d: unexpected explicit suspend\n",
					  __func__, vp->vp_index);
				/*
				 * Need to clear explicit suspend before
				 * dispatching.
				 * Explicit suspend is either:
				 * - set right after the first VP dispatch or
				 * - set explicitly via hypercall
				 * Since the latter case is not yet supported,
				 * simply clear it here.
				 */
				ret = mshv_vp_clear_explicit_suspend(vp);
				if (ret)
					break;

				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			} else {
				vp->run.flags.root_sched_blocked = 1;
				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			}
		} else {
			/* HV_VP_DISPATCH_STATE_READY */
			if (output.dispatch_event ==
			    HV_VP_DISPATCH_EVENT_INTERCEPT)
				vp->run.flags.intercept_suspend = 1;
		}
	} while (!vp->run.flags.intercept_suspend);

	rseq_virt_userspace_exit();

	return ret;
}

static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
	      "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");

static struct mshv_mem_region *
mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
{
	struct mshv_mem_region *region;

	hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
		if (gfn >= region->start_gfn &&
		    gfn < region->start_gfn + region->nr_pages)
			return region;
	}

	return NULL;
}

static struct mshv_mem_region *
mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
{
	struct mshv_mem_region *region;

	spin_lock(&p->pt_mem_regions_lock);
	region = mshv_partition_region_by_gfn(p, gfn);
	if (!region || !mshv_region_get(region)) {
		spin_unlock(&p->pt_mem_regions_lock);
		return NULL;
	}
	spin_unlock(&p->pt_mem_regions_lock);

	return region;
}

/**
 * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts.
 * @vp: Pointer to the virtual processor structure.
 *
 * This function processes GPA intercepts by identifying the memory region
 * corresponding to the intercepted GPA, aligning the page offset, and
 * mapping the required pages. It ensures that the region is valid and
 * handles faults efficiently by mapping multiple pages at once.
 *
 * Return: true if the intercept was handled successfully, false otherwise.
 */
static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
{
	struct mshv_partition *p = vp->vp_partition;
	struct mshv_mem_region *region;
	bool ret;
	u64 gfn;
#if defined(CONFIG_X86_64)
	struct hv_x64_memory_intercept_message *msg =
		(struct hv_x64_memory_intercept_message *)
			vp->vp_intercept_msg_page->u.payload;
#elif defined(CONFIG_ARM64)
	struct hv_arm64_memory_intercept_message *msg =
		(struct hv_arm64_memory_intercept_message *)
			vp->vp_intercept_msg_page->u.payload;
#endif

	gfn = HVPFN_DOWN(msg->guest_physical_address);

	region = mshv_partition_region_by_gfn_get(p, gfn);
	if (!region)
		return false;

	/* Only movable memory ranges are supported for GPA intercepts */
	if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE)
		ret = mshv_region_handle_gfn_fault(region, gfn);
	else
		ret = false;

	mshv_region_put(region);

	return ret;
}

static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
{
	switch (vp->vp_intercept_msg_page->header.message_type) {
	case HVMSG_GPA_INTERCEPT:
		return mshv_handle_gpa_intercept(vp);
	}
	return false;
}

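/*
 * Back end of the MSHV_RUN_VP ioctl: run the VP with whichever scheduler is
 * in use, letting the kernel resolve intercepts it can handle itself (e.g.
 * GPA faults on movable regions) and only returning to the VMM, with the
 * intercept message copied out, once user space has to get involved.
 */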
static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
{
	long rc;

	do {
		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
			rc = mshv_run_vp_with_root_scheduler(vp);
		else
			rc = mshv_run_vp_with_hyp_scheduler(vp);
	} while (rc == 0 && mshv_vp_handle_intercept(vp));

	if (rc)
		return rc;

	if (copy_to_user(ret_msg, vp->vp_intercept_msg_page,
			 sizeof(struct hv_message)))
		rc = -EFAULT;

	return rc;
}

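/*
 * Get/set VP state for state types that are transferred via guest PFN lists:
 * the user buffer is pinned page by page and the pinned pages are handed to
 * the hypervisor, which copies the state directly to/from them.
 */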
static int
mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp,
				struct hv_vp_state_data state_data,
				unsigned long user_pfn, size_t page_count,
				bool is_set)
{
	int completed, ret = 0;
	unsigned long check;
	struct page **pages;

	if (page_count > INT_MAX)
		return -EINVAL;
	/*
	 * Check the arithmetic for wraparound/overflow.
	 * The last page address in the buffer is:
	 * (user_pfn + (page_count - 1)) * PAGE_SIZE
	 */
	if (check_add_overflow(user_pfn, (page_count - 1), &check))
		return -EOVERFLOW;
	if (check_mul_overflow(check, PAGE_SIZE, &check))
		return -EOVERFLOW;

	/* Pin user pages so hypervisor can copy directly to them */
	pages = kcalloc(page_count, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	for (completed = 0; completed < page_count; completed += ret) {
		unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE;
		int remaining = page_count - completed;

		ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE,
					  &pages[completed]);
		if (ret < 0) {
			vp_err(vp, "%s: Failed to pin user pages error %i\n",
			       __func__, ret);
			goto unpin_pages;
		}
	}

	if (is_set)
		ret = hv_call_set_vp_state(vp->vp_index,
					   vp->vp_partition->pt_id,
					   state_data, page_count, pages,
					   0, NULL);
	else
		ret = hv_call_get_vp_state(vp->vp_index,
					   vp->vp_partition->pt_id,
					   state_data, page_count, pages,
					   NULL);

unpin_pages:
	unpin_user_pages(pages, completed);
	kfree(pages);
	return ret;
}

static long
mshv_vp_ioctl_get_set_state(struct mshv_vp *vp,
			    struct mshv_get_set_vp_state __user *user_args,
			    bool is_set)
{
	struct mshv_get_set_vp_state args;
	long ret = 0;
	union hv_output_get_vp_state vp_state;
	u32 data_sz;
	struct hv_vp_state_data state_data = {};

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) ||
	    !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) ||
	    !PAGE_ALIGNED(args.buf_ptr))
		return -EINVAL;

	if (!access_ok((void __user *)args.buf_ptr, args.buf_sz))
		return -EFAULT;

	switch (args.type) {
	case MSHV_VP_STATE_LAPIC:
		state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_XSAVE:
	{
		u64 data_sz_64;

		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_XSAVE_STATES,
						     &state_data.xsave.states.as_uint64);
		if (ret)
			return ret;

		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE,
						     &data_sz_64);
		if (ret)
			return ret;

		data_sz = (u32)data_sz_64;
		state_data.xsave.flags = 0;
		/* Always request legacy states */
		state_data.xsave.states.legacy_x87 = 1;
		state_data.xsave.states.legacy_sse = 1;
		state_data.type = HV_GET_SET_VP_STATE_XSAVE;
		break;
	}
	case MSHV_VP_STATE_SIMP:
		state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SIEFP:
		state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SYNTHETIC_TIMERS:
		state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS;
		data_sz = sizeof(vp_state.synthetic_timers_state);
		break;
	default:
		return -EINVAL;
	}

	if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz)))
		return -EFAULT;

	if (data_sz > args.buf_sz)
		return -EINVAL;

	/* If the data is transmitted via pfns, delegate to helper */
	if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) {
		unsigned long user_pfn = PFN_DOWN(args.buf_ptr);
		size_t page_count = PFN_DOWN(args.buf_sz);

		return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn,
						       page_count, is_set);
	}

	/* Paranoia check - this shouldn't happen! */
	if (data_sz > sizeof(vp_state)) {
		vp_err(vp, "Invalid vp state data size!\n");
		return -EINVAL;
	}

	if (is_set) {
		if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz))
			return -EFAULT;

		return hv_call_set_vp_state(vp->vp_index,
					    vp->vp_partition->pt_id,
					    state_data, 0, NULL,
					    sizeof(vp_state), (u8 *)&vp_state);
	}

	ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id,
				   state_data, 0, NULL, &vp_state);
	if (ret)
		return ret;

	if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz))
		return -EFAULT;

	return 0;
}

static long
mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct mshv_vp *vp = filp->private_data;
	long r = -ENOTTY;

	if (mutex_lock_killable(&vp->vp_mutex))
		return -EINTR;

	switch (ioctl) {
	case MSHV_RUN_VP:
		r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg);
		break;
	case MSHV_GET_VP_STATE:
		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false);
		break;
	case MSHV_SET_VP_STATE:
		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true);
		break;
	case MSHV_ROOT_HVCALL:
		r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false,
					       (void __user *)arg);
		break;
	default:
		vp_warn(vp, "Invalid ioctl: %#x\n", ioctl);
		break;
	}
	mutex_unlock(&vp->vp_mutex);

	return r;
}

static vm_fault_t mshv_vp_fault(struct vm_fault *vmf)
{
	struct mshv_vp *vp = vmf->vma->vm_file->private_data;

	switch (vmf->vma->vm_pgoff) {
	case MSHV_VP_MMAP_OFFSET_REGISTERS:
		vmf->page = virt_to_page(vp->vp_register_page);
		break;
	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
		vmf->page = virt_to_page(vp->vp_intercept_msg_page);
		break;
	case MSHV_VP_MMAP_OFFSET_GHCB:
		vmf->page = virt_to_page(vp->vp_ghcb_page);
		break;
	default:
		return VM_FAULT_SIGBUS;
	}

	get_page(vmf->page);

	return 0;
}

static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct mshv_vp *vp = file->private_data;

	switch (vma->vm_pgoff) {
	case MSHV_VP_MMAP_OFFSET_REGISTERS:
		if (!vp->vp_register_page)
			return -ENODEV;
		break;
	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
		if (!vp->vp_intercept_msg_page)
			return -ENODEV;
		break;
	case MSHV_VP_MMAP_OFFSET_GHCB:
		if (!vp->vp_ghcb_page)
			return -ENODEV;
		break;
	default:
		return -EINVAL;
	}

	vma->vm_ops = &mshv_vp_vm_ops;
	return 0;
}

static int
mshv_vp_release(struct inode *inode, struct file *filp)
{
	struct mshv_vp *vp = filp->private_data;

	/* Rest of VP cleanup happens in destroy_partition() */
	mshv_partition_put(vp->vp_partition);
	return 0;
}

static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
				void *stats_pages[])
{
	union hv_stats_object_identity identity = {
		.vp.partition_id = partition_id,
		.vp.vp_index = vp_index,
	};

	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);

	identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
	hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
}

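/*
 * Map both the SELF and the PARENT stats areas for the VP.  The mapped pages
 * are what mshv_vp_dispatch_thread_blocked() reads to decide whether the
 * dispatch thread is currently blocked; only used with the root scheduler.
 */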
static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
			     void *stats_pages[])
{
	union hv_stats_object_identity identity = {
		.vp.partition_id = partition_id,
		.vp.vp_index = vp_index,
	};
	int err;

	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
				&stats_pages[HV_STATS_AREA_SELF]);
	if (err)
		return err;

	identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
	err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
				&stats_pages[HV_STATS_AREA_PARENT]);
	if (err)
		goto unmap_self;

	return 0;

unmap_self:
	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
	return err;
}

static long
mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
			       void __user *arg)
{
	struct mshv_create_vp args;
	struct mshv_vp *vp;
	struct page *intercept_msg_page, *register_page, *ghcb_page;
	void *stats_pages[2];
	long ret;

	if (copy_from_user(&args, arg, sizeof(args)))
		return -EFAULT;

	if (args.vp_index >= MSHV_MAX_VPS)
		return -EINVAL;

	if (partition->pt_vp_array[args.vp_index])
		return -EEXIST;

	ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index,
				0 /* Only valid for root partition VPs */);
	if (ret)
		return ret;

	ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
				   HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
				   input_vtl_zero, &intercept_msg_page);
	if (ret)
		goto destroy_vp;

	if (!mshv_partition_encrypted(partition)) {
		ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
					   HV_VP_STATE_PAGE_REGISTERS,
					   input_vtl_zero, &register_page);
		if (ret)
			goto unmap_intercept_message_page;
	}

	if (mshv_partition_encrypted(partition) &&
	    is_ghcb_mapping_available()) {
		ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
					   HV_VP_STATE_PAGE_GHCB,
					   input_vtl_normal, &ghcb_page);
		if (ret)
			goto unmap_register_page;
	}

	/*
	 * This mapping of the stats page is for detecting if dispatch thread
	 * is blocked - only relevant for root scheduler
	 */
	if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) {
		ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
					stats_pages);
		if (ret)
			goto unmap_ghcb_page;
	}

	vp = kzalloc(sizeof(*vp), GFP_KERNEL);
	if (!vp) {
		ret = -ENOMEM;
		goto unmap_stats_pages;
	}

	vp->vp_partition = mshv_partition_get(partition);
	if (!vp->vp_partition) {
		ret = -EBADF;
		goto free_vp;
	}

	mutex_init(&vp->vp_mutex);
	init_waitqueue_head(&vp->run.vp_suspend_queue);
	atomic64_set(&vp->run.vp_signaled_count, 0);

	vp->vp_index = args.vp_index;
	vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page);
	if (!mshv_partition_encrypted(partition))
		vp->vp_register_page = page_to_virt(register_page);

	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
		vp->vp_ghcb_page = page_to_virt(ghcb_page);

	if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
		memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));

	/*
	 * Keep anon_inode_getfd last: it installs fd in the file struct and
	 * thus makes the state accessible in user space.
	 */
	ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp,
			       O_RDWR | O_CLOEXEC);
	if (ret < 0)
		goto put_partition;

	/* already exclusive with the partition mutex for all ioctls */
	partition->pt_vp_count++;
	partition->pt_vp_array[args.vp_index] = vp;

	return ret;

put_partition:
	mshv_partition_put(partition);
free_vp:
	kfree(vp);
unmap_stats_pages:
	if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
		mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages);
unmap_ghcb_page:
	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
		hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
				       HV_VP_STATE_PAGE_GHCB, ghcb_page,
				       input_vtl_normal);
unmap_register_page:
	if (!mshv_partition_encrypted(partition))
		hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
				       HV_VP_STATE_PAGE_REGISTERS,
				       register_page, input_vtl_zero);
unmap_intercept_message_page:
	hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
			       HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
			       intercept_msg_page, input_vtl_zero);
destroy_vp:
	hv_call_delete_vp(partition->pt_id, args.vp_index);
	return ret;
}

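/*
 * Async hypercall support: a passthru hypercall that returns
 * HV_STATUS_CALL_PENDING finishes asynchronously.  mshv_async_hvcall_handler()
 * waits on partition->async_hypercall, which is completed elsewhere once the
 * hypervisor delivers the completion, with the final status left in
 * partition->async_hypercall_status.  Only one async hypercall may be
 * outstanding per partition at a time.
 */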
static int mshv_init_async_handler(struct mshv_partition *partition)
{
	if (completion_done(&partition->async_hypercall)) {
		pt_err(partition,
		       "Cannot issue async hypercall while another one is in progress!\n");
		return -EPERM;
	}

	reinit_completion(&partition->async_hypercall);
	return 0;
}

static void mshv_async_hvcall_handler(void *data, u64 *status)
{
	struct mshv_partition *partition = data;

	wait_for_completion(&partition->async_hypercall);
	pt_dbg(partition, "Async hypercall completed!\n");

	*status = partition->async_hypercall_status;
}

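/*
 * Guest memory is tracked as mshv_mem_region objects of three types:
 * MMIO regions (backed by a VM_IO/VM_PFNMAP vma, e.g. vfio device BARs),
 * pinned RAM regions (pinned and mapped up front, always used for
 * encrypted/SNP partitions), and movable RAM regions (populated on demand
 * from GPA intercepts, see mshv_handle_gpa_intercept()).
 */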
/*
 * NB: caller checks and makes sure mem->size is page aligned
 * Returns: 0 with regionpp updated on success, or -errno
 */
static int mshv_partition_create_region(struct mshv_partition *partition,
					struct mshv_user_mem_region *mem,
					struct mshv_mem_region **regionpp,
					bool is_mmio)
{
	struct mshv_mem_region *rg;
	u64 nr_pages = HVPFN_DOWN(mem->size);

	/* Reject overlapping regions */
	spin_lock(&partition->pt_mem_regions_lock);
	hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) {
		if (mem->guest_pfn + nr_pages <= rg->start_gfn ||
		    rg->start_gfn + rg->nr_pages <= mem->guest_pfn)
			continue;
		spin_unlock(&partition->pt_mem_regions_lock);
		return -EEXIST;
	}
	spin_unlock(&partition->pt_mem_regions_lock);

	rg = mshv_region_create(mem->guest_pfn, nr_pages,
				mem->userspace_addr, mem->flags);
	if (IS_ERR(rg))
		return PTR_ERR(rg);

	if (is_mmio)
		rg->type = MSHV_REGION_TYPE_MMIO;
	else if (mshv_partition_encrypted(partition) ||
		 !mshv_region_movable_init(rg))
		rg->type = MSHV_REGION_TYPE_MEM_PINNED;
	else
		rg->type = MSHV_REGION_TYPE_MEM_MOVABLE;

	rg->partition = partition;

	*regionpp = rg;

	return 0;
}

/**
 * mshv_prepare_pinned_region - Pin and map memory regions
 * @region: Pointer to the memory region structure
 *
 * This function processes memory regions that are explicitly marked as pinned.
 * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based
 * population. The function ensures the region is properly populated, handles
 * encryption requirements for SNP partitions if applicable, maps the region,
 * and performs necessary sharing or eviction operations based on the mapping
 * result.
 *
 * Return: 0 on success, negative error code on failure.
 */
static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
{
	struct mshv_partition *partition = region->partition;
	int ret;

	ret = mshv_region_pin(region);
	if (ret) {
		pt_err(partition, "Failed to pin memory region: %d\n",
		       ret);
		goto err_out;
	}

	/*
	 * For an SNP partition it is a requirement that for every memory region
	 * that we are going to map for this partition we should make sure that
	 * host access to that region is released. This is ensured by doing an
	 * additional hypercall which will update the SLAT to release host
	 * access to guest memory regions.
	 */
	if (mshv_partition_encrypted(partition)) {
		ret = mshv_region_unshare(region);
		if (ret) {
			pt_err(partition,
			       "Failed to unshare memory region (guest_pfn: %llu): %d\n",
			       region->start_gfn, ret);
			goto invalidate_region;
		}
	}

	ret = mshv_region_map(region);
	if (ret && mshv_partition_encrypted(partition)) {
		int shrc;

		shrc = mshv_region_share(region);
		if (!shrc)
			goto invalidate_region;

		pt_err(partition,
		       "Failed to share memory region (guest_pfn: %llu): %d\n",
		       region->start_gfn, shrc);
		/*
		 * Don't unpin if marking shared failed because pages are no
		 * longer mapped in the host, ie root, anymore.
		 */
		goto err_out;
	}

	return 0;

invalidate_region:
	mshv_region_invalidate(region);
err_out:
	return ret;
}

/*
 * This maps two things: guest RAM and the mmio space for pci passthru.
 *
 * mmio:
 *  - vfio overloads vm_pgoff to store the mmio start pfn/spa.
 *  - Two things need to happen for mapping mmio range:
 *    1. mapped in the uaddr so VMM can access it.
 *    2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it.
 *
 * This function takes care of the second. The first one is managed by vfio,
 * and hence is taken care of via vfio_pci_mmap_fault().
 */
static long
mshv_map_user_memory(struct mshv_partition *partition,
		     struct mshv_user_mem_region mem)
{
	struct mshv_mem_region *region;
	struct vm_area_struct *vma;
	bool is_mmio;
	ulong mmio_pfn;
	long ret;

	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
	    !access_ok((const void __user *)mem.userspace_addr, mem.size))
		return -EINVAL;

	mmap_read_lock(current->mm);
	vma = vma_lookup(current->mm, mem.userspace_addr);
	is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
	mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
	mmap_read_unlock(current->mm);

	if (!vma)
		return -EINVAL;

	ret = mshv_partition_create_region(partition, &mem, &region,
					   is_mmio);
	if (ret)
		return ret;

	switch (region->type) {
	case MSHV_REGION_TYPE_MEM_PINNED:
		ret = mshv_prepare_pinned_region(region);
		break;
	case MSHV_REGION_TYPE_MEM_MOVABLE:
		/*
		 * For movable memory regions, remap with no access to let
		 * the hypervisor track dirty pages, enabling pre-copy live
		 * migration.
		 */
		ret = hv_call_map_gpa_pages(partition->pt_id,
					    region->start_gfn,
					    region->nr_pages,
					    HV_MAP_GPA_NO_ACCESS, NULL);
		break;
	case MSHV_REGION_TYPE_MMIO:
		ret = hv_call_map_mmio_pages(partition->pt_id,
					     region->start_gfn,
					     mmio_pfn,
					     region->nr_pages);
		break;
	}

	if (ret)
		goto errout;

	spin_lock(&partition->pt_mem_regions_lock);
	hlist_add_head(&region->hnode, &partition->pt_mem_regions);
	spin_unlock(&partition->pt_mem_regions_lock);

	return 0;

errout:
	vfree(region);
	return ret;
}

/* Called for unmapping both the guest ram and the mmio space */
static long
mshv_unmap_user_memory(struct mshv_partition *partition,
		       struct mshv_user_mem_region mem)
{
	struct mshv_mem_region *region;

	if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
		return -EINVAL;

	spin_lock(&partition->pt_mem_regions_lock);

	region = mshv_partition_region_by_gfn(partition, mem.guest_pfn);
	if (!region) {
		spin_unlock(&partition->pt_mem_regions_lock);
		return -ENOENT;
	}

	/* Paranoia check */
	if (region->start_uaddr != mem.userspace_addr ||
	    region->start_gfn != mem.guest_pfn ||
	    region->nr_pages != HVPFN_DOWN(mem.size)) {
		spin_unlock(&partition->pt_mem_regions_lock);
		return -EINVAL;
	}

	hlist_del(&region->hnode);

	spin_unlock(&partition->pt_mem_regions_lock);

	mshv_region_put(region);

	return 0;
}

static long
mshv_partition_ioctl_set_memory(struct mshv_partition *partition,
				struct mshv_user_mem_region __user *user_mem)
{
	struct mshv_user_mem_region mem;

	if (copy_from_user(&mem, user_mem, sizeof(mem)))
		return -EFAULT;

	if (!mem.size ||
	    !PAGE_ALIGNED(mem.size) ||
	    !PAGE_ALIGNED(mem.userspace_addr) ||
	    (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) ||
	    mshv_field_nonzero(mem, rsvd))
		return -EINVAL;

	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))
		return mshv_unmap_user_memory(partition, mem);

	return mshv_map_user_memory(partition, mem);
}

static long
mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition,
			       void __user *user_args)
{
	struct mshv_user_ioeventfd args;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	return mshv_set_unset_ioeventfd(partition, &args);
}

static long
mshv_partition_ioctl_irqfd(struct mshv_partition *partition,
			   void __user *user_args)
{
	struct mshv_user_irqfd args;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	return mshv_set_unset_irqfd(partition, &args);
}

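/*
 * MSHV_GET_GPAP_ACCESS_BITMAP: query (and optionally clear/set) the accessed
 * or dirty state of a range of guest physical pages.  The hypervisor returns
 * one hv_gpa_page_access_state byte per page; the ioctl compresses that in
 * place into a bitmap of one bit per page for the VMM.
 */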
static long
mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition,
					     void __user *user_args)
{
	struct mshv_gpap_access_bitmap args;
	union hv_gpa_page_access_state *states;
	long ret, i;
	union hv_gpa_page_access_state_flags hv_flags = {};
	u8 hv_type_mask;
	ulong bitmap_buf_sz, states_buf_sz;
	int written = 0;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT ||
	    args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT ||
	    mshv_field_nonzero(args, rsvd) || !args.page_count ||
	    !args.bitmap_ptr)
		return -EINVAL;

	if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz))
		return -E2BIG;

	/* Num bytes needed to store bitmap; one bit per page rounded up */
	bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8);

	/* Sanity check */
	if (bitmap_buf_sz > states_buf_sz)
		return -EBADFD;

	switch (args.access_type) {
	case MSHV_GPAP_ACCESS_TYPE_ACCESSED:
		hv_type_mask = 1;
		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
			hv_flags.clear_accessed = 1;
			/* not accessed implies not dirty */
			hv_flags.clear_dirty = 1;
		} else { /* MSHV_GPAP_ACCESS_OP_SET */
			hv_flags.set_accessed = 1;
		}
		break;
	case MSHV_GPAP_ACCESS_TYPE_DIRTY:
		hv_type_mask = 2;
		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
			hv_flags.clear_dirty = 1;
		} else { /* MSHV_GPAP_ACCESS_OP_SET */
			hv_flags.set_dirty = 1;
			/* dirty implies accessed */
			hv_flags.set_accessed = 1;
		}
		break;
	}

	states = vzalloc(states_buf_sz);
	if (!states)
		return -ENOMEM;

	ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count,
					    args.gpap_base, hv_flags, &written,
					    states);
	if (ret)
		goto free_return;

	/*
	 * Overwrite states buffer with bitmap - the bits in hv_type_mask
	 * correspond to bitfields in hv_gpa_page_access_state
	 */
	for (i = 0; i < written; ++i)
		__assign_bit(i, (ulong *)states,
			     states[i].as_uint8 & hv_type_mask);

	/* zero the unused bits in the last byte(s) of the returned bitmap */
	for (i = written; i < bitmap_buf_sz * 8; ++i)
		__clear_bit(i, (ulong *)states);

	if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz))
		ret = -EFAULT;

free_return:
	vfree(states);
	return ret;
}

static long
mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition,
				     void __user *user_args)
{
	struct mshv_user_irq_entry *entries = NULL;
	struct mshv_user_irq_table args;
	long ret;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.nr > MSHV_MAX_GUEST_IRQS ||
	    mshv_field_nonzero(args, rsvd))
		return -EINVAL;

	if (args.nr) {
		struct mshv_user_irq_table __user *urouting = user_args;

		entries = vmemdup_user(urouting->entries,
				       array_size(sizeof(*entries),
						  args.nr));
		if (IS_ERR(entries))
			return PTR_ERR(entries);
	}
	ret = mshv_update_routing_table(partition, entries, args.nr);
	kvfree(entries);

	return ret;
}

static long
mshv_partition_ioctl_initialize(struct mshv_partition *partition)
{
	long ret;

	if (partition->pt_initialized)
		return 0;

	ret = hv_call_initialize_partition(partition->pt_id);
	if (ret)
		goto withdraw_mem;

	partition->pt_initialized = true;

	return 0;

withdraw_mem:
	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);

	return ret;
}

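/*
 * All partition ioctls are serialized on pt_mutex.  The passthru hypercall
 * path relies on this: it is the only caller that passes
 * partition_locked == true, which is what permits async hypercalls.
 */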
static long
mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct mshv_partition *partition = filp->private_data;
	long ret;
	void __user *uarg = (void __user *)arg;

	if (mutex_lock_killable(&partition->pt_mutex))
		return -EINTR;

	switch (ioctl) {
	case MSHV_INITIALIZE_PARTITION:
		ret = mshv_partition_ioctl_initialize(partition);
		break;
	case MSHV_SET_GUEST_MEMORY:
		ret = mshv_partition_ioctl_set_memory(partition, uarg);
		break;
	case MSHV_CREATE_VP:
		ret = mshv_partition_ioctl_create_vp(partition, uarg);
		break;
	case MSHV_IRQFD:
		ret = mshv_partition_ioctl_irqfd(partition, uarg);
		break;
	case MSHV_IOEVENTFD:
		ret = mshv_partition_ioctl_ioeventfd(partition, uarg);
		break;
	case MSHV_SET_MSI_ROUTING:
		ret = mshv_partition_ioctl_set_msi_routing(partition, uarg);
		break;
	case MSHV_GET_GPAP_ACCESS_BITMAP:
		ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition,
								  uarg);
		break;
	case MSHV_ROOT_HVCALL:
		ret = mshv_ioctl_passthru_hvcall(partition, true, uarg);
		break;
	default:
		ret = -ENOTTY;
	}

	mutex_unlock(&partition->pt_mutex);
	return ret;
}

static int
disable_vp_dispatch(struct mshv_vp *vp)
{
	int ret;
	struct hv_register_assoc dispatch_suspend = {
		.name = HV_REGISTER_DISPATCH_SUSPEND,
		.value.dispatch_suspend.suspended = 1,
	};

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &dispatch_suspend);
	if (ret)
		vp_err(vp, "failed to suspend\n");

	return ret;
}

static int
get_vp_signaled_count(struct mshv_vp *vp, u64 *count)
{
	int ret;
	struct hv_register_assoc root_signal_count = {
		.name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT,
	};

	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &root_signal_count);

	if (ret) {
		vp_err(vp, "Failed to get root signal count");
		*count = 0;
		return ret;
	}

	*count = root_signal_count.value.reg64;

	return ret;
}

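/*
 * Wait until the number of signals the hypervisor reports for the VP
 * (HV_REGISTER_VP_ROOT_SIGNAL_COUNT) matches the number this driver has
 * consumed (vp_signaled_count), so no wakeup is left pending when the VP is
 * torn down.
 */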
static void
drain_vp_signals(struct mshv_vp *vp)
{
	u64 hv_signal_count;
	u64 vp_signal_count;

	get_vp_signaled_count(vp, &hv_signal_count);

	vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);

	/*
	 * There should be at most 1 outstanding notification, but be extra
	 * careful anyway.
	 */
	while (hv_signal_count != vp_signal_count) {
		WARN_ON(hv_signal_count - vp_signal_count != 1);

		if (wait_event_interruptible(vp->run.vp_suspend_queue,
					     vp->run.kicked_by_hv == 1))
			break;
		vp->run.kicked_by_hv = 0;
		vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
	}
}

static void drain_all_vps(const struct mshv_partition *partition)
{
	int i;
	struct mshv_vp *vp;

	/*
	 * VPs are reachable from ISR. It is safe to not take the partition
	 * lock because nobody else can enter this function and drop the
	 * partition from the list.
	 */
	for (i = 0; i < MSHV_MAX_VPS; i++) {
		vp = partition->pt_vp_array[i];
		if (!vp)
			continue;
		/*
		 * Disable dispatching of the VP in the hypervisor. After this
		 * the hypervisor guarantees it won't generate any signals for
		 * the VP and the hypervisor's VP signal count won't change.
		 */
		disable_vp_dispatch(vp);
		drain_vp_signals(vp);
	}
}

static void
remove_partition(struct mshv_partition *partition)
{
	spin_lock(&mshv_root.pt_ht_lock);
	hlist_del_rcu(&partition->pt_hnode);
	spin_unlock(&mshv_root.pt_ht_lock);

	synchronize_rcu();
}

/*
 * Tear down a partition and remove it from the list.
 * Partition's refcount must be 0
 */
static void destroy_partition(struct mshv_partition *partition)
{
	struct mshv_vp *vp;
	struct mshv_mem_region *region;
	struct hlist_node *n;
	int i;

	if (refcount_read(&partition->pt_ref_count)) {
		pt_err(partition,
		       "Attempt to destroy partition but refcount > 0\n");
		return;
	}

	if (partition->pt_initialized) {
		/*
		 * We only need to drain signals for root scheduler. This should be
		 * done before removing the partition from the partition list.
		 */
		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
			drain_all_vps(partition);

		/* Remove vps */
		for (i = 0; i < MSHV_MAX_VPS; ++i) {
			vp = partition->pt_vp_array[i];
			if (!vp)
				continue;

			if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
				mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
						    (void **)vp->vp_stats_pages);

			if (vp->vp_register_page) {
				(void)hv_unmap_vp_state_page(partition->pt_id,
							     vp->vp_index,
							     HV_VP_STATE_PAGE_REGISTERS,
							     virt_to_page(vp->vp_register_page),
							     input_vtl_zero);
				vp->vp_register_page = NULL;
			}

			(void)hv_unmap_vp_state_page(partition->pt_id,
						     vp->vp_index,
						     HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
						     virt_to_page(vp->vp_intercept_msg_page),
						     input_vtl_zero);
			vp->vp_intercept_msg_page = NULL;

			if (vp->vp_ghcb_page) {
				(void)hv_unmap_vp_state_page(partition->pt_id,
							     vp->vp_index,
							     HV_VP_STATE_PAGE_GHCB,
							     virt_to_page(vp->vp_ghcb_page),
							     input_vtl_normal);
				vp->vp_ghcb_page = NULL;
			}

			kfree(vp);

			partition->pt_vp_array[i] = NULL;
		}

		/* Deallocates and unmaps everything including vcpus, GPA mappings etc */
		hv_call_finalize_partition(partition->pt_id);

		partition->pt_initialized = false;
	}

	remove_partition(partition);

	hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
				  hnode) {
		hlist_del(&region->hnode);
		mshv_region_put(region);
	}

	/* Withdraw and free all pages we deposited */
	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
	hv_call_delete_partition(partition->pt_id);

	mshv_free_routing_table(partition);
	kfree(partition);
}

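/*
 * Partition lifetime is reference counted: the partition fd holds the
 * initial reference and every VP fd takes one via mshv_partition_get().
 * When the last reference is dropped, mshv_partition_put() calls
 * destroy_partition().
 */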
struct
mshv_partition *mshv_partition_get(struct mshv_partition *partition)
{
	if (refcount_inc_not_zero(&partition->pt_ref_count))
		return partition;
	return NULL;
}

struct
mshv_partition *mshv_partition_find(u64 partition_id)
	__must_hold(RCU)
{
	struct mshv_partition *p;

	hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode,
				   partition_id)
		if (p->pt_id == partition_id)
			return p;

	return NULL;
}

void
mshv_partition_put(struct mshv_partition *partition)
{
	if (refcount_dec_and_test(&partition->pt_ref_count))
		destroy_partition(partition);
}

static int
mshv_partition_release(struct inode *inode, struct file *filp)
{
	struct mshv_partition *partition = filp->private_data;

	mshv_eventfd_release(partition);

	cleanup_srcu_struct(&partition->pt_irq_srcu);

	mshv_partition_put(partition);

	return 0;
}

static int
add_partition(struct mshv_partition *partition)
{
	spin_lock(&mshv_root.pt_ht_lock);

	hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode,
		     partition->pt_id);

	spin_unlock(&mshv_root.pt_ht_lock);

	return 0;
}

static_assert(MSHV_NUM_CPU_FEATURES_BANKS ==
	      HV_PARTITION_PROCESSOR_FEATURES_BANKS);

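/*
 * Parse the MSHV_CREATE_PARTITION argument.  The v1 struct is copied first;
 * if the MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES flag is set, the full v2 struct
 * (with disabled processor / xsave feature banks) is copied and re-validated.
 * The result is translated into hypervisor creation flags and properties.
 */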
static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags,
					struct hv_partition_creation_properties *cr_props,
					union hv_partition_isolation_properties *isol_props)
{
	int i;
	struct mshv_create_partition_v2 args;
	union hv_partition_processor_features *disabled_procs;
	union hv_partition_processor_xsave_features *disabled_xsave;

	/* First, copy v1 struct in case user is on previous versions */
	if (copy_from_user(&args, user_arg,
			   sizeof(struct mshv_create_partition)))
		return -EFAULT;

	if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
	    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
		return -EINVAL;

	disabled_procs = &cr_props->disabled_processor_features;
	disabled_xsave = &cr_props->disabled_processor_xsave_features;

	/* Check if user provided newer struct with feature fields */
	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) {
		if (copy_from_user(&args, user_arg, sizeof(args)))
			return -EFAULT;

		/* Re-validate v1 fields after second copy_from_user() */
		if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
		    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
			return -EINVAL;

		if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS ||
		    mshv_field_nonzero(args, pt_rsvd) ||
		    mshv_field_nonzero(args, pt_rsvd1))
			return -EINVAL;

		/*
		 * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never
		 * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS
		 * (i.e. 2).
		 *
		 * Further banks (index >= 2) will be modifiable as 'early'
		 * properties via the set partition property hypercall.
		 */
		for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
			disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i];

#if IS_ENABLED(CONFIG_X86_64)
		disabled_xsave->as_uint64 = args.pt_disabled_xsave;
#else
		/*
		 * In practice this field is ignored on arm64, but safer to
		 * zero it in case it is ever used.
		 */
		disabled_xsave->as_uint64 = 0;

		if (mshv_field_nonzero(args, pt_rsvd2))
			return -EINVAL;
#endif
	} else {
		/*
		 * v1 behavior: try to enable everything. The hypervisor will
		 * disable features that are not supported. The banks can be
		 * queried via the get partition property hypercall.
		 */
		for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
			disabled_procs->as_uint64[i] = 0;

		disabled_xsave->as_uint64 = 0;
	}

	/* Only support EXO partitions */
	*pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
		    HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;

	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC))
		*pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC))
		*pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES))
		*pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;

	isol_props->as_uint64 = 0;

	switch (args.pt_isolation) {
	case MSHV_PT_ISOLATION_NONE:
		isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE;
		break;
	}

	return 0;
}

static long
mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
{
	u64 creation_flags;
	struct hv_partition_creation_properties creation_properties;
	union hv_partition_isolation_properties isolation_properties;
	struct mshv_partition *partition;
	long ret;

	ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags,
					  &creation_properties,
					  &isolation_properties);
	if (ret)
		return ret;

	partition = kzalloc(sizeof(*partition), GFP_KERNEL);
	if (!partition)
		return -ENOMEM;

	partition->pt_module_dev = module_dev;
	partition->isolation_type = isolation_properties.isolation_type;

	refcount_set(&partition->pt_ref_count, 1);

	mutex_init(&partition->pt_mutex);

	mutex_init(&partition->pt_irq_lock);

	init_completion(&partition->async_hypercall);

	INIT_HLIST_HEAD(&partition->irq_ack_notifier_list);

	INIT_HLIST_HEAD(&partition->pt_devices);

	spin_lock_init(&partition->pt_mem_regions_lock);
	INIT_HLIST_HEAD(&partition->pt_mem_regions);

	mshv_eventfd_init(partition);

	ret = init_srcu_struct(&partition->pt_irq_srcu);
	if (ret)
		goto free_partition;

	ret = hv_call_create_partition(creation_flags,
				       creation_properties,
				       isolation_properties,
				       &partition->pt_id);
	if (ret)
		goto cleanup_irq_srcu;

	ret = add_partition(partition);
	if (ret)
		goto delete_partition;

	ret = mshv_init_async_handler(partition);
	if (!ret) {
		ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition",
							   &mshv_partition_fops,
							   partition, O_RDWR));
		if (ret >= 0)
			return ret;
	}

	remove_partition(partition);
delete_partition:
	hv_call_delete_partition(partition->pt_id);
cleanup_irq_srcu:
	cleanup_srcu_struct(&partition->pt_irq_srcu);
free_partition:
	kfree(partition);

	return ret;
}

static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
			   unsigned long arg)
{
	struct miscdevice *misc = filp->private_data;

	switch (ioctl) {
	case MSHV_CREATE_PARTITION:
		return mshv_ioctl_create_partition((void __user *)arg,
						   misc->this_device);
	case MSHV_ROOT_HVCALL:
		return mshv_ioctl_passthru_hvcall(NULL, false,
						  (void __user *)arg);
	}

	return -ENOTTY;
}

static int
mshv_dev_open(struct inode *inode, struct file *filp)
{
	return 0;
}

static int
mshv_dev_release(struct inode *inode, struct file *filp)
{
	return 0;
}

static int
mshv_dev_open(struct inode *inode, struct file *filp)
{
	return 0;
}

static int
mshv_dev_release(struct inode *inode, struct file *filp)
{
	return 0;
}

static int mshv_cpuhp_online;
static int mshv_root_sched_online;

static const char *scheduler_type_to_string(enum hv_scheduler_type type)
{
	switch (type) {
	case HV_SCHEDULER_TYPE_LP:
		return "classic scheduler without SMT";
	case HV_SCHEDULER_TYPE_LP_SMT:
		return "classic scheduler with SMT";
	case HV_SCHEDULER_TYPE_CORE_SMT:
		return "core scheduler";
	case HV_SCHEDULER_TYPE_ROOT:
		return "root scheduler";
	default:
		return "unknown scheduler";
	}
}

/* TODO move this to hv_common.c when needed outside */
static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out)
{
	struct hv_input_get_system_property *input;
	struct hv_output_get_system_property *output;
	unsigned long flags;
	u64 status;

	local_irq_save(flags);
	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	output = *this_cpu_ptr(hyperv_pcpu_output_arg);

	memset(input, 0, sizeof(*input));
	memset(output, 0, sizeof(*output));
	input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE;

	status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
	if (!hv_result_success(status)) {
		local_irq_restore(flags);
		pr_err("%s: %s\n", __func__, hv_result_to_string(status));
		return hv_result_to_errno(status);
	}

	*out = output->scheduler_type;
	local_irq_restore(flags);

	return 0;
}

/* Retrieve and stash the supported scheduler type */
static int __init mshv_retrieve_scheduler_type(struct device *dev)
{
	int ret = 0;

	if (hv_l1vh_partition())
		hv_scheduler_type = HV_SCHEDULER_TYPE_CORE_SMT;
	else
		ret = hv_retrieve_scheduler_type(&hv_scheduler_type);

	if (ret)
		return ret;

	dev_info(dev, "Hypervisor using %s\n",
		 scheduler_type_to_string(hv_scheduler_type));

	switch (hv_scheduler_type) {
	case HV_SCHEDULER_TYPE_CORE_SMT:
	case HV_SCHEDULER_TYPE_LP_SMT:
	case HV_SCHEDULER_TYPE_ROOT:
	case HV_SCHEDULER_TYPE_LP:
		/* Supported scheduler, nothing to do */
		break;
	default:
		dev_err(dev, "unsupported scheduler 0x%x, bailing.\n",
			hv_scheduler_type);
		return -EOPNOTSUPP;
	}

	return 0;
}

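/*
 * Root scheduler support: when the hypervisor runs the root scheduler, each
 * CPU gets a dedicated pair of hypercall input/output buffers, separate from
 * the generic hyperv_pcpu_input_arg/hyperv_pcpu_output_arg pages. The cpuhp
 * callbacks below allocate and free those buffers as CPUs come online and go
 * offline. A consumer would access them roughly like this (a sketch, assuming
 * it runs with interrupts disabled so the CPU cannot change underneath it;
 * SOME_SCHEDULER_HVCALL is a placeholder, not a real hypercall code):
 *
 *	void *in = *(void **)this_cpu_ptr(root_scheduler_input);
 *	void *out = *(void **)this_cpu_ptr(root_scheduler_output);
 *	u64 status = hv_do_hypercall(SOME_SCHEDULER_HVCALL, in, out);
 */
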
static int mshv_root_scheduler_init(unsigned int cpu)
{
	void **inputarg, **outputarg, *p;

	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
	outputarg = (void **)this_cpu_ptr(root_scheduler_output);

	/* Allocate two consecutive pages. One for input, one for output. */
	p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	*inputarg = p;
	*outputarg = (char *)p + HV_HYP_PAGE_SIZE;

	return 0;
}

static int mshv_root_scheduler_cleanup(unsigned int cpu)
{
	void *p, **inputarg, **outputarg;

	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
	outputarg = (void **)this_cpu_ptr(root_scheduler_output);

	p = *inputarg;

	*inputarg = NULL;
	*outputarg = NULL;

	kfree(p);

	return 0;
}

/* Must be called after retrieving the scheduler type */
static int
root_scheduler_init(struct device *dev)
{
	int ret;

	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
		return 0;

	root_scheduler_input = alloc_percpu(void *);
	root_scheduler_output = alloc_percpu(void *);

	if (!root_scheduler_input || !root_scheduler_output) {
		dev_err(dev, "Failed to allocate root scheduler buffers\n");
		ret = -ENOMEM;
		goto out;
	}

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched",
				mshv_root_scheduler_init,
				mshv_root_scheduler_cleanup);

	if (ret < 0) {
		dev_err(dev, "Failed to setup root scheduler state: %i\n", ret);
		goto out;
	}

	mshv_root_sched_online = ret;

	return 0;

out:
	free_percpu(root_scheduler_input);
	free_percpu(root_scheduler_output);
	return ret;
}

static void
root_scheduler_deinit(void)
{
	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
		return;

	cpuhp_remove_state(mshv_root_sched_online);
	free_percpu(root_scheduler_input);
	free_percpu(root_scheduler_output);
}

static int mshv_reboot_notify(struct notifier_block *nb,
			      unsigned long code, void *unused)
{
	cpuhp_remove_state(mshv_cpuhp_online);
	return 0;
}

struct notifier_block mshv_reboot_nb = {
	.notifier_call = mshv_reboot_notify,
};

static void mshv_root_partition_exit(void)
{
	unregister_reboot_notifier(&mshv_reboot_nb);
	root_scheduler_deinit();
}

static int __init mshv_root_partition_init(struct device *dev)
{
	int err;

	err = root_scheduler_init(dev);
	if (err)
		return err;

	err = register_reboot_notifier(&mshv_reboot_nb);
	if (err)
		goto root_sched_deinit;

	return 0;

root_sched_deinit:
	root_scheduler_deinit();
	return err;
}

2243 */ 2244 if (hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF, 2245 HV_PARTITION_PROPERTY_VMM_CAPABILITIES, 2246 0, &mshv_root.vmm_caps, 2247 sizeof(mshv_root.vmm_caps))) 2248 dev_warn(dev, "Unable to get VMM capabilities\n"); 2249 2250 dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]); 2251 } 2252 2253 static int __init mshv_parent_partition_init(void) 2254 { 2255 int ret; 2256 struct device *dev; 2257 union hv_hypervisor_version_info version_info; 2258 2259 if (!hv_parent_partition() || is_kdump_kernel()) 2260 return -ENODEV; 2261 2262 if (hv_get_hypervisor_version(&version_info)) 2263 return -ENODEV; 2264 2265 ret = misc_register(&mshv_dev); 2266 if (ret) 2267 return ret; 2268 2269 dev = mshv_dev.this_device; 2270 2271 if (version_info.build_number < MSHV_HV_MIN_VERSION || 2272 version_info.build_number > MSHV_HV_MAX_VERSION) { 2273 dev_err(dev, "Running on unvalidated Hyper-V version\n"); 2274 dev_err(dev, "Versions: current: %u min: %u max: %u\n", 2275 version_info.build_number, MSHV_HV_MIN_VERSION, 2276 MSHV_HV_MAX_VERSION); 2277 } 2278 2279 mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages); 2280 if (!mshv_root.synic_pages) { 2281 dev_err(dev, "Failed to allocate percpu synic page\n"); 2282 ret = -ENOMEM; 2283 goto device_deregister; 2284 } 2285 2286 ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic", 2287 mshv_synic_init, 2288 mshv_synic_cleanup); 2289 if (ret < 0) { 2290 dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret); 2291 goto free_synic_pages; 2292 } 2293 2294 mshv_cpuhp_online = ret; 2295 2296 ret = mshv_retrieve_scheduler_type(dev); 2297 if (ret) 2298 goto remove_cpu_state; 2299 2300 if (hv_root_partition()) 2301 ret = mshv_root_partition_init(dev); 2302 if (ret) 2303 goto remove_cpu_state; 2304 2305 mshv_init_vmm_caps(dev); 2306 2307 ret = mshv_irqfd_wq_init(); 2308 if (ret) 2309 goto exit_partition; 2310 2311 spin_lock_init(&mshv_root.pt_ht_lock); 2312 hash_init(mshv_root.pt_htable); 2313 2314 hv_setup_mshv_handler(mshv_isr); 2315 2316 return 0; 2317 2318 exit_partition: 2319 if (hv_root_partition()) 2320 mshv_root_partition_exit(); 2321 remove_cpu_state: 2322 cpuhp_remove_state(mshv_cpuhp_online); 2323 free_synic_pages: 2324 free_percpu(mshv_root.synic_pages); 2325 device_deregister: 2326 misc_deregister(&mshv_dev); 2327 return ret; 2328 } 2329 2330 static void __exit mshv_parent_partition_exit(void) 2331 { 2332 hv_setup_mshv_handler(NULL); 2333 mshv_port_table_fini(); 2334 misc_deregister(&mshv_dev); 2335 mshv_irqfd_wq_cleanup(); 2336 if (hv_root_partition()) 2337 mshv_root_partition_exit(); 2338 cpuhp_remove_state(mshv_cpuhp_online); 2339 free_percpu(mshv_root.synic_pages); 2340 } 2341 2342 module_init(mshv_parent_partition_init); 2343 module_exit(mshv_parent_partition_exit); 2344