1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (c) 2024, Microsoft Corporation. 4 * 5 * The main part of the mshv_root module, providing APIs to create 6 * and manage guest partitions. 7 * 8 * Authors: Microsoft Linux virtualization team 9 */ 10 11 #include <linux/entry-virt.h> 12 #include <linux/kernel.h> 13 #include <linux/module.h> 14 #include <linux/fs.h> 15 #include <linux/miscdevice.h> 16 #include <linux/slab.h> 17 #include <linux/file.h> 18 #include <linux/anon_inodes.h> 19 #include <linux/mm.h> 20 #include <linux/io.h> 21 #include <linux/cpuhotplug.h> 22 #include <linux/random.h> 23 #include <asm/mshyperv.h> 24 #include <linux/hyperv.h> 25 #include <linux/notifier.h> 26 #include <linux/reboot.h> 27 #include <linux/kexec.h> 28 #include <linux/page-flags.h> 29 #include <linux/crash_dump.h> 30 #include <linux/panic_notifier.h> 31 #include <linux/vmalloc.h> 32 #include <linux/rseq.h> 33 34 #include "mshv_eventfd.h" 35 #include "mshv.h" 36 #include "mshv_root.h" 37 38 MODULE_AUTHOR("Microsoft"); 39 MODULE_LICENSE("GPL"); 40 MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv"); 41 42 /* HV_THREAD_COUNTER */ 43 #if defined(CONFIG_X86_64) 44 #define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 202 45 #elif defined(CONFIG_ARM64) 46 #define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 95 47 #endif 48 49 struct mshv_root mshv_root; 50 51 enum hv_scheduler_type hv_scheduler_type; 52 53 /* Once we implement the fast extended hypercall ABI they can go away. 
*/ 54 static void * __percpu *root_scheduler_input; 55 static void * __percpu *root_scheduler_output; 56 57 static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); 58 static int mshv_dev_open(struct inode *inode, struct file *filp); 59 static int mshv_dev_release(struct inode *inode, struct file *filp); 60 static int mshv_vp_release(struct inode *inode, struct file *filp); 61 static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); 62 static int mshv_partition_release(struct inode *inode, struct file *filp); 63 static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); 64 static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma); 65 static vm_fault_t mshv_vp_fault(struct vm_fault *vmf); 66 static int mshv_init_async_handler(struct mshv_partition *partition); 67 static void mshv_async_hvcall_handler(void *data, u64 *status); 68 69 static const union hv_input_vtl input_vtl_zero; 70 static const union hv_input_vtl input_vtl_normal = { 71 .target_vtl = HV_NORMAL_VTL, 72 .use_target_vtl = 1, 73 }; 74 75 static const struct vm_operations_struct mshv_vp_vm_ops = { 76 .fault = mshv_vp_fault, 77 }; 78 79 static const struct file_operations mshv_vp_fops = { 80 .owner = THIS_MODULE, 81 .release = mshv_vp_release, 82 .unlocked_ioctl = mshv_vp_ioctl, 83 .llseek = noop_llseek, 84 .mmap = mshv_vp_mmap, 85 }; 86 87 static const struct file_operations mshv_partition_fops = { 88 .owner = THIS_MODULE, 89 .release = mshv_partition_release, 90 .unlocked_ioctl = mshv_partition_ioctl, 91 .llseek = noop_llseek, 92 }; 93 94 static const struct file_operations mshv_dev_fops = { 95 .owner = THIS_MODULE, 96 .open = mshv_dev_open, 97 .release = mshv_dev_release, 98 .unlocked_ioctl = mshv_dev_ioctl, 99 .llseek = noop_llseek, 100 }; 101 102 static struct miscdevice mshv_dev = { 103 .minor = MISC_DYNAMIC_MINOR, 104 .name = "mshv", 105 .fops = &mshv_dev_fops, 106 .mode = 0600, 107 }; 108 109 /* 
110 * Only allow hypercalls that have a u64 partition id as the first member of 111 * the input structure. 112 * These are sorted by value. 113 */ 114 static u16 mshv_passthru_hvcalls[] = { 115 HVCALL_GET_PARTITION_PROPERTY, 116 HVCALL_GET_PARTITION_PROPERTY_EX, 117 HVCALL_SET_PARTITION_PROPERTY, 118 HVCALL_INSTALL_INTERCEPT, 119 HVCALL_GET_VP_REGISTERS, 120 HVCALL_SET_VP_REGISTERS, 121 HVCALL_TRANSLATE_VIRTUAL_ADDRESS, 122 HVCALL_CLEAR_VIRTUAL_INTERRUPT, 123 HVCALL_SCRUB_PARTITION, 124 HVCALL_REGISTER_INTERCEPT_RESULT, 125 HVCALL_ASSERT_VIRTUAL_INTERRUPT, 126 HVCALL_GET_GPA_PAGES_ACCESS_STATES, 127 HVCALL_SIGNAL_EVENT_DIRECT, 128 HVCALL_POST_MESSAGE_DIRECT, 129 HVCALL_GET_VP_CPUID_VALUES, 130 }; 131 132 /* 133 * Only allow hypercalls that are safe to be called by the VMM with the host 134 * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a 135 * hypercall cannot be misused by the VMM before adding it to this list. 136 */ 137 static u16 mshv_self_passthru_hvcalls[] = { 138 HVCALL_GET_PARTITION_PROPERTY, 139 HVCALL_GET_PARTITION_PROPERTY_EX, 140 }; 141 142 static bool mshv_hvcall_is_async(u16 code) 143 { 144 switch (code) { 145 case HVCALL_SET_PARTITION_PROPERTY: 146 return true; 147 default: 148 break; 149 } 150 return false; 151 } 152 153 static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id) 154 { 155 int i; 156 int n = ARRAY_SIZE(mshv_passthru_hvcalls); 157 u16 *allowed_hvcalls = mshv_passthru_hvcalls; 158 159 if (pt_id == HV_PARTITION_ID_SELF) { 160 n = ARRAY_SIZE(mshv_self_passthru_hvcalls); 161 allowed_hvcalls = mshv_self_passthru_hvcalls; 162 } 163 164 for (i = 0; i < n; ++i) 165 if (allowed_hvcalls[i] == code) 166 return true; 167 168 return false; 169 } 170 171 static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, 172 bool partition_locked, 173 void __user *user_args) 174 { 175 u64 status; 176 int ret = 0; 177 bool is_async; 178 struct mshv_root_hvcall args; 179 struct page *page; 180 unsigned int pages_order; 
181 void *input_pg = NULL; 182 void *output_pg = NULL; 183 u16 reps_completed; 184 u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF; 185 186 if (copy_from_user(&args, user_args, sizeof(args))) 187 return -EFAULT; 188 189 if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) || 190 mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE) 191 return -EINVAL; 192 193 if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE)) 194 return -EINVAL; 195 196 if (!mshv_passthru_hvcall_allowed(args.code, pt_id)) 197 return -EINVAL; 198 199 is_async = mshv_hvcall_is_async(args.code); 200 if (is_async) { 201 /* async hypercalls can only be called from partition fd */ 202 if (!partition || !partition_locked) 203 return -EINVAL; 204 ret = mshv_init_async_handler(partition); 205 if (ret) 206 return ret; 207 } 208 209 pages_order = args.out_ptr ? 1 : 0; 210 page = alloc_pages(GFP_KERNEL, pages_order); 211 if (!page) 212 return -ENOMEM; 213 input_pg = page_address(page); 214 215 if (args.out_ptr) 216 output_pg = (char *)input_pg + PAGE_SIZE; 217 else 218 output_pg = NULL; 219 220 if (copy_from_user(input_pg, (void __user *)args.in_ptr, 221 args.in_sz)) { 222 ret = -EFAULT; 223 goto free_pages_out; 224 } 225 226 /* 227 * NOTE: This only works because all the allowed hypercalls' input 228 * structs begin with a u64 partition_id field. 229 */ 230 *(u64 *)input_pg = pt_id; 231 232 reps_completed = 0; 233 do { 234 if (args.reps) { 235 status = hv_do_rep_hypercall_ex(args.code, args.reps, 236 0, reps_completed, 237 input_pg, output_pg); 238 reps_completed = hv_repcomp(status); 239 } else { 240 status = hv_do_hypercall(args.code, input_pg, output_pg); 241 } 242 243 if (hv_result(status) == HV_STATUS_CALL_PENDING) { 244 if (is_async) { 245 mshv_async_hvcall_handler(partition, &status); 246 } else { /* Paranoia check. This shouldn't happen! 
*/ 247 ret = -EBADFD; 248 goto free_pages_out; 249 } 250 } 251 252 if (hv_result_success(status)) 253 break; 254 255 if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) 256 ret = hv_result_to_errno(status); 257 else 258 ret = hv_call_deposit_pages(NUMA_NO_NODE, 259 pt_id, 1); 260 } while (!ret); 261 262 args.status = hv_result(status); 263 args.reps = reps_completed; 264 if (copy_to_user(user_args, &args, sizeof(args))) 265 ret = -EFAULT; 266 267 if (!ret && output_pg && 268 copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz)) 269 ret = -EFAULT; 270 271 free_pages_out: 272 free_pages((unsigned long)input_pg, pages_order); 273 274 return ret; 275 } 276 277 static inline bool is_ghcb_mapping_available(void) 278 { 279 #if IS_ENABLED(CONFIG_X86_64) 280 return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE; 281 #else 282 return 0; 283 #endif 284 } 285 286 static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count, 287 struct hv_register_assoc *registers) 288 { 289 return hv_call_get_vp_registers(vp_index, partition_id, 290 count, input_vtl_zero, registers); 291 } 292 293 static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count, 294 struct hv_register_assoc *registers) 295 { 296 return hv_call_set_vp_registers(vp_index, partition_id, 297 count, input_vtl_zero, registers); 298 } 299 300 /* 301 * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by 302 * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend, 303 * done by the hypervisor. 304 * "Intercept" suspend leads to asynchronous message delivery to dom0 which 305 * should be awaited to keep the VP loop consistent (i.e. no message pending 306 * upon VP resume). 307 * VP intercept suspend can't be done when the VP is explicitly suspended 308 * already, and thus can be only two possible race scenarios: 309 * 1. implicit suspend bit set -> explicit suspend bit set -> message sent 310 * 2. 
implicit suspend bit set -> message sent -> explicit suspend bit set
 * Checking for implicit suspend bit set after explicit suspend request has
 * succeeded in either case allows us to reliably identify, if there is a
 * message to receive and deliver to VMM.
 */
static int
mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
{
	/* Register write that requests the explicit suspend */
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND,
		.value.explicit_suspend.suspended = 1,
	};
	/* Register read back afterwards to detect an intercept suspend */
	struct hv_register_assoc intercept_suspend = {
		.name = HV_REGISTER_INTERCEPT_SUSPEND
	};
	u64 pt_id = vp->vp_partition->pt_id;
	int err;

	err = mshv_set_vp_registers(vp->vp_index, pt_id, 1, &explicit_suspend);
	if (err) {
		vp_err(vp, "Failed to explicitly suspend vCPU\n");
		return err;
	}

	err = mshv_get_vp_registers(vp->vp_index, pt_id, 1, &intercept_suspend);
	if (err) {
		vp_err(vp, "Failed to get intercept suspend state\n");
		return err;
	}

	/* Only reported on success; untouched on any error path above */
	*message_in_flight = intercept_suspend.value.intercept_suspend.suspended;

	return 0;
}

/*
 * This function is used when VPs are scheduled by the hypervisor's
 * scheduler.
 *
 * Caller has to make sure the registers contain cleared
 * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers
 * exactly in this order (the hypervisor clears them sequentially) to avoid
 * potential invalid clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND
 * after VP is released from HV_REGISTER_EXPLICIT_SUSPEND in case of the
 * opposite order.
361 */ 362 static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp) 363 { 364 long ret; 365 struct hv_register_assoc suspend_regs[2] = { 366 { .name = HV_REGISTER_INTERCEPT_SUSPEND }, 367 { .name = HV_REGISTER_EXPLICIT_SUSPEND } 368 }; 369 size_t count = ARRAY_SIZE(suspend_regs); 370 371 /* Resume VP execution */ 372 ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, 373 count, suspend_regs); 374 if (ret) { 375 vp_err(vp, "Failed to resume vp execution. %lx\n", ret); 376 return ret; 377 } 378 379 ret = wait_event_interruptible(vp->run.vp_suspend_queue, 380 vp->run.kicked_by_hv == 1); 381 if (ret) { 382 bool message_in_flight; 383 384 /* 385 * Otherwise the waiting was interrupted by a signal: suspend 386 * the vCPU explicitly and copy message in flight (if any). 387 */ 388 ret = mshv_suspend_vp(vp, &message_in_flight); 389 if (ret) 390 return ret; 391 392 /* Return if no message in flight */ 393 if (!message_in_flight) 394 return -EINTR; 395 396 /* Wait for the message in flight. */ 397 wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1); 398 } 399 400 /* 401 * Reset the flag to make the wait_event call above work 402 * next time. 
403 */ 404 vp->run.kicked_by_hv = 0; 405 406 return 0; 407 } 408 409 static int 410 mshv_vp_dispatch(struct mshv_vp *vp, u32 flags, 411 struct hv_output_dispatch_vp *res) 412 { 413 struct hv_input_dispatch_vp *input; 414 struct hv_output_dispatch_vp *output; 415 u64 status; 416 417 preempt_disable(); 418 input = *this_cpu_ptr(root_scheduler_input); 419 output = *this_cpu_ptr(root_scheduler_output); 420 421 memset(input, 0, sizeof(*input)); 422 memset(output, 0, sizeof(*output)); 423 424 input->partition_id = vp->vp_partition->pt_id; 425 input->vp_index = vp->vp_index; 426 input->time_slice = 0; /* Run forever until something happens */ 427 input->spec_ctrl = 0; /* TODO: set sensible flags */ 428 input->flags = flags; 429 430 vp->run.flags.root_sched_dispatched = 1; 431 status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output); 432 vp->run.flags.root_sched_dispatched = 0; 433 434 *res = *output; 435 preempt_enable(); 436 437 if (!hv_result_success(status)) 438 vp_err(vp, "%s: status %s\n", __func__, 439 hv_result_to_string(status)); 440 441 return hv_result_to_errno(status); 442 } 443 444 static int 445 mshv_vp_clear_explicit_suspend(struct mshv_vp *vp) 446 { 447 struct hv_register_assoc explicit_suspend = { 448 .name = HV_REGISTER_EXPLICIT_SUSPEND, 449 .value.explicit_suspend.suspended = 0, 450 }; 451 int ret; 452 453 ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, 454 1, &explicit_suspend); 455 456 if (ret) 457 vp_err(vp, "Failed to unsuspend\n"); 458 459 return ret; 460 } 461 462 #if IS_ENABLED(CONFIG_X86_64) 463 static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp) 464 { 465 if (!vp->vp_register_page) 466 return 0; 467 return vp->vp_register_page->interrupt_vectors.as_uint64; 468 } 469 #else 470 static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp) 471 { 472 return 0; 473 } 474 #endif 475 476 static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp) 477 { 478 struct hv_stats_page **stats = vp->vp_stats_pages; 479 u64 
*self_vp_cntrs = stats[HV_STATS_AREA_SELF]->data; 480 u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->data; 481 482 return parent_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED] || 483 self_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED]; 484 } 485 486 static int 487 mshv_vp_wait_for_hv_kick(struct mshv_vp *vp) 488 { 489 int ret; 490 491 ret = wait_event_interruptible(vp->run.vp_suspend_queue, 492 (vp->run.kicked_by_hv == 1 && 493 !mshv_vp_dispatch_thread_blocked(vp)) || 494 mshv_vp_interrupt_pending(vp)); 495 if (ret) 496 return -EINTR; 497 498 vp->run.flags.root_sched_blocked = 0; 499 vp->run.kicked_by_hv = 0; 500 501 return 0; 502 } 503 504 /* Must be called with interrupts enabled */ 505 static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp) 506 { 507 long ret; 508 509 if (vp->run.flags.root_sched_blocked) { 510 /* 511 * Dispatch state of this VP is blocked. Need to wait 512 * for the hypervisor to clear the blocked state before 513 * dispatching it. 514 */ 515 ret = mshv_vp_wait_for_hv_kick(vp); 516 if (ret) 517 return ret; 518 } 519 520 do { 521 u32 flags = 0; 522 struct hv_output_dispatch_vp output; 523 524 if (__xfer_to_guest_mode_work_pending()) { 525 ret = xfer_to_guest_mode_handle_work(); 526 if (ret) 527 break; 528 } 529 530 if (vp->run.flags.intercept_suspend) 531 flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND; 532 533 if (mshv_vp_interrupt_pending(vp)) 534 flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION; 535 536 ret = mshv_vp_dispatch(vp, flags, &output); 537 if (ret) 538 break; 539 540 vp->run.flags.intercept_suspend = 0; 541 542 if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) { 543 if (output.dispatch_event == 544 HV_VP_DISPATCH_EVENT_SUSPEND) { 545 /* 546 * TODO: remove the warning once VP canceling 547 * is supported 548 */ 549 WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count), 550 "%s: vp#%d: unexpected explicit suspend\n", 551 __func__, vp->vp_index); 552 /* 553 * Need to clear explicit suspend 
before 554 * dispatching. 555 * Explicit suspend is either: 556 * - set right after the first VP dispatch or 557 * - set explicitly via hypercall 558 * Since the latter case is not yet supported, 559 * simply clear it here. 560 */ 561 ret = mshv_vp_clear_explicit_suspend(vp); 562 if (ret) 563 break; 564 565 ret = mshv_vp_wait_for_hv_kick(vp); 566 if (ret) 567 break; 568 } else { 569 vp->run.flags.root_sched_blocked = 1; 570 ret = mshv_vp_wait_for_hv_kick(vp); 571 if (ret) 572 break; 573 } 574 } else { 575 /* HV_VP_DISPATCH_STATE_READY */ 576 if (output.dispatch_event == 577 HV_VP_DISPATCH_EVENT_INTERCEPT) 578 vp->run.flags.intercept_suspend = 1; 579 } 580 } while (!vp->run.flags.intercept_suspend); 581 582 rseq_virt_userspace_exit(); 583 584 return ret; 585 } 586 587 static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ, 588 "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ"); 589 590 static struct mshv_mem_region * 591 mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) 592 { 593 struct mshv_mem_region *region; 594 595 hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { 596 if (gfn >= region->start_gfn && 597 gfn < region->start_gfn + region->nr_pages) 598 return region; 599 } 600 601 return NULL; 602 } 603 604 static struct mshv_mem_region * 605 mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn) 606 { 607 struct mshv_mem_region *region; 608 609 spin_lock(&p->pt_mem_regions_lock); 610 region = mshv_partition_region_by_gfn(p, gfn); 611 if (!region || !mshv_region_get(region)) { 612 spin_unlock(&p->pt_mem_regions_lock); 613 return NULL; 614 } 615 spin_unlock(&p->pt_mem_regions_lock); 616 617 return region; 618 } 619 620 /** 621 * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts. 622 * @vp: Pointer to the virtual processor structure. 
623 * 624 * This function processes GPA intercepts by identifying the memory region 625 * corresponding to the intercepted GPA, aligning the page offset, and 626 * mapping the required pages. It ensures that the region is valid and 627 * handles faults efficiently by mapping multiple pages at once. 628 * 629 * Return: true if the intercept was handled successfully, false otherwise. 630 */ 631 static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) 632 { 633 struct mshv_partition *p = vp->vp_partition; 634 struct mshv_mem_region *region; 635 bool ret; 636 u64 gfn; 637 #if defined(CONFIG_X86_64) 638 struct hv_x64_memory_intercept_message *msg = 639 (struct hv_x64_memory_intercept_message *) 640 vp->vp_intercept_msg_page->u.payload; 641 #elif defined(CONFIG_ARM64) 642 struct hv_arm64_memory_intercept_message *msg = 643 (struct hv_arm64_memory_intercept_message *) 644 vp->vp_intercept_msg_page->u.payload; 645 #endif 646 647 gfn = HVPFN_DOWN(msg->guest_physical_address); 648 649 region = mshv_partition_region_by_gfn_get(p, gfn); 650 if (!region) 651 return false; 652 653 /* Only movable memory ranges are supported for GPA intercepts */ 654 if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE) 655 ret = mshv_region_handle_gfn_fault(region, gfn); 656 else 657 ret = false; 658 659 mshv_region_put(region); 660 661 return ret; 662 } 663 664 static bool mshv_vp_handle_intercept(struct mshv_vp *vp) 665 { 666 switch (vp->vp_intercept_msg_page->header.message_type) { 667 case HVMSG_GPA_INTERCEPT: 668 return mshv_handle_gpa_intercept(vp); 669 } 670 return false; 671 } 672 673 static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg) 674 { 675 long rc; 676 677 do { 678 if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) 679 rc = mshv_run_vp_with_root_scheduler(vp); 680 else 681 rc = mshv_run_vp_with_hyp_scheduler(vp); 682 } while (rc == 0 && mshv_vp_handle_intercept(vp)); 683 684 if (rc) 685 return rc; 686 687 if (copy_to_user(ret_msg, vp->vp_intercept_msg_page, 
688 sizeof(struct hv_message))) 689 rc = -EFAULT; 690 691 return rc; 692 } 693 694 static int 695 mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp, 696 struct hv_vp_state_data state_data, 697 unsigned long user_pfn, size_t page_count, 698 bool is_set) 699 { 700 int completed, ret = 0; 701 unsigned long check; 702 struct page **pages; 703 704 if (page_count > INT_MAX) 705 return -EINVAL; 706 /* 707 * Check the arithmetic for wraparound/overflow. 708 * The last page address in the buffer is: 709 * (user_pfn + (page_count - 1)) * PAGE_SIZE 710 */ 711 if (check_add_overflow(user_pfn, (page_count - 1), &check)) 712 return -EOVERFLOW; 713 if (check_mul_overflow(check, PAGE_SIZE, &check)) 714 return -EOVERFLOW; 715 716 /* Pin user pages so hypervisor can copy directly to them */ 717 pages = kcalloc(page_count, sizeof(struct page *), GFP_KERNEL); 718 if (!pages) 719 return -ENOMEM; 720 721 for (completed = 0; completed < page_count; completed += ret) { 722 unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE; 723 int remaining = page_count - completed; 724 725 ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE, 726 &pages[completed]); 727 if (ret < 0) { 728 vp_err(vp, "%s: Failed to pin user pages error %i\n", 729 __func__, ret); 730 goto unpin_pages; 731 } 732 } 733 734 if (is_set) 735 ret = hv_call_set_vp_state(vp->vp_index, 736 vp->vp_partition->pt_id, 737 state_data, page_count, pages, 738 0, NULL); 739 else 740 ret = hv_call_get_vp_state(vp->vp_index, 741 vp->vp_partition->pt_id, 742 state_data, page_count, pages, 743 NULL); 744 745 unpin_pages: 746 unpin_user_pages(pages, completed); 747 kfree(pages); 748 return ret; 749 } 750 751 static long 752 mshv_vp_ioctl_get_set_state(struct mshv_vp *vp, 753 struct mshv_get_set_vp_state __user *user_args, 754 bool is_set) 755 { 756 struct mshv_get_set_vp_state args; 757 long ret = 0; 758 union hv_output_get_vp_state vp_state; 759 u32 data_sz; 760 struct hv_vp_state_data state_data = {}; 761 762 if 
(copy_from_user(&args, user_args, sizeof(args))) 763 return -EFAULT; 764 765 if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) || 766 !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) || 767 !PAGE_ALIGNED(args.buf_ptr)) 768 return -EINVAL; 769 770 if (!access_ok((void __user *)args.buf_ptr, args.buf_sz)) 771 return -EFAULT; 772 773 switch (args.type) { 774 case MSHV_VP_STATE_LAPIC: 775 state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE; 776 data_sz = HV_HYP_PAGE_SIZE; 777 break; 778 case MSHV_VP_STATE_XSAVE: 779 { 780 u64 data_sz_64; 781 782 ret = hv_call_get_partition_property(vp->vp_partition->pt_id, 783 HV_PARTITION_PROPERTY_XSAVE_STATES, 784 &state_data.xsave.states.as_uint64); 785 if (ret) 786 return ret; 787 788 ret = hv_call_get_partition_property(vp->vp_partition->pt_id, 789 HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE, 790 &data_sz_64); 791 if (ret) 792 return ret; 793 794 data_sz = (u32)data_sz_64; 795 state_data.xsave.flags = 0; 796 /* Always request legacy states */ 797 state_data.xsave.states.legacy_x87 = 1; 798 state_data.xsave.states.legacy_sse = 1; 799 state_data.type = HV_GET_SET_VP_STATE_XSAVE; 800 break; 801 } 802 case MSHV_VP_STATE_SIMP: 803 state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE; 804 data_sz = HV_HYP_PAGE_SIZE; 805 break; 806 case MSHV_VP_STATE_SIEFP: 807 state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE; 808 data_sz = HV_HYP_PAGE_SIZE; 809 break; 810 case MSHV_VP_STATE_SYNTHETIC_TIMERS: 811 state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS; 812 data_sz = sizeof(vp_state.synthetic_timers_state); 813 break; 814 default: 815 return -EINVAL; 816 } 817 818 if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz))) 819 return -EFAULT; 820 821 if (data_sz > args.buf_sz) 822 return -EINVAL; 823 824 /* If the data is transmitted via pfns, delegate to helper */ 825 if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) { 826 unsigned long user_pfn = PFN_DOWN(args.buf_ptr); 827 size_t page_count = PFN_DOWN(args.buf_sz); 
828 829 return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn, 830 page_count, is_set); 831 } 832 833 /* Paranoia check - this shouldn't happen! */ 834 if (data_sz > sizeof(vp_state)) { 835 vp_err(vp, "Invalid vp state data size!\n"); 836 return -EINVAL; 837 } 838 839 if (is_set) { 840 if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz)) 841 return -EFAULT; 842 843 return hv_call_set_vp_state(vp->vp_index, 844 vp->vp_partition->pt_id, 845 state_data, 0, NULL, 846 sizeof(vp_state), (u8 *)&vp_state); 847 } 848 849 ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id, 850 state_data, 0, NULL, &vp_state); 851 if (ret) 852 return ret; 853 854 if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz)) 855 return -EFAULT; 856 857 return 0; 858 } 859 860 static long 861 mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) 862 { 863 struct mshv_vp *vp = filp->private_data; 864 long r = -ENOTTY; 865 866 if (mutex_lock_killable(&vp->vp_mutex)) 867 return -EINTR; 868 869 switch (ioctl) { 870 case MSHV_RUN_VP: 871 r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg); 872 break; 873 case MSHV_GET_VP_STATE: 874 r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false); 875 break; 876 case MSHV_SET_VP_STATE: 877 r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true); 878 break; 879 case MSHV_ROOT_HVCALL: 880 r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false, 881 (void __user *)arg); 882 break; 883 default: 884 vp_warn(vp, "Invalid ioctl: %#x\n", ioctl); 885 break; 886 } 887 mutex_unlock(&vp->vp_mutex); 888 889 return r; 890 } 891 892 static vm_fault_t mshv_vp_fault(struct vm_fault *vmf) 893 { 894 struct mshv_vp *vp = vmf->vma->vm_file->private_data; 895 896 switch (vmf->vma->vm_pgoff) { 897 case MSHV_VP_MMAP_OFFSET_REGISTERS: 898 vmf->page = virt_to_page(vp->vp_register_page); 899 break; 900 case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: 901 vmf->page = virt_to_page(vp->vp_intercept_msg_page); 902 break; 
903 case MSHV_VP_MMAP_OFFSET_GHCB: 904 vmf->page = virt_to_page(vp->vp_ghcb_page); 905 break; 906 default: 907 return VM_FAULT_SIGBUS; 908 } 909 910 get_page(vmf->page); 911 912 return 0; 913 } 914 915 static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma) 916 { 917 struct mshv_vp *vp = file->private_data; 918 919 switch (vma->vm_pgoff) { 920 case MSHV_VP_MMAP_OFFSET_REGISTERS: 921 if (!vp->vp_register_page) 922 return -ENODEV; 923 break; 924 case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: 925 if (!vp->vp_intercept_msg_page) 926 return -ENODEV; 927 break; 928 case MSHV_VP_MMAP_OFFSET_GHCB: 929 if (!vp->vp_ghcb_page) 930 return -ENODEV; 931 break; 932 default: 933 return -EINVAL; 934 } 935 936 vma->vm_ops = &mshv_vp_vm_ops; 937 return 0; 938 } 939 940 static int 941 mshv_vp_release(struct inode *inode, struct file *filp) 942 { 943 struct mshv_vp *vp = filp->private_data; 944 945 /* Rest of VP cleanup happens in destroy_partition() */ 946 mshv_partition_put(vp->vp_partition); 947 return 0; 948 } 949 950 void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index, 951 struct hv_stats_page *stats_pages[]) 952 { 953 union hv_stats_object_identity identity = { 954 .vp.partition_id = partition_id, 955 .vp.vp_index = vp_index, 956 }; 957 int err; 958 959 identity.vp.stats_area_type = HV_STATS_AREA_SELF; 960 err = hv_unmap_stats_page(HV_STATS_OBJECT_VP, 961 stats_pages[HV_STATS_AREA_SELF], 962 &identity); 963 if (err) 964 pr_err("%s: failed to unmap partition %llu vp %u self stats, err: %d\n", 965 __func__, partition_id, vp_index, err); 966 967 if (stats_pages[HV_STATS_AREA_PARENT] != stats_pages[HV_STATS_AREA_SELF]) { 968 identity.vp.stats_area_type = HV_STATS_AREA_PARENT; 969 err = hv_unmap_stats_page(HV_STATS_OBJECT_VP, 970 stats_pages[HV_STATS_AREA_PARENT], 971 &identity); 972 if (err) 973 pr_err("%s: failed to unmap partition %llu vp %u parent stats, err: %d\n", 974 __func__, partition_id, vp_index, err); 975 } 976 } 977 978 int mshv_vp_stats_map(u64 
partition_id, u32 vp_index, 979 struct hv_stats_page *stats_pages[]) 980 { 981 union hv_stats_object_identity identity = { 982 .vp.partition_id = partition_id, 983 .vp.vp_index = vp_index, 984 }; 985 int err; 986 987 identity.vp.stats_area_type = HV_STATS_AREA_SELF; 988 err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, 989 &stats_pages[HV_STATS_AREA_SELF]); 990 if (err) { 991 pr_err("%s: failed to map partition %llu vp %u self stats, err: %d\n", 992 __func__, partition_id, vp_index, err); 993 return err; 994 } 995 996 /* 997 * L1VH partition cannot access its vp stats in parent area. 998 */ 999 if (is_l1vh_parent(partition_id)) { 1000 stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF]; 1001 } else { 1002 identity.vp.stats_area_type = HV_STATS_AREA_PARENT; 1003 err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, 1004 &stats_pages[HV_STATS_AREA_PARENT]); 1005 if (err) { 1006 pr_err("%s: failed to map partition %llu vp %u parent stats, err: %d\n", 1007 __func__, partition_id, vp_index, err); 1008 goto unmap_self; 1009 } 1010 if (!stats_pages[HV_STATS_AREA_PARENT]) 1011 stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF]; 1012 } 1013 1014 return 0; 1015 1016 unmap_self: 1017 identity.vp.stats_area_type = HV_STATS_AREA_SELF; 1018 hv_unmap_stats_page(HV_STATS_OBJECT_VP, 1019 stats_pages[HV_STATS_AREA_SELF], 1020 &identity); 1021 return err; 1022 } 1023 1024 static long 1025 mshv_partition_ioctl_create_vp(struct mshv_partition *partition, 1026 void __user *arg) 1027 { 1028 struct mshv_create_vp args; 1029 struct mshv_vp *vp; 1030 struct page *intercept_msg_page, *register_page, *ghcb_page; 1031 struct hv_stats_page *stats_pages[2]; 1032 long ret; 1033 1034 if (copy_from_user(&args, arg, sizeof(args))) 1035 return -EFAULT; 1036 1037 if (args.vp_index >= MSHV_MAX_VPS) 1038 return -EINVAL; 1039 1040 if (partition->pt_vp_array[args.vp_index]) 1041 return -EEXIST; 1042 1043 ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, 
args.vp_index, 1044 0 /* Only valid for root partition VPs */); 1045 if (ret) 1046 return ret; 1047 1048 ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, 1049 HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, 1050 input_vtl_zero, &intercept_msg_page); 1051 if (ret) 1052 goto destroy_vp; 1053 1054 if (!mshv_partition_encrypted(partition)) { 1055 ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, 1056 HV_VP_STATE_PAGE_REGISTERS, 1057 input_vtl_zero, ®ister_page); 1058 if (ret) 1059 goto unmap_intercept_message_page; 1060 } 1061 1062 if (mshv_partition_encrypted(partition) && 1063 is_ghcb_mapping_available()) { 1064 ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, 1065 HV_VP_STATE_PAGE_GHCB, 1066 input_vtl_normal, &ghcb_page); 1067 if (ret) 1068 goto unmap_register_page; 1069 } 1070 1071 ret = mshv_vp_stats_map(partition->pt_id, args.vp_index, 1072 stats_pages); 1073 if (ret) 1074 goto unmap_ghcb_page; 1075 1076 vp = kzalloc(sizeof(*vp), GFP_KERNEL); 1077 if (!vp) 1078 goto unmap_stats_pages; 1079 1080 vp->vp_partition = mshv_partition_get(partition); 1081 if (!vp->vp_partition) { 1082 ret = -EBADF; 1083 goto free_vp; 1084 } 1085 1086 mutex_init(&vp->vp_mutex); 1087 init_waitqueue_head(&vp->run.vp_suspend_queue); 1088 atomic64_set(&vp->run.vp_signaled_count, 0); 1089 1090 vp->vp_index = args.vp_index; 1091 vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page); 1092 if (!mshv_partition_encrypted(partition)) 1093 vp->vp_register_page = page_to_virt(register_page); 1094 1095 if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) 1096 vp->vp_ghcb_page = page_to_virt(ghcb_page); 1097 1098 memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages)); 1099 1100 ret = mshv_debugfs_vp_create(vp); 1101 if (ret) 1102 goto put_partition; 1103 1104 /* 1105 * Keep anon_inode_getfd last: it installs fd in the file struct and 1106 * thus makes the state accessible in user space. 
1107 */ 1108 ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp, 1109 O_RDWR | O_CLOEXEC); 1110 if (ret < 0) 1111 goto remove_debugfs_vp; 1112 1113 /* already exclusive with the partition mutex for all ioctls */ 1114 partition->pt_vp_count++; 1115 partition->pt_vp_array[args.vp_index] = vp; 1116 1117 return ret; 1118 1119 remove_debugfs_vp: 1120 mshv_debugfs_vp_remove(vp); 1121 put_partition: 1122 mshv_partition_put(partition); 1123 free_vp: 1124 kfree(vp); 1125 unmap_stats_pages: 1126 mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages); 1127 unmap_ghcb_page: 1128 if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) 1129 hv_unmap_vp_state_page(partition->pt_id, args.vp_index, 1130 HV_VP_STATE_PAGE_GHCB, ghcb_page, 1131 input_vtl_normal); 1132 unmap_register_page: 1133 if (!mshv_partition_encrypted(partition)) 1134 hv_unmap_vp_state_page(partition->pt_id, args.vp_index, 1135 HV_VP_STATE_PAGE_REGISTERS, 1136 register_page, input_vtl_zero); 1137 unmap_intercept_message_page: 1138 hv_unmap_vp_state_page(partition->pt_id, args.vp_index, 1139 HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, 1140 intercept_msg_page, input_vtl_zero); 1141 destroy_vp: 1142 hv_call_delete_vp(partition->pt_id, args.vp_index); 1143 return ret; 1144 } 1145 1146 static int mshv_init_async_handler(struct mshv_partition *partition) 1147 { 1148 if (completion_done(&partition->async_hypercall)) { 1149 pt_err(partition, 1150 "Cannot issue async hypercall while another one in progress!\n"); 1151 return -EPERM; 1152 } 1153 1154 reinit_completion(&partition->async_hypercall); 1155 return 0; 1156 } 1157 1158 static void mshv_async_hvcall_handler(void *data, u64 *status) 1159 { 1160 struct mshv_partition *partition = data; 1161 1162 wait_for_completion(&partition->async_hypercall); 1163 pt_dbg(partition, "Async hypercall completed!\n"); 1164 1165 *status = partition->async_hypercall_status; 1166 } 1167 1168 /* 1169 * NB: caller checks and makes sure mem->size is page aligned 1170 * 
Returns: 0 with regionpp updated on success, or -errno 1171 */ 1172 static int mshv_partition_create_region(struct mshv_partition *partition, 1173 struct mshv_user_mem_region *mem, 1174 struct mshv_mem_region **regionpp, 1175 bool is_mmio) 1176 { 1177 struct mshv_mem_region *rg; 1178 u64 nr_pages = HVPFN_DOWN(mem->size); 1179 1180 /* Reject overlapping regions */ 1181 spin_lock(&partition->pt_mem_regions_lock); 1182 hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) { 1183 if (mem->guest_pfn + nr_pages <= rg->start_gfn || 1184 rg->start_gfn + rg->nr_pages <= mem->guest_pfn) 1185 continue; 1186 spin_unlock(&partition->pt_mem_regions_lock); 1187 return -EEXIST; 1188 } 1189 spin_unlock(&partition->pt_mem_regions_lock); 1190 1191 rg = mshv_region_create(mem->guest_pfn, nr_pages, 1192 mem->userspace_addr, mem->flags); 1193 if (IS_ERR(rg)) 1194 return PTR_ERR(rg); 1195 1196 if (is_mmio) 1197 rg->mreg_type = MSHV_REGION_TYPE_MMIO; 1198 else if (mshv_partition_encrypted(partition) || 1199 !mshv_region_movable_init(rg)) 1200 rg->mreg_type = MSHV_REGION_TYPE_MEM_PINNED; 1201 else 1202 rg->mreg_type = MSHV_REGION_TYPE_MEM_MOVABLE; 1203 1204 rg->partition = partition; 1205 1206 *regionpp = rg; 1207 1208 return 0; 1209 } 1210 1211 /** 1212 * mshv_prepare_pinned_region - Pin and map memory regions 1213 * @region: Pointer to the memory region structure 1214 * 1215 * This function processes memory regions that are explicitly marked as pinned. 1216 * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based 1217 * population. The function ensures the region is properly populated, handles 1218 * encryption requirements for SNP partitions if applicable, maps the region, 1219 * and performs necessary sharing or eviction operations based on the mapping 1220 * result. 1221 * 1222 * Return: 0 on success, negative error code on failure. 
1223 */ 1224 static int mshv_prepare_pinned_region(struct mshv_mem_region *region) 1225 { 1226 struct mshv_partition *partition = region->partition; 1227 int ret; 1228 1229 ret = mshv_region_pin(region); 1230 if (ret) { 1231 pt_err(partition, "Failed to pin memory region: %d\n", 1232 ret); 1233 goto err_out; 1234 } 1235 1236 /* 1237 * For an SNP partition it is a requirement that for every memory region 1238 * that we are going to map for this partition we should make sure that 1239 * host access to that region is released. This is ensured by doing an 1240 * additional hypercall which will update the SLAT to release host 1241 * access to guest memory regions. 1242 */ 1243 if (mshv_partition_encrypted(partition)) { 1244 ret = mshv_region_unshare(region); 1245 if (ret) { 1246 pt_err(partition, 1247 "Failed to unshare memory region (guest_pfn: %llu): %d\n", 1248 region->start_gfn, ret); 1249 goto invalidate_region; 1250 } 1251 } 1252 1253 ret = mshv_region_map(region); 1254 if (ret && mshv_partition_encrypted(partition)) { 1255 int shrc; 1256 1257 shrc = mshv_region_share(region); 1258 if (!shrc) 1259 goto invalidate_region; 1260 1261 pt_err(partition, 1262 "Failed to share memory region (guest_pfn: %llu): %d\n", 1263 region->start_gfn, shrc); 1264 /* 1265 * Don't unpin if marking shared failed because pages are no 1266 * longer mapped in the host, ie root, anymore. 1267 */ 1268 goto err_out; 1269 } 1270 1271 return 0; 1272 1273 invalidate_region: 1274 mshv_region_invalidate(region); 1275 err_out: 1276 return ret; 1277 } 1278 1279 /* 1280 * This maps two things: guest RAM and for pci passthru mmio space. 1281 * 1282 * mmio: 1283 * - vfio overloads vm_pgoff to store the mmio start pfn/spa. 1284 * - Two things need to happen for mapping mmio range: 1285 * 1. mapped in the uaddr so VMM can access it. 1286 * 2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it. 1287 * 1288 * This function takes care of the second. 
The first one is managed by vfio, 1289 * and hence is taken care of via vfio_pci_mmap_fault(). 1290 */ 1291 static long 1292 mshv_map_user_memory(struct mshv_partition *partition, 1293 struct mshv_user_mem_region mem) 1294 { 1295 struct mshv_mem_region *region; 1296 struct vm_area_struct *vma; 1297 bool is_mmio; 1298 ulong mmio_pfn; 1299 long ret; 1300 1301 if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) || 1302 !access_ok((const void __user *)mem.userspace_addr, mem.size)) 1303 return -EINVAL; 1304 1305 mmap_read_lock(current->mm); 1306 vma = vma_lookup(current->mm, mem.userspace_addr); 1307 is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0; 1308 mmio_pfn = is_mmio ? vma->vm_pgoff : 0; 1309 mmap_read_unlock(current->mm); 1310 1311 if (!vma) 1312 return -EINVAL; 1313 1314 ret = mshv_partition_create_region(partition, &mem, ®ion, 1315 is_mmio); 1316 if (ret) 1317 return ret; 1318 1319 switch (region->mreg_type) { 1320 case MSHV_REGION_TYPE_MEM_PINNED: 1321 ret = mshv_prepare_pinned_region(region); 1322 break; 1323 case MSHV_REGION_TYPE_MEM_MOVABLE: 1324 /* 1325 * For movable memory regions, remap with no access to let 1326 * the hypervisor track dirty pages, enabling pre-copy live 1327 * migration. 
1328 */ 1329 ret = hv_call_map_gpa_pages(partition->pt_id, 1330 region->start_gfn, 1331 region->nr_pages, 1332 HV_MAP_GPA_NO_ACCESS, NULL); 1333 break; 1334 case MSHV_REGION_TYPE_MMIO: 1335 ret = hv_call_map_mmio_pages(partition->pt_id, 1336 region->start_gfn, 1337 mmio_pfn, 1338 region->nr_pages); 1339 break; 1340 } 1341 1342 if (ret) 1343 goto errout; 1344 1345 spin_lock(&partition->pt_mem_regions_lock); 1346 hlist_add_head(®ion->hnode, &partition->pt_mem_regions); 1347 spin_unlock(&partition->pt_mem_regions_lock); 1348 1349 return 0; 1350 1351 errout: 1352 vfree(region); 1353 return ret; 1354 } 1355 1356 /* Called for unmapping both the guest ram and the mmio space */ 1357 static long 1358 mshv_unmap_user_memory(struct mshv_partition *partition, 1359 struct mshv_user_mem_region mem) 1360 { 1361 struct mshv_mem_region *region; 1362 1363 if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))) 1364 return -EINVAL; 1365 1366 spin_lock(&partition->pt_mem_regions_lock); 1367 1368 region = mshv_partition_region_by_gfn(partition, mem.guest_pfn); 1369 if (!region) { 1370 spin_unlock(&partition->pt_mem_regions_lock); 1371 return -ENOENT; 1372 } 1373 1374 /* Paranoia check */ 1375 if (region->start_uaddr != mem.userspace_addr || 1376 region->start_gfn != mem.guest_pfn || 1377 region->nr_pages != HVPFN_DOWN(mem.size)) { 1378 spin_unlock(&partition->pt_mem_regions_lock); 1379 return -EINVAL; 1380 } 1381 1382 hlist_del(®ion->hnode); 1383 1384 spin_unlock(&partition->pt_mem_regions_lock); 1385 1386 mshv_region_put(region); 1387 1388 return 0; 1389 } 1390 1391 static long 1392 mshv_partition_ioctl_set_memory(struct mshv_partition *partition, 1393 struct mshv_user_mem_region __user *user_mem) 1394 { 1395 struct mshv_user_mem_region mem; 1396 1397 if (copy_from_user(&mem, user_mem, sizeof(mem))) 1398 return -EFAULT; 1399 1400 if (!mem.size || 1401 !PAGE_ALIGNED(mem.size) || 1402 !PAGE_ALIGNED(mem.userspace_addr) || 1403 (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) || 1404 
mshv_field_nonzero(mem, rsvd)) 1405 return -EINVAL; 1406 1407 if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)) 1408 return mshv_unmap_user_memory(partition, mem); 1409 1410 return mshv_map_user_memory(partition, mem); 1411 } 1412 1413 static long 1414 mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition, 1415 void __user *user_args) 1416 { 1417 struct mshv_user_ioeventfd args; 1418 1419 if (copy_from_user(&args, user_args, sizeof(args))) 1420 return -EFAULT; 1421 1422 return mshv_set_unset_ioeventfd(partition, &args); 1423 } 1424 1425 static long 1426 mshv_partition_ioctl_irqfd(struct mshv_partition *partition, 1427 void __user *user_args) 1428 { 1429 struct mshv_user_irqfd args; 1430 1431 if (copy_from_user(&args, user_args, sizeof(args))) 1432 return -EFAULT; 1433 1434 return mshv_set_unset_irqfd(partition, &args); 1435 } 1436 1437 static long 1438 mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition, 1439 void __user *user_args) 1440 { 1441 struct mshv_gpap_access_bitmap args; 1442 union hv_gpa_page_access_state *states; 1443 long ret, i; 1444 union hv_gpa_page_access_state_flags hv_flags = {}; 1445 u8 hv_type_mask; 1446 ulong bitmap_buf_sz, states_buf_sz; 1447 int written = 0; 1448 1449 if (copy_from_user(&args, user_args, sizeof(args))) 1450 return -EFAULT; 1451 1452 if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT || 1453 args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT || 1454 mshv_field_nonzero(args, rsvd) || !args.page_count || 1455 !args.bitmap_ptr) 1456 return -EINVAL; 1457 1458 if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz)) 1459 return -E2BIG; 1460 1461 /* Num bytes needed to store bitmap; one bit per page rounded up */ 1462 bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8); 1463 1464 /* Sanity check */ 1465 if (bitmap_buf_sz > states_buf_sz) 1466 return -EBADFD; 1467 1468 switch (args.access_type) { 1469 case MSHV_GPAP_ACCESS_TYPE_ACCESSED: 1470 hv_type_mask = 1; 1471 if (args.access_op == 
MSHV_GPAP_ACCESS_OP_CLEAR) { 1472 hv_flags.clear_accessed = 1; 1473 /* not accessed implies not dirty */ 1474 hv_flags.clear_dirty = 1; 1475 } else { /* MSHV_GPAP_ACCESS_OP_SET */ 1476 hv_flags.set_accessed = 1; 1477 } 1478 break; 1479 case MSHV_GPAP_ACCESS_TYPE_DIRTY: 1480 hv_type_mask = 2; 1481 if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) { 1482 hv_flags.clear_dirty = 1; 1483 } else { /* MSHV_GPAP_ACCESS_OP_SET */ 1484 hv_flags.set_dirty = 1; 1485 /* dirty implies accessed */ 1486 hv_flags.set_accessed = 1; 1487 } 1488 break; 1489 } 1490 1491 states = vzalloc(states_buf_sz); 1492 if (!states) 1493 return -ENOMEM; 1494 1495 ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count, 1496 args.gpap_base, hv_flags, &written, 1497 states); 1498 if (ret) 1499 goto free_return; 1500 1501 /* 1502 * Overwrite states buffer with bitmap - the bits in hv_type_mask 1503 * correspond to bitfields in hv_gpa_page_access_state 1504 */ 1505 for (i = 0; i < written; ++i) 1506 __assign_bit(i, (ulong *)states, 1507 states[i].as_uint8 & hv_type_mask); 1508 1509 /* zero the unused bits in the last byte(s) of the returned bitmap */ 1510 for (i = written; i < bitmap_buf_sz * 8; ++i) 1511 __clear_bit(i, (ulong *)states); 1512 1513 if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz)) 1514 ret = -EFAULT; 1515 1516 free_return: 1517 vfree(states); 1518 return ret; 1519 } 1520 1521 static long 1522 mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition, 1523 void __user *user_args) 1524 { 1525 struct mshv_user_irq_entry *entries = NULL; 1526 struct mshv_user_irq_table args; 1527 long ret; 1528 1529 if (copy_from_user(&args, user_args, sizeof(args))) 1530 return -EFAULT; 1531 1532 if (args.nr > MSHV_MAX_GUEST_IRQS || 1533 mshv_field_nonzero(args, rsvd)) 1534 return -EINVAL; 1535 1536 if (args.nr) { 1537 struct mshv_user_irq_table __user *urouting = user_args; 1538 1539 entries = vmemdup_user(urouting->entries, 1540 array_size(sizeof(*entries), 
1541 args.nr)); 1542 if (IS_ERR(entries)) 1543 return PTR_ERR(entries); 1544 } 1545 ret = mshv_update_routing_table(partition, entries, args.nr); 1546 kvfree(entries); 1547 1548 return ret; 1549 } 1550 1551 static long 1552 mshv_partition_ioctl_initialize(struct mshv_partition *partition) 1553 { 1554 long ret; 1555 1556 if (partition->pt_initialized) 1557 return 0; 1558 1559 ret = hv_call_initialize_partition(partition->pt_id); 1560 if (ret) 1561 goto withdraw_mem; 1562 1563 ret = mshv_debugfs_partition_create(partition); 1564 if (ret) 1565 goto finalize_partition; 1566 1567 partition->pt_initialized = true; 1568 1569 return 0; 1570 1571 finalize_partition: 1572 hv_call_finalize_partition(partition->pt_id); 1573 withdraw_mem: 1574 hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); 1575 1576 return ret; 1577 } 1578 1579 static long 1580 mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) 1581 { 1582 struct mshv_partition *partition = filp->private_data; 1583 long ret; 1584 void __user *uarg = (void __user *)arg; 1585 1586 if (mutex_lock_killable(&partition->pt_mutex)) 1587 return -EINTR; 1588 1589 switch (ioctl) { 1590 case MSHV_INITIALIZE_PARTITION: 1591 ret = mshv_partition_ioctl_initialize(partition); 1592 break; 1593 case MSHV_SET_GUEST_MEMORY: 1594 ret = mshv_partition_ioctl_set_memory(partition, uarg); 1595 break; 1596 case MSHV_CREATE_VP: 1597 ret = mshv_partition_ioctl_create_vp(partition, uarg); 1598 break; 1599 case MSHV_IRQFD: 1600 ret = mshv_partition_ioctl_irqfd(partition, uarg); 1601 break; 1602 case MSHV_IOEVENTFD: 1603 ret = mshv_partition_ioctl_ioeventfd(partition, uarg); 1604 break; 1605 case MSHV_SET_MSI_ROUTING: 1606 ret = mshv_partition_ioctl_set_msi_routing(partition, uarg); 1607 break; 1608 case MSHV_GET_GPAP_ACCESS_BITMAP: 1609 ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition, 1610 uarg); 1611 break; 1612 case MSHV_ROOT_HVCALL: 1613 ret = mshv_ioctl_passthru_hvcall(partition, true, uarg); 
1614 break; 1615 default: 1616 ret = -ENOTTY; 1617 } 1618 1619 mutex_unlock(&partition->pt_mutex); 1620 return ret; 1621 } 1622 1623 static int 1624 disable_vp_dispatch(struct mshv_vp *vp) 1625 { 1626 int ret; 1627 struct hv_register_assoc dispatch_suspend = { 1628 .name = HV_REGISTER_DISPATCH_SUSPEND, 1629 .value.dispatch_suspend.suspended = 1, 1630 }; 1631 1632 ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, 1633 1, &dispatch_suspend); 1634 if (ret) 1635 vp_err(vp, "failed to suspend\n"); 1636 1637 return ret; 1638 } 1639 1640 static int 1641 get_vp_signaled_count(struct mshv_vp *vp, u64 *count) 1642 { 1643 int ret; 1644 struct hv_register_assoc root_signal_count = { 1645 .name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT, 1646 }; 1647 1648 ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id, 1649 1, &root_signal_count); 1650 1651 if (ret) { 1652 vp_err(vp, "Failed to get root signal count"); 1653 *count = 0; 1654 return ret; 1655 } 1656 1657 *count = root_signal_count.value.reg64; 1658 1659 return ret; 1660 } 1661 1662 static void 1663 drain_vp_signals(struct mshv_vp *vp) 1664 { 1665 u64 hv_signal_count; 1666 u64 vp_signal_count; 1667 1668 get_vp_signaled_count(vp, &hv_signal_count); 1669 1670 vp_signal_count = atomic64_read(&vp->run.vp_signaled_count); 1671 1672 /* 1673 * There should be at most 1 outstanding notification, but be extra 1674 * careful anyway. 1675 */ 1676 while (hv_signal_count != vp_signal_count) { 1677 WARN_ON(hv_signal_count - vp_signal_count != 1); 1678 1679 if (wait_event_interruptible(vp->run.vp_suspend_queue, 1680 vp->run.kicked_by_hv == 1)) 1681 break; 1682 vp->run.kicked_by_hv = 0; 1683 vp_signal_count = atomic64_read(&vp->run.vp_signaled_count); 1684 } 1685 } 1686 1687 static void drain_all_vps(const struct mshv_partition *partition) 1688 { 1689 int i; 1690 struct mshv_vp *vp; 1691 1692 /* 1693 * VPs are reachable from ISR. 
It is safe to not take the partition 1694 * lock because nobody else can enter this function and drop the 1695 * partition from the list. 1696 */ 1697 for (i = 0; i < MSHV_MAX_VPS; i++) { 1698 vp = partition->pt_vp_array[i]; 1699 if (!vp) 1700 continue; 1701 /* 1702 * Disable dispatching of the VP in the hypervisor. After this 1703 * the hypervisor guarantees it won't generate any signals for 1704 * the VP and the hypervisor's VP signal count won't change. 1705 */ 1706 disable_vp_dispatch(vp); 1707 drain_vp_signals(vp); 1708 } 1709 } 1710 1711 static void 1712 remove_partition(struct mshv_partition *partition) 1713 { 1714 spin_lock(&mshv_root.pt_ht_lock); 1715 hlist_del_rcu(&partition->pt_hnode); 1716 spin_unlock(&mshv_root.pt_ht_lock); 1717 1718 synchronize_rcu(); 1719 } 1720 1721 /* 1722 * Tear down a partition and remove it from the list. 1723 * Partition's refcount must be 0 1724 */ 1725 static void destroy_partition(struct mshv_partition *partition) 1726 { 1727 struct mshv_vp *vp; 1728 struct mshv_mem_region *region; 1729 struct hlist_node *n; 1730 int i; 1731 1732 if (refcount_read(&partition->pt_ref_count)) { 1733 pt_err(partition, 1734 "Attempt to destroy partition but refcount > 0\n"); 1735 return; 1736 } 1737 1738 if (partition->pt_initialized) { 1739 /* 1740 * We only need to drain signals for root scheduler. This should be 1741 * done before removing the partition from the partition list. 
1742 */ 1743 if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) 1744 drain_all_vps(partition); 1745 1746 /* Remove vps */ 1747 for (i = 0; i < MSHV_MAX_VPS; ++i) { 1748 vp = partition->pt_vp_array[i]; 1749 if (!vp) 1750 continue; 1751 1752 mshv_debugfs_vp_remove(vp); 1753 mshv_vp_stats_unmap(partition->pt_id, vp->vp_index, 1754 vp->vp_stats_pages); 1755 1756 if (vp->vp_register_page) { 1757 (void)hv_unmap_vp_state_page(partition->pt_id, 1758 vp->vp_index, 1759 HV_VP_STATE_PAGE_REGISTERS, 1760 virt_to_page(vp->vp_register_page), 1761 input_vtl_zero); 1762 vp->vp_register_page = NULL; 1763 } 1764 1765 (void)hv_unmap_vp_state_page(partition->pt_id, 1766 vp->vp_index, 1767 HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, 1768 virt_to_page(vp->vp_intercept_msg_page), 1769 input_vtl_zero); 1770 vp->vp_intercept_msg_page = NULL; 1771 1772 if (vp->vp_ghcb_page) { 1773 (void)hv_unmap_vp_state_page(partition->pt_id, 1774 vp->vp_index, 1775 HV_VP_STATE_PAGE_GHCB, 1776 virt_to_page(vp->vp_ghcb_page), 1777 input_vtl_normal); 1778 vp->vp_ghcb_page = NULL; 1779 } 1780 1781 kfree(vp); 1782 1783 partition->pt_vp_array[i] = NULL; 1784 } 1785 1786 mshv_debugfs_partition_remove(partition); 1787 1788 /* Deallocates and unmaps everything including vcpus, GPA mappings etc */ 1789 hv_call_finalize_partition(partition->pt_id); 1790 1791 partition->pt_initialized = false; 1792 } 1793 1794 remove_partition(partition); 1795 1796 hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions, 1797 hnode) { 1798 hlist_del(®ion->hnode); 1799 mshv_region_put(region); 1800 } 1801 1802 /* Withdraw and free all pages we deposited */ 1803 hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); 1804 hv_call_delete_partition(partition->pt_id); 1805 1806 mshv_free_routing_table(partition); 1807 kfree(partition); 1808 } 1809 1810 struct 1811 mshv_partition *mshv_partition_get(struct mshv_partition *partition) 1812 { 1813 if (refcount_inc_not_zero(&partition->pt_ref_count)) 1814 return partition; 1815 return 
NULL; 1816 } 1817 1818 struct 1819 mshv_partition *mshv_partition_find(u64 partition_id) 1820 __must_hold(RCU) 1821 { 1822 struct mshv_partition *p; 1823 1824 hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode, 1825 partition_id) 1826 if (p->pt_id == partition_id) 1827 return p; 1828 1829 return NULL; 1830 } 1831 1832 void 1833 mshv_partition_put(struct mshv_partition *partition) 1834 { 1835 if (refcount_dec_and_test(&partition->pt_ref_count)) 1836 destroy_partition(partition); 1837 } 1838 1839 static int 1840 mshv_partition_release(struct inode *inode, struct file *filp) 1841 { 1842 struct mshv_partition *partition = filp->private_data; 1843 1844 mshv_eventfd_release(partition); 1845 1846 cleanup_srcu_struct(&partition->pt_irq_srcu); 1847 1848 mshv_partition_put(partition); 1849 1850 return 0; 1851 } 1852 1853 static int 1854 add_partition(struct mshv_partition *partition) 1855 { 1856 spin_lock(&mshv_root.pt_ht_lock); 1857 1858 hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode, 1859 partition->pt_id); 1860 1861 spin_unlock(&mshv_root.pt_ht_lock); 1862 1863 return 0; 1864 } 1865 1866 static_assert(MSHV_NUM_CPU_FEATURES_BANKS == 1867 HV_PARTITION_PROCESSOR_FEATURES_BANKS); 1868 1869 static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags, 1870 struct hv_partition_creation_properties *cr_props, 1871 union hv_partition_isolation_properties *isol_props) 1872 { 1873 int i; 1874 struct mshv_create_partition_v2 args; 1875 union hv_partition_processor_features *disabled_procs; 1876 union hv_partition_processor_xsave_features *disabled_xsave; 1877 1878 /* First, copy v1 struct in case user is on previous versions */ 1879 if (copy_from_user(&args, user_arg, 1880 sizeof(struct mshv_create_partition))) 1881 return -EFAULT; 1882 1883 if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || 1884 args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) 1885 return -EINVAL; 1886 1887 disabled_procs = &cr_props->disabled_processor_features; 1888 disabled_xsave = 
&cr_props->disabled_processor_xsave_features; 1889 1890 /* Check if user provided newer struct with feature fields */ 1891 if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) { 1892 if (copy_from_user(&args, user_arg, sizeof(args))) 1893 return -EFAULT; 1894 1895 /* Re-validate v1 fields after second copy_from_user() */ 1896 if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || 1897 args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) 1898 return -EINVAL; 1899 1900 if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS || 1901 mshv_field_nonzero(args, pt_rsvd) || 1902 mshv_field_nonzero(args, pt_rsvd1)) 1903 return -EINVAL; 1904 1905 /* 1906 * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never 1907 * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS 1908 * (i.e. 2). 1909 * 1910 * Further banks (index >= 2) will be modifiable as 'early' 1911 * properties via the set partition property hypercall. 1912 */ 1913 for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) 1914 disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i]; 1915 1916 #if IS_ENABLED(CONFIG_X86_64) 1917 disabled_xsave->as_uint64 = args.pt_disabled_xsave; 1918 #else 1919 /* 1920 * In practice this field is ignored on arm64, but safer to 1921 * zero it in case it is ever used. 1922 */ 1923 disabled_xsave->as_uint64 = 0; 1924 1925 if (mshv_field_nonzero(args, pt_rsvd2)) 1926 return -EINVAL; 1927 #endif 1928 } else { 1929 /* 1930 * v1 behavior: try to enable everything. The hypervisor will 1931 * disable features that are not supported. The banks can be 1932 * queried via the get partition property hypercall. 
1933 */ 1934 for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) 1935 disabled_procs->as_uint64[i] = 0; 1936 1937 disabled_xsave->as_uint64 = 0; 1938 } 1939 1940 /* Only support EXO partitions */ 1941 *pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION | 1942 HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED; 1943 1944 if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC)) 1945 *pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED; 1946 if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC)) 1947 *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; 1948 if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES)) 1949 *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; 1950 1951 isol_props->as_uint64 = 0; 1952 1953 switch (args.pt_isolation) { 1954 case MSHV_PT_ISOLATION_NONE: 1955 isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE; 1956 break; 1957 } 1958 1959 return 0; 1960 } 1961 1962 static long 1963 mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) 1964 { 1965 u64 creation_flags; 1966 struct hv_partition_creation_properties creation_properties; 1967 union hv_partition_isolation_properties isolation_properties; 1968 struct mshv_partition *partition; 1969 long ret; 1970 1971 ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags, 1972 &creation_properties, 1973 &isolation_properties); 1974 if (ret) 1975 return ret; 1976 1977 partition = kzalloc(sizeof(*partition), GFP_KERNEL); 1978 if (!partition) 1979 return -ENOMEM; 1980 1981 partition->pt_module_dev = module_dev; 1982 partition->isolation_type = isolation_properties.isolation_type; 1983 1984 refcount_set(&partition->pt_ref_count, 1); 1985 1986 mutex_init(&partition->pt_mutex); 1987 1988 mutex_init(&partition->pt_irq_lock); 1989 1990 init_completion(&partition->async_hypercall); 1991 1992 INIT_HLIST_HEAD(&partition->irq_ack_notifier_list); 1993 1994 INIT_HLIST_HEAD(&partition->pt_devices); 1995 1996 spin_lock_init(&partition->pt_mem_regions_lock); 
1997 INIT_HLIST_HEAD(&partition->pt_mem_regions); 1998 1999 mshv_eventfd_init(partition); 2000 2001 ret = init_srcu_struct(&partition->pt_irq_srcu); 2002 if (ret) 2003 goto free_partition; 2004 2005 ret = hv_call_create_partition(creation_flags, 2006 creation_properties, 2007 isolation_properties, 2008 &partition->pt_id); 2009 if (ret) 2010 goto cleanup_irq_srcu; 2011 2012 ret = add_partition(partition); 2013 if (ret) 2014 goto delete_partition; 2015 2016 ret = mshv_init_async_handler(partition); 2017 if (!ret) { 2018 ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition", 2019 &mshv_partition_fops, 2020 partition, O_RDWR)); 2021 if (ret >= 0) 2022 return ret; 2023 } 2024 remove_partition(partition); 2025 delete_partition: 2026 hv_call_delete_partition(partition->pt_id); 2027 cleanup_irq_srcu: 2028 cleanup_srcu_struct(&partition->pt_irq_srcu); 2029 free_partition: 2030 kfree(partition); 2031 2032 return ret; 2033 } 2034 2035 static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, 2036 unsigned long arg) 2037 { 2038 struct miscdevice *misc = filp->private_data; 2039 2040 switch (ioctl) { 2041 case MSHV_CREATE_PARTITION: 2042 return mshv_ioctl_create_partition((void __user *)arg, 2043 misc->this_device); 2044 case MSHV_ROOT_HVCALL: 2045 return mshv_ioctl_passthru_hvcall(NULL, false, 2046 (void __user *)arg); 2047 } 2048 2049 return -ENOTTY; 2050 } 2051 2052 static int 2053 mshv_dev_open(struct inode *inode, struct file *filp) 2054 { 2055 return 0; 2056 } 2057 2058 static int 2059 mshv_dev_release(struct inode *inode, struct file *filp) 2060 { 2061 return 0; 2062 } 2063 2064 static int mshv_cpuhp_online; 2065 static int mshv_root_sched_online; 2066 2067 static const char *scheduler_type_to_string(enum hv_scheduler_type type) 2068 { 2069 switch (type) { 2070 case HV_SCHEDULER_TYPE_LP: 2071 return "classic scheduler without SMT"; 2072 case HV_SCHEDULER_TYPE_LP_SMT: 2073 return "classic scheduler with SMT"; 2074 case HV_SCHEDULER_TYPE_CORE_SMT: 2075 return 
"core scheduler"; 2076 case HV_SCHEDULER_TYPE_ROOT: 2077 return "root scheduler"; 2078 default: 2079 return "unknown scheduler"; 2080 }; 2081 } 2082 2083 static int __init l1vh_retrieve_scheduler_type(enum hv_scheduler_type *out) 2084 { 2085 u64 integrated_sched_enabled; 2086 int ret; 2087 2088 *out = HV_SCHEDULER_TYPE_CORE_SMT; 2089 2090 if (!mshv_root.vmm_caps.vmm_enable_integrated_scheduler) 2091 return 0; 2092 2093 ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF, 2094 HV_PARTITION_PROPERTY_INTEGRATED_SCHEDULER_ENABLED, 2095 0, &integrated_sched_enabled, 2096 sizeof(integrated_sched_enabled)); 2097 if (ret) 2098 return ret; 2099 2100 if (integrated_sched_enabled) 2101 *out = HV_SCHEDULER_TYPE_ROOT; 2102 2103 return 0; 2104 } 2105 2106 /* TODO move this to hv_common.c when needed outside */ 2107 static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out) 2108 { 2109 struct hv_input_get_system_property *input; 2110 struct hv_output_get_system_property *output; 2111 unsigned long flags; 2112 u64 status; 2113 2114 local_irq_save(flags); 2115 input = *this_cpu_ptr(hyperv_pcpu_input_arg); 2116 output = *this_cpu_ptr(hyperv_pcpu_output_arg); 2117 2118 memset(input, 0, sizeof(*input)); 2119 memset(output, 0, sizeof(*output)); 2120 input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE; 2121 2122 status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output); 2123 if (!hv_result_success(status)) { 2124 local_irq_restore(flags); 2125 pr_err("%s: %s\n", __func__, hv_result_to_string(status)); 2126 return hv_result_to_errno(status); 2127 } 2128 2129 *out = output->scheduler_type; 2130 local_irq_restore(flags); 2131 2132 return 0; 2133 } 2134 2135 /* Retrieve and stash the supported scheduler type */ 2136 static int __init mshv_retrieve_scheduler_type(struct device *dev) 2137 { 2138 int ret; 2139 2140 if (hv_l1vh_partition()) 2141 ret = l1vh_retrieve_scheduler_type(&hv_scheduler_type); 2142 else 2143 ret = 
hv_retrieve_scheduler_type(&hv_scheduler_type);
2144 	if (ret)
2145 		return ret;
2146
2147 	dev_info(dev, "Hypervisor using %s\n",
2148 		 scheduler_type_to_string(hv_scheduler_type));
2149
2150 	switch (hv_scheduler_type) {
2151 	case HV_SCHEDULER_TYPE_CORE_SMT:
2152 	case HV_SCHEDULER_TYPE_LP_SMT:
2153 	case HV_SCHEDULER_TYPE_ROOT:
2154 	case HV_SCHEDULER_TYPE_LP:
2155 		/* Supported scheduler, nothing to do */
2156 		break;
2157 	default:
2158 		dev_err(dev, "unsupported scheduler 0x%x, bailing.\n",
2159 			hv_scheduler_type);
2160 		return -EOPNOTSUPP;
2161 	}
2162
2163 	return 0;
2164 }
2165
     /*
      * Per-CPU setup callback handed to cpuhp_setup_state() by
      * root_scheduler_init().
      *
      * Allocates a single buffer of 2 * HV_HYP_PAGE_SIZE bytes and publishes it
      * through the calling CPU's per-CPU slots: root_scheduler_input receives the
      * start of the buffer and root_scheduler_output receives the address
      * HV_HYP_PAGE_SIZE bytes into it.
      *
      * @cpu is not used; the slots are located with this_cpu_ptr().
      *
      * Returns 0 on success, or -ENOMEM if the allocation fails (both slots are
      * left untouched in that case).
      */
2166 static int mshv_root_scheduler_init(unsigned int cpu)
2167 {
2168 	void **inputarg, **outputarg, *p;
2169
2170 	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
2171 	outputarg = (void **)this_cpu_ptr(root_scheduler_output);
2172
2173 	/* Allocate two consecutive pages. One for input, one for output. */
2174 	p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL);
2175 	if (!p)
2176 		return -ENOMEM;
2177
2178 	*inputarg = p;
     	/* Second half of the same allocation; never freed on its own. */
2179 	*outputarg = (char *)p + HV_HYP_PAGE_SIZE;
2180
2181 	return 0;
2182 }
2183
     /*
      * Per-CPU counterpart of mshv_root_scheduler_init(), registered in the same
      * cpuhp_setup_state() call.
      *
      * Clears both of the calling CPU's slots and frees the buffer they referred
      * to. @cpu is not used. Always returns 0.
      */
2184 static int mshv_root_scheduler_cleanup(unsigned int cpu)
2185 {
2186 	void *p, **inputarg, **outputarg;
2187
2188 	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
2189 	outputarg = (void **)this_cpu_ptr(root_scheduler_output);
2190
     	/*
     	 * Only the input pointer is remembered for freeing: the output
     	 * pointer set by mshv_root_scheduler_init() points into the same
     	 * allocation.
     	 */
2191 	p = *inputarg;
2192
     	/* Drop the published pointers before releasing the memory. */
2193 	*inputarg = NULL;
2194 	*outputarg = NULL;
2195
2196 	kfree(p);
2197
2198 	return 0;
2199 }
2200
     /*
      * Set up the per-CPU root scheduler input/output buffers.
      *
      * Does nothing and returns 0 unless hv_scheduler_type is
      * HV_SCHEDULER_TYPE_ROOT. Otherwise it allocates the two per-CPU pointer
      * slots and registers mshv_root_scheduler_init() and
      * mshv_root_scheduler_cleanup() under CPUHP_AP_ONLINE_DYN with the name
      * "mshv_root_sched". The value returned by cpuhp_setup_state() is saved in
      * mshv_root_sched_online so that root_scheduler_deinit() can remove the
      * state again.
      *
      * @dev is used for error messages only.
      *
      * Returns 0 on success, -ENOMEM if a per-CPU allocation fails, or the
      * negative value returned by cpuhp_setup_state(). On failure both per-CPU
      * allocations are released before returning.
      */
2201 /* Must be called after retrieving the scheduler type */
2202 static int
2203 root_scheduler_init(struct device *dev)
2204 {
2205 	int ret;
2206
2207 	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
2208 		return 0;
2209
2210 	root_scheduler_input = alloc_percpu(void *);
2211 	root_scheduler_output = alloc_percpu(void *);
2212
     	/* Both allocations are attempted first, then checked together. */
2213 	if (!root_scheduler_input || !root_scheduler_output) {
2214 		dev_err(dev, "Failed to allocate root scheduler buffers\n");
2215 		ret = -ENOMEM;
2216 		goto out;
2217 	}
2218
2219 	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched",
2220 				mshv_root_scheduler_init,
2221 				mshv_root_scheduler_cleanup);
2222
2223 	if (ret < 0) {
2224 		dev_err(dev, "Failed to setup root scheduler state: %i\n", ret);
2225 		goto out;
2226 	}
2227
     	/* Non-negative return value: kept for cpuhp_remove_state(). */
2228 	mshv_root_sched_online = ret;
2229
2230 	return 0;
2231
2232 out:
     	/*
     	 * Either pointer may be NULL here (allocation failure path), so this
     	 * relies on free_percpu() tolerating a NULL argument.
     	 */
2233 	free_percpu(root_scheduler_input);
2234 	free_percpu(root_scheduler_output);
2235 	return ret;
2236 }
2237
     /*
      * Undo root_scheduler_init().
      *
      * Does nothing unless hv_scheduler_type is HV_SCHEDULER_TYPE_ROOT. Otherwise
      * the hotplug state recorded in mshv_root_sched_online is removed first and
      * the two per-CPU slot allocations are freed afterwards.
      */
2238 static void
2239 root_scheduler_deinit(void)
2240 {
2241 	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
2242 		return;
2243
2244 	cpuhp_remove_state(mshv_root_sched_online);
2245 	free_percpu(root_scheduler_input);
2246 	free_percpu(root_scheduler_output);
2247 }
2248
     /*
      * Notifier callback installed through mshv_reboot_nb.
      *
      * Removes the CPU hotplug state recorded in mshv_cpuhp_online, i.e. the
      * state mshv_parent_partition_init() registered with mshv_synic_init() and
      * mshv_synic_cleanup(). @nb, @code and @unused are all ignored.
      *
      * Always returns 0.
      * NOTE(review): the literal 0 is presumably meant as NOTIFY_DONE - confirm.
      */
2249 static int mshv_reboot_notify(struct notifier_block *nb,
2250 			      unsigned long code, void *unused)
2251 {
2252 	cpuhp_remove_state(mshv_cpuhp_online);
2253 	return 0;
2254 }
2255
     /*
      * Notifier block that mshv_root_partition_init() registers with
      * register_reboot_notifier() and mshv_root_partition_exit() unregisters.
      */
2256 struct notifier_block mshv_reboot_nb = {
2257 	.notifier_call = mshv_reboot_notify,
2258 };
2259
     /* Undo mshv_root_partition_init(): unregister mshv_reboot_nb. */
2260 static void mshv_root_partition_exit(void)
2261 {
2262 	unregister_reboot_notifier(&mshv_reboot_nb);
2263 }
2264
     /*
      * Register mshv_reboot_nb as a reboot notifier. Only called by
      * mshv_parent_partition_init() when hv_root_partition() is true.
      *
      * @dev is not used. Returns whatever register_reboot_notifier() returns.
      */
2265 static int __init mshv_root_partition_init(struct device *dev)
2266 {
2267 	return register_reboot_notifier(&mshv_reboot_nb);
2268 }
2269
     /*
      * Read HV_PARTITION_PROPERTY_VMM_CAPABILITIES of HV_PARTITION_ID_SELF into
      * mshv_root.vmm_caps.
      *
      * A failing query is fatal only when hv_l1vh_partition() is true; in every
      * other case the error is dropped and 0 is returned, with
      * mshv_root.vmm_caps left as the call left it.
      *
      * @dev is used for log messages only.
      *
      * Returns 0, or the error from hv_call_get_partition_property_ex() in the
      * fatal case described above.
      */
2270 static int __init mshv_init_vmm_caps(struct device *dev)
2271 {
2272 	int ret;
2273
2274 	ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
2275 						HV_PARTITION_PROPERTY_VMM_CAPABILITIES,
2276 						0, &mshv_root.vmm_caps,
2277 						sizeof(mshv_root.vmm_caps));
2278 	if (ret && hv_l1vh_partition()) {
2279 		dev_err(dev, "Failed to get VMM capabilities: %d\n", ret);
2280 		return ret;
2281 	}
2282
2283 	dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]);
2284
2285 	return 0;
2286 }
2287
     /*
      * Module entry point (see module_init() at the end of the file).
      *
      * Bails out with -ENODEV when not running as a parent partition, when
      * running as a kdump kernel, or when the hypervisor version cannot be read.
      * Otherwise it performs, in this order:
      *   1. misc_register() of mshv_dev,
      *   2. a build-number range check that only logs,
      *   3. per-CPU synic page allocation,
      *   4. registration of the "mshv_synic" CPU hotplug state,
      *   5. mshv_init_vmm_caps(),
      *   6. mshv_retrieve_scheduler_type(),
      *   7. mshv_root_partition_init() (root partition only),
      *   8. root_scheduler_init(),
      *   9. mshv_debugfs_init(),
      *  10. mshv_irqfd_wq_init(),
      *  11. initialization of the partition hash table and its lock,
      *  12. installation of mshv_isr via hv_setup_mshv_handler().
      *
      * If any step fails, the labels at the bottom undo the steps that already
      * succeeded, in reverse order, and the failing step's error is returned.
      *
      * Returns 0 on success or a negative errno.
      */
2288 static int __init mshv_parent_partition_init(void)
2289 {
2290 	int ret;
2291 	struct device *dev;
2292 	union hv_hypervisor_version_info version_info;
2293
2294 	if (!hv_parent_partition() || is_kdump_kernel())
2295 		return -ENODEV;
2296
2297 	if (hv_get_hypervisor_version(&version_info))
2298 		return -ENODEV;
2299
2300 	ret = misc_register(&mshv_dev);
2301 	if (ret)
2302 		return ret;
2303
     	/* From here on messages are attributed to the misc device. */
2304 	dev = mshv_dev.this_device;
2305
     	/*
     	 * An out-of-range build number is reported but is not treated as an
     	 * error: initialization continues either way.
     	 */
2306 	if (version_info.build_number < MSHV_HV_MIN_VERSION ||
2307 	    version_info.build_number > MSHV_HV_MAX_VERSION) {
2308 		dev_err(dev, "Running on unvalidated Hyper-V version\n");
2309 		dev_err(dev, "Versions: current: %u min: %u max: %u\n",
2310 			version_info.build_number, MSHV_HV_MIN_VERSION,
2311 			MSHV_HV_MAX_VERSION);
2312 	}
2313
2314 	mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages);
2315 	if (!mshv_root.synic_pages) {
2316 		dev_err(dev, "Failed to allocate percpu synic page\n");
2317 		ret = -ENOMEM;
2318 		goto device_deregister;
2319 	}
2320
2321 	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
2322 				mshv_synic_init,
2323 				mshv_synic_cleanup);
2324 	if (ret < 0) {
2325 		dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret);
2326 		goto free_synic_pages;
2327 	}
2328
     	/*
     	 * Saved for the cpuhp_remove_state() calls in the unwind path, in
     	 * mshv_reboot_notify() and in mshv_parent_partition_exit().
     	 */
2329 	mshv_cpuhp_online = ret;
2330
2331 	ret = mshv_init_vmm_caps(dev);
2332 	if (ret)
2333 		goto remove_cpu_state;
2334
2335 	ret = mshv_retrieve_scheduler_type(dev);
2336 	if (ret)
2337 		goto remove_cpu_state;
2338
     	/*
     	 * ret is 0 at this point, so the check below only fires when the
     	 * root-partition init actually ran and failed.
     	 */
2339 	if (hv_root_partition())
2340 		ret = mshv_root_partition_init(dev);
2341 	if (ret)
2342 		goto remove_cpu_state;
2343
2344 	ret = root_scheduler_init(dev);
2345 	if (ret)
2346 		goto exit_partition;
2347
2348 	ret = mshv_debugfs_init();
2349 	if (ret)
2350 		goto deinit_root_scheduler;
2351
2352 	ret = mshv_irqfd_wq_init();
2353 	if (ret)
2354 		goto exit_debugfs;
2355
2356 	spin_lock_init(&mshv_root.pt_ht_lock);
2357 	hash_init(mshv_root.pt_htable);
2358
     	/* Last step: nothing after this can fail. */
2359 	hv_setup_mshv_handler(mshv_isr);
2360
2361 	return 0;
2362
     	/* Error unwinding: each label undoes one earlier step, newest first. */
2363 exit_debugfs:
2364 	mshv_debugfs_exit();
2365 deinit_root_scheduler:
2366 	root_scheduler_deinit();
2367 exit_partition:
     	/* Mirrors the conditional mshv_root_partition_init() call above. */
2368 	if (hv_root_partition())
2369 		mshv_root_partition_exit();
2370 remove_cpu_state:
2371 	cpuhp_remove_state(mshv_cpuhp_online);
2372 free_synic_pages:
2373 	free_percpu(mshv_root.synic_pages);
2374 device_deregister:
2375 	misc_deregister(&mshv_dev);
2376 	return ret;
2377 }
2378
     /*
      * Module exit point (see module_exit() at the end of the file).
      *
      * Clears the handler installed by hv_setup_mshv_handler() first, then
      * releases the port table, debugfs entries, the misc device, the irqfd
      * workqueue, the root scheduler buffers, the reboot notifier (root
      * partition only), the "mshv_synic" hotplug state and finally the per-CPU
      * synic pages.
      */
2379 static void __exit mshv_parent_partition_exit(void)
2380 {
2381 	hv_setup_mshv_handler(NULL);
2382 	mshv_port_table_fini();
2383 	mshv_debugfs_exit();
2384 	misc_deregister(&mshv_dev);
2385 	mshv_irqfd_wq_cleanup();
2386 	root_scheduler_deinit();
2387 	if (hv_root_partition())
2388 		mshv_root_partition_exit();
2389 	cpuhp_remove_state(mshv_cpuhp_online);
2390 	free_percpu(mshv_root.synic_pages);
2391 }
2392
2393 module_init(mshv_parent_partition_init);
2394 module_exit(mshv_parent_partition_exit);
2395