1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (c) 2024, Microsoft Corporation. 4 * 5 * The main part of the mshv_root module, providing APIs to create 6 * and manage guest partitions. 7 * 8 * Authors: Microsoft Linux virtualization team 9 */ 10 11 #include <linux/entry-virt.h> 12 #include <linux/kernel.h> 13 #include <linux/module.h> 14 #include <linux/fs.h> 15 #include <linux/miscdevice.h> 16 #include <linux/slab.h> 17 #include <linux/file.h> 18 #include <linux/anon_inodes.h> 19 #include <linux/mm.h> 20 #include <linux/io.h> 21 #include <linux/cpuhotplug.h> 22 #include <linux/random.h> 23 #include <asm/mshyperv.h> 24 #include <linux/hyperv.h> 25 #include <linux/notifier.h> 26 #include <linux/reboot.h> 27 #include <linux/kexec.h> 28 #include <linux/page-flags.h> 29 #include <linux/crash_dump.h> 30 #include <linux/panic_notifier.h> 31 #include <linux/vmalloc.h> 32 #include <linux/rseq.h> 33 34 #include "mshv_eventfd.h" 35 #include "mshv.h" 36 #include "mshv_root.h" 37 38 MODULE_AUTHOR("Microsoft"); 39 MODULE_LICENSE("GPL"); 40 MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv"); 41 42 /* HV_THREAD_COUNTER */ 43 #if defined(CONFIG_X86_64) 44 #define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 202 45 #elif defined(CONFIG_ARM64) 46 #define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 95 47 #endif 48 49 struct mshv_root mshv_root; 50 51 enum hv_scheduler_type hv_scheduler_type; 52 53 /* Once we implement the fast extended hypercall ABI they can go away. */ 54 static void * __percpu *root_scheduler_input; 55 static void * __percpu *root_scheduler_output; 56 57 static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); 58 static int mshv_dev_open(struct inode *inode, struct file *filp); 59 static int mshv_dev_release(struct inode *inode, struct file *filp); 60 static int mshv_vp_release(struct inode *inode, struct file *filp); 61 static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); 62 static int mshv_partition_release(struct inode *inode, struct file *filp); 63 static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); 64 static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma); 65 static vm_fault_t mshv_vp_fault(struct vm_fault *vmf); 66 static int mshv_init_async_handler(struct mshv_partition *partition); 67 static void mshv_async_hvcall_handler(void *data, u64 *status); 68 69 static const union hv_input_vtl input_vtl_zero; 70 static const union hv_input_vtl input_vtl_normal = { 71 .target_vtl = HV_NORMAL_VTL, 72 .use_target_vtl = 1, 73 }; 74 75 static const struct vm_operations_struct mshv_vp_vm_ops = { 76 .fault = mshv_vp_fault, 77 }; 78 79 static const struct file_operations mshv_vp_fops = { 80 .owner = THIS_MODULE, 81 .release = mshv_vp_release, 82 .unlocked_ioctl = mshv_vp_ioctl, 83 .llseek = noop_llseek, 84 .mmap = mshv_vp_mmap, 85 }; 86 87 static const struct file_operations mshv_partition_fops = { 88 .owner = THIS_MODULE, 89 .release = mshv_partition_release, 90 .unlocked_ioctl = mshv_partition_ioctl, 91 .llseek = noop_llseek, 92 }; 93 94 static const struct file_operations mshv_dev_fops = { 95 .owner = THIS_MODULE, 96 .open = mshv_dev_open, 97 .release = mshv_dev_release, 98 .unlocked_ioctl = mshv_dev_ioctl, 99 .llseek = noop_llseek, 100 }; 101 102 static struct miscdevice mshv_dev = { 103 .minor = MISC_DYNAMIC_MINOR, 104 .name = "mshv", 105 .fops = &mshv_dev_fops, 106 .mode = 0600, 107 }; 108 109 /* 110 * Only allow hypercalls that have a 
u64 partition id as the first member of 111 * the input structure. 112 * These are sorted by value. 113 */ 114 static u16 mshv_passthru_hvcalls[] = { 115 HVCALL_GET_PARTITION_PROPERTY, 116 HVCALL_GET_PARTITION_PROPERTY_EX, 117 HVCALL_SET_PARTITION_PROPERTY, 118 HVCALL_INSTALL_INTERCEPT, 119 HVCALL_GET_VP_REGISTERS, 120 HVCALL_SET_VP_REGISTERS, 121 HVCALL_TRANSLATE_VIRTUAL_ADDRESS, 122 HVCALL_CLEAR_VIRTUAL_INTERRUPT, 123 HVCALL_SCRUB_PARTITION, 124 HVCALL_REGISTER_INTERCEPT_RESULT, 125 HVCALL_ASSERT_VIRTUAL_INTERRUPT, 126 HVCALL_GET_GPA_PAGES_ACCESS_STATES, 127 HVCALL_SIGNAL_EVENT_DIRECT, 128 HVCALL_POST_MESSAGE_DIRECT, 129 HVCALL_GET_VP_CPUID_VALUES, 130 }; 131 132 /* 133 * Only allow hypercalls that are safe to be called by the VMM with the host 134 * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a 135 * hypercall cannot be misused by the VMM before adding it to this list. 136 */ 137 static u16 mshv_self_passthru_hvcalls[] = { 138 HVCALL_GET_PARTITION_PROPERTY, 139 HVCALL_GET_PARTITION_PROPERTY_EX, 140 }; 141 142 static bool mshv_hvcall_is_async(u16 code) 143 { 144 switch (code) { 145 case HVCALL_SET_PARTITION_PROPERTY: 146 return true; 147 default: 148 break; 149 } 150 return false; 151 } 152 153 static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id) 154 { 155 int i; 156 int n = ARRAY_SIZE(mshv_passthru_hvcalls); 157 u16 *allowed_hvcalls = mshv_passthru_hvcalls; 158 159 if (pt_id == HV_PARTITION_ID_SELF) { 160 n = ARRAY_SIZE(mshv_self_passthru_hvcalls); 161 allowed_hvcalls = mshv_self_passthru_hvcalls; 162 } 163 164 for (i = 0; i < n; ++i) 165 if (allowed_hvcalls[i] == code) 166 return true; 167 168 return false; 169 } 170 171 static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, 172 bool partition_locked, 173 void __user *user_args) 174 { 175 u64 status; 176 int ret = 0; 177 bool is_async; 178 struct mshv_root_hvcall args; 179 struct page *page; 180 unsigned int pages_order; 181 void *input_pg = NULL; 182 void *output_pg = NULL; 183 u16 reps_completed; 184 u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF; 185 186 if (copy_from_user(&args, user_args, sizeof(args))) 187 return -EFAULT; 188 189 if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) || 190 mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE) 191 return -EINVAL; 192 193 if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE)) 194 return -EINVAL; 195 196 if (!mshv_passthru_hvcall_allowed(args.code, pt_id)) 197 return -EINVAL; 198 199 is_async = mshv_hvcall_is_async(args.code); 200 if (is_async) { 201 /* async hypercalls can only be called from partition fd */ 202 if (!partition || !partition_locked) 203 return -EINVAL; 204 ret = mshv_init_async_handler(partition); 205 if (ret) 206 return ret; 207 } 208 209 pages_order = args.out_ptr ? 1 : 0; 210 page = alloc_pages(GFP_KERNEL, pages_order); 211 if (!page) 212 return -ENOMEM; 213 input_pg = page_address(page); 214 215 if (args.out_ptr) 216 output_pg = (char *)input_pg + PAGE_SIZE; 217 else 218 output_pg = NULL; 219 220 if (copy_from_user(input_pg, (void __user *)args.in_ptr, 221 args.in_sz)) { 222 ret = -EFAULT; 223 goto free_pages_out; 224 } 225 226 /* 227 * NOTE: This only works because all the allowed hypercalls' input 228 * structs begin with a u64 partition_id field. 
229 */ 230 *(u64 *)input_pg = pt_id; 231 232 reps_completed = 0; 233 do { 234 if (args.reps) { 235 status = hv_do_rep_hypercall_ex(args.code, args.reps, 236 0, reps_completed, 237 input_pg, output_pg); 238 reps_completed = hv_repcomp(status); 239 } else { 240 status = hv_do_hypercall(args.code, input_pg, output_pg); 241 } 242 243 if (hv_result(status) == HV_STATUS_CALL_PENDING) { 244 if (is_async) { 245 mshv_async_hvcall_handler(partition, &status); 246 } else { /* Paranoia check. This shouldn't happen! */ 247 ret = -EBADFD; 248 goto free_pages_out; 249 } 250 } 251 252 if (hv_result_success(status)) 253 break; 254 255 if (!hv_result_needs_memory(status)) 256 ret = hv_result_to_errno(status); 257 else 258 ret = hv_deposit_memory(pt_id, status); 259 } while (!ret); 260 261 args.status = hv_result(status); 262 args.reps = reps_completed; 263 if (copy_to_user(user_args, &args, sizeof(args))) 264 ret = -EFAULT; 265 266 if (!ret && output_pg && 267 copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz)) 268 ret = -EFAULT; 269 270 free_pages_out: 271 free_pages((unsigned long)input_pg, pages_order); 272 273 return ret; 274 } 275 276 static inline bool is_ghcb_mapping_available(void) 277 { 278 #if IS_ENABLED(CONFIG_X86_64) 279 return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE; 280 #else 281 return 0; 282 #endif 283 } 284 285 static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count, 286 struct hv_register_assoc *registers) 287 { 288 return hv_call_get_vp_registers(vp_index, partition_id, 289 count, input_vtl_zero, registers); 290 } 291 292 static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count, 293 struct hv_register_assoc *registers) 294 { 295 return hv_call_set_vp_registers(vp_index, partition_id, 296 count, input_vtl_zero, registers); 297 } 298 299 /* 300 * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by 301 * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend, 302 * done by the hypervisor. 303 * "Intercept" suspend leads to asynchronous message delivery to dom0 which 304 * should be awaited to keep the VP loop consistent (i.e. no message pending 305 * upon VP resume). 306 * VP intercept suspend can't be done when the VP is explicitly suspended 307 * already, and thus can be only two possible race scenarios: 308 * 1. implicit suspend bit set -> explicit suspend bit set -> message sent 309 * 2. implicit suspend bit set -> message sent -> explicit suspend bit set 310 * Checking for implicit suspend bit set after explicit suspend request has 311 * succeeded in either case allows us to reliably identify, if there is a 312 * message to receive and deliver to VMM. 
313 */ 314 static int 315 mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight) 316 { 317 struct hv_register_assoc explicit_suspend = { 318 .name = HV_REGISTER_EXPLICIT_SUSPEND 319 }; 320 struct hv_register_assoc intercept_suspend = { 321 .name = HV_REGISTER_INTERCEPT_SUSPEND 322 }; 323 union hv_explicit_suspend_register *es = 324 &explicit_suspend.value.explicit_suspend; 325 union hv_intercept_suspend_register *is = 326 &intercept_suspend.value.intercept_suspend; 327 int ret; 328 329 es->suspended = 1; 330 331 ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, 332 1, &explicit_suspend); 333 if (ret) { 334 vp_err(vp, "Failed to explicitly suspend vCPU\n"); 335 return ret; 336 } 337 338 ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id, 339 1, &intercept_suspend); 340 if (ret) { 341 vp_err(vp, "Failed to get intercept suspend state\n"); 342 return ret; 343 } 344 345 *message_in_flight = is->suspended; 346 347 return 0; 348 } 349 350 /* 351 * This function is used when VPs are scheduled by the hypervisor's 352 * scheduler. 353 * 354 * Caller has to make sure the registers contain cleared 355 * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers 356 * exactly in this order (the hypervisor clears them sequentially) to avoid 357 * potential invalid clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND 358 * after VP is released from HV_REGISTER_EXPLICIT_SUSPEND in case of the 359 * opposite order. 360 */ 361 static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp) 362 { 363 long ret; 364 struct hv_register_assoc suspend_regs[2] = { 365 { .name = HV_REGISTER_INTERCEPT_SUSPEND }, 366 { .name = HV_REGISTER_EXPLICIT_SUSPEND } 367 }; 368 size_t count = ARRAY_SIZE(suspend_regs); 369 370 /* Resume VP execution */ 371 ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, 372 count, suspend_regs); 373 if (ret) { 374 vp_err(vp, "Failed to resume vp execution. %lx\n", ret); 375 return ret; 376 } 377 378 ret = wait_event_interruptible(vp->run.vp_suspend_queue, 379 vp->run.kicked_by_hv == 1); 380 if (ret) { 381 bool message_in_flight; 382 383 /* 384 * Otherwise the waiting was interrupted by a signal: suspend 385 * the vCPU explicitly and copy message in flight (if any). 386 */ 387 ret = mshv_suspend_vp(vp, &message_in_flight); 388 if (ret) 389 return ret; 390 391 /* Return if no message in flight */ 392 if (!message_in_flight) 393 return -EINTR; 394 395 /* Wait for the message in flight. */ 396 wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1); 397 } 398 399 /* 400 * Reset the flag to make the wait_event call above work 401 * next time. 
402 */ 403 vp->run.kicked_by_hv = 0; 404 405 return 0; 406 } 407 408 static int 409 mshv_vp_dispatch(struct mshv_vp *vp, u32 flags, 410 struct hv_output_dispatch_vp *res) 411 { 412 struct hv_input_dispatch_vp *input; 413 struct hv_output_dispatch_vp *output; 414 u64 status; 415 416 preempt_disable(); 417 input = *this_cpu_ptr(root_scheduler_input); 418 output = *this_cpu_ptr(root_scheduler_output); 419 420 memset(input, 0, sizeof(*input)); 421 memset(output, 0, sizeof(*output)); 422 423 input->partition_id = vp->vp_partition->pt_id; 424 input->vp_index = vp->vp_index; 425 input->time_slice = 0; /* Run forever until something happens */ 426 input->spec_ctrl = 0; /* TODO: set sensible flags */ 427 input->flags = flags; 428 429 vp->run.flags.root_sched_dispatched = 1; 430 status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output); 431 vp->run.flags.root_sched_dispatched = 0; 432 433 *res = *output; 434 preempt_enable(); 435 436 if (!hv_result_success(status)) 437 vp_err(vp, "%s: status %s\n", __func__, 438 hv_result_to_string(status)); 439 440 return hv_result_to_errno(status); 441 } 442 443 static int 444 mshv_vp_clear_explicit_suspend(struct mshv_vp *vp) 445 { 446 struct hv_register_assoc explicit_suspend = { 447 .name = HV_REGISTER_EXPLICIT_SUSPEND, 448 .value.explicit_suspend.suspended = 0, 449 }; 450 int ret; 451 452 ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, 453 1, &explicit_suspend); 454 455 if (ret) 456 vp_err(vp, "Failed to unsuspend\n"); 457 458 return ret; 459 } 460 461 #if IS_ENABLED(CONFIG_X86_64) 462 static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp) 463 { 464 if (!vp->vp_register_page) 465 return 0; 466 return vp->vp_register_page->interrupt_vectors.as_uint64; 467 } 468 #else 469 static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp) 470 { 471 return 0; 472 } 473 #endif 474 475 static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp) 476 { 477 struct hv_stats_page **stats = vp->vp_stats_pages; 478 u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->data; 479 u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->data; 480 481 return parent_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED] || 482 self_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED]; 483 } 484 485 static int 486 mshv_vp_wait_for_hv_kick(struct mshv_vp *vp) 487 { 488 int ret; 489 490 ret = wait_event_interruptible(vp->run.vp_suspend_queue, 491 (vp->run.kicked_by_hv == 1 && 492 !mshv_vp_dispatch_thread_blocked(vp)) || 493 mshv_vp_interrupt_pending(vp)); 494 if (ret) 495 return -EINTR; 496 497 vp->run.flags.root_sched_blocked = 0; 498 vp->run.kicked_by_hv = 0; 499 500 return 0; 501 } 502 503 /* Must be called with interrupts enabled */ 504 static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp) 505 { 506 long ret; 507 508 if (vp->run.flags.root_sched_blocked) { 509 /* 510 * Dispatch state of this VP is blocked. Need to wait 511 * for the hypervisor to clear the blocked state before 512 * dispatching it. 
513 */ 514 ret = mshv_vp_wait_for_hv_kick(vp); 515 if (ret) 516 return ret; 517 } 518 519 do { 520 u32 flags = 0; 521 struct hv_output_dispatch_vp output; 522 523 if (__xfer_to_guest_mode_work_pending()) { 524 ret = xfer_to_guest_mode_handle_work(); 525 if (ret) 526 break; 527 } 528 529 if (vp->run.flags.intercept_suspend) 530 flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND; 531 532 if (mshv_vp_interrupt_pending(vp)) 533 flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION; 534 535 ret = mshv_vp_dispatch(vp, flags, &output); 536 if (ret) 537 break; 538 539 vp->run.flags.intercept_suspend = 0; 540 541 if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) { 542 if (output.dispatch_event == 543 HV_VP_DISPATCH_EVENT_SUSPEND) { 544 /* 545 * TODO: remove the warning once VP canceling 546 * is supported 547 */ 548 WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count), 549 "%s: vp#%d: unexpected explicit suspend\n", 550 __func__, vp->vp_index); 551 /* 552 * Need to clear explicit suspend before 553 * dispatching. 554 * Explicit suspend is either: 555 * - set right after the first VP dispatch or 556 * - set explicitly via hypercall 557 * Since the latter case is not yet supported, 558 * simply clear it here. 559 */ 560 ret = mshv_vp_clear_explicit_suspend(vp); 561 if (ret) 562 break; 563 564 ret = mshv_vp_wait_for_hv_kick(vp); 565 if (ret) 566 break; 567 } else { 568 vp->run.flags.root_sched_blocked = 1; 569 ret = mshv_vp_wait_for_hv_kick(vp); 570 if (ret) 571 break; 572 } 573 } else { 574 /* HV_VP_DISPATCH_STATE_READY */ 575 if (output.dispatch_event == 576 HV_VP_DISPATCH_EVENT_INTERCEPT) 577 vp->run.flags.intercept_suspend = 1; 578 } 579 } while (!vp->run.flags.intercept_suspend); 580 581 rseq_virt_userspace_exit(); 582 583 return ret; 584 } 585 586 static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ, 587 "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ"); 588 589 static struct mshv_mem_region * 590 mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) 591 { 592 struct mshv_mem_region *region; 593 594 hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { 595 if (gfn >= region->start_gfn && 596 gfn < region->start_gfn + region->nr_pages) 597 return region; 598 } 599 600 return NULL; 601 } 602 603 static struct mshv_mem_region * 604 mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn) 605 { 606 struct mshv_mem_region *region; 607 608 spin_lock(&p->pt_mem_regions_lock); 609 region = mshv_partition_region_by_gfn(p, gfn); 610 if (!region || !mshv_region_get(region)) { 611 spin_unlock(&p->pt_mem_regions_lock); 612 return NULL; 613 } 614 spin_unlock(&p->pt_mem_regions_lock); 615 616 return region; 617 } 618 619 /** 620 * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts. 621 * @vp: Pointer to the virtual processor structure. 622 * 623 * This function processes GPA intercepts by identifying the memory region 624 * corresponding to the intercepted GPA, aligning the page offset, and 625 * mapping the required pages. It ensures that the region is valid and 626 * handles faults efficiently by mapping multiple pages at once. 627 * 628 * Return: true if the intercept was handled successfully, false otherwise. 
629 */ 630 static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) 631 { 632 struct mshv_partition *p = vp->vp_partition; 633 struct mshv_mem_region *region; 634 bool ret; 635 u64 gfn; 636 #if defined(CONFIG_X86_64) 637 struct hv_x64_memory_intercept_message *msg = 638 (struct hv_x64_memory_intercept_message *) 639 vp->vp_intercept_msg_page->u.payload; 640 #elif defined(CONFIG_ARM64) 641 struct hv_arm64_memory_intercept_message *msg = 642 (struct hv_arm64_memory_intercept_message *) 643 vp->vp_intercept_msg_page->u.payload; 644 #endif 645 646 gfn = HVPFN_DOWN(msg->guest_physical_address); 647 648 region = mshv_partition_region_by_gfn_get(p, gfn); 649 if (!region) 650 return false; 651 652 /* Only movable memory ranges are supported for GPA intercepts */ 653 if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE) 654 ret = mshv_region_handle_gfn_fault(region, gfn); 655 else 656 ret = false; 657 658 mshv_region_put(region); 659 660 return ret; 661 } 662 663 static bool mshv_vp_handle_intercept(struct mshv_vp *vp) 664 { 665 switch (vp->vp_intercept_msg_page->header.message_type) { 666 case HVMSG_GPA_INTERCEPT: 667 return mshv_handle_gpa_intercept(vp); 668 } 669 return false; 670 } 671 672 static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg) 673 { 674 long rc; 675 676 do { 677 if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) 678 rc = mshv_run_vp_with_root_scheduler(vp); 679 else 680 rc = mshv_run_vp_with_hyp_scheduler(vp); 681 } while (rc == 0 && mshv_vp_handle_intercept(vp)); 682 683 if (rc) 684 return rc; 685 686 if (copy_to_user(ret_msg, vp->vp_intercept_msg_page, 687 sizeof(struct hv_message))) 688 rc = -EFAULT; 689 690 return rc; 691 } 692 693 static int 694 mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp, 695 struct hv_vp_state_data state_data, 696 unsigned long user_pfn, size_t page_count, 697 bool is_set) 698 { 699 int completed, ret = 0; 700 unsigned long check; 701 struct page **pages; 702 703 if (page_count > INT_MAX) 704 return -EINVAL; 705 /* 706 * Check the arithmetic for wraparound/overflow. 
707 * The last page address in the buffer is: 708 * (user_pfn + (page_count - 1)) * PAGE_SIZE 709 */ 710 if (check_add_overflow(user_pfn, (page_count - 1), &check)) 711 return -EOVERFLOW; 712 if (check_mul_overflow(check, PAGE_SIZE, &check)) 713 return -EOVERFLOW; 714 715 /* Pin user pages so hypervisor can copy directly to them */ 716 pages = kcalloc(page_count, sizeof(struct page *), GFP_KERNEL); 717 if (!pages) 718 return -ENOMEM; 719 720 for (completed = 0; completed < page_count; completed += ret) { 721 unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE; 722 int remaining = page_count - completed; 723 724 ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE, 725 &pages[completed]); 726 if (ret < 0) { 727 vp_err(vp, "%s: Failed to pin user pages error %i\n", 728 __func__, ret); 729 goto unpin_pages; 730 } 731 } 732 733 if (is_set) 734 ret = hv_call_set_vp_state(vp->vp_index, 735 vp->vp_partition->pt_id, 736 state_data, page_count, pages, 737 0, NULL); 738 else 739 ret = hv_call_get_vp_state(vp->vp_index, 740 vp->vp_partition->pt_id, 741 state_data, page_count, pages, 742 NULL); 743 744 unpin_pages: 745 unpin_user_pages(pages, completed); 746 kfree(pages); 747 return ret; 748 } 749 750 static long 751 mshv_vp_ioctl_get_set_state(struct mshv_vp *vp, 752 struct mshv_get_set_vp_state __user *user_args, 753 bool is_set) 754 { 755 struct mshv_get_set_vp_state args; 756 long ret = 0; 757 union hv_output_get_vp_state vp_state; 758 u32 data_sz; 759 struct hv_vp_state_data state_data = {}; 760 761 if (copy_from_user(&args, user_args, sizeof(args))) 762 return -EFAULT; 763 764 if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) || 765 !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) || 766 !PAGE_ALIGNED(args.buf_ptr)) 767 return -EINVAL; 768 769 if (!access_ok((void __user *)args.buf_ptr, args.buf_sz)) 770 return -EFAULT; 771 772 switch (args.type) { 773 case MSHV_VP_STATE_LAPIC: 774 state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE; 775 data_sz = HV_HYP_PAGE_SIZE; 776 break; 777 case MSHV_VP_STATE_XSAVE: 778 { 779 u64 data_sz_64; 780 781 ret = hv_call_get_partition_property(vp->vp_partition->pt_id, 782 HV_PARTITION_PROPERTY_XSAVE_STATES, 783 &state_data.xsave.states.as_uint64); 784 if (ret) 785 return ret; 786 787 ret = hv_call_get_partition_property(vp->vp_partition->pt_id, 788 HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE, 789 &data_sz_64); 790 if (ret) 791 return ret; 792 793 data_sz = (u32)data_sz_64; 794 state_data.xsave.flags = 0; 795 /* Always request legacy states */ 796 state_data.xsave.states.legacy_x87 = 1; 797 state_data.xsave.states.legacy_sse = 1; 798 state_data.type = HV_GET_SET_VP_STATE_XSAVE; 799 break; 800 } 801 case MSHV_VP_STATE_SIMP: 802 state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE; 803 data_sz = HV_HYP_PAGE_SIZE; 804 break; 805 case MSHV_VP_STATE_SIEFP: 806 state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE; 807 data_sz = HV_HYP_PAGE_SIZE; 808 break; 809 case MSHV_VP_STATE_SYNTHETIC_TIMERS: 810 state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS; 811 data_sz = sizeof(vp_state.synthetic_timers_state); 812 break; 813 default: 814 return -EINVAL; 815 } 816 817 if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz))) 818 return -EFAULT; 819 820 if (data_sz > args.buf_sz) 821 return -EINVAL; 822 823 /* If the data is transmitted via pfns, delegate to helper */ 824 if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) { 825 unsigned long user_pfn = PFN_DOWN(args.buf_ptr); 826 size_t page_count = PFN_DOWN(args.buf_sz); 827 828 return 
mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn, 829 page_count, is_set); 830 } 831 832 /* Paranoia check - this shouldn't happen! */ 833 if (data_sz > sizeof(vp_state)) { 834 vp_err(vp, "Invalid vp state data size!\n"); 835 return -EINVAL; 836 } 837 838 if (is_set) { 839 if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz)) 840 return -EFAULT; 841 842 return hv_call_set_vp_state(vp->vp_index, 843 vp->vp_partition->pt_id, 844 state_data, 0, NULL, 845 sizeof(vp_state), (u8 *)&vp_state); 846 } 847 848 ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id, 849 state_data, 0, NULL, &vp_state); 850 if (ret) 851 return ret; 852 853 if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz)) 854 return -EFAULT; 855 856 return 0; 857 } 858 859 static long 860 mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) 861 { 862 struct mshv_vp *vp = filp->private_data; 863 long r = -ENOTTY; 864 865 if (mutex_lock_killable(&vp->vp_mutex)) 866 return -EINTR; 867 868 switch (ioctl) { 869 case MSHV_RUN_VP: 870 r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg); 871 break; 872 case MSHV_GET_VP_STATE: 873 r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false); 874 break; 875 case MSHV_SET_VP_STATE: 876 r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true); 877 break; 878 case MSHV_ROOT_HVCALL: 879 r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false, 880 (void __user *)arg); 881 break; 882 default: 883 vp_warn(vp, "Invalid ioctl: %#x\n", ioctl); 884 break; 885 } 886 mutex_unlock(&vp->vp_mutex); 887 888 return r; 889 } 890 891 static vm_fault_t mshv_vp_fault(struct vm_fault *vmf) 892 { 893 struct mshv_vp *vp = vmf->vma->vm_file->private_data; 894 895 switch (vmf->vma->vm_pgoff) { 896 case MSHV_VP_MMAP_OFFSET_REGISTERS: 897 vmf->page = virt_to_page(vp->vp_register_page); 898 break; 899 case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: 900 vmf->page = virt_to_page(vp->vp_intercept_msg_page); 901 break; 902 case MSHV_VP_MMAP_OFFSET_GHCB: 903 vmf->page = virt_to_page(vp->vp_ghcb_page); 904 break; 905 default: 906 return VM_FAULT_SIGBUS; 907 } 908 909 get_page(vmf->page); 910 911 return 0; 912 } 913 914 static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma) 915 { 916 struct mshv_vp *vp = file->private_data; 917 918 switch (vma->vm_pgoff) { 919 case MSHV_VP_MMAP_OFFSET_REGISTERS: 920 if (!vp->vp_register_page) 921 return -ENODEV; 922 break; 923 case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: 924 if (!vp->vp_intercept_msg_page) 925 return -ENODEV; 926 break; 927 case MSHV_VP_MMAP_OFFSET_GHCB: 928 if (!vp->vp_ghcb_page) 929 return -ENODEV; 930 break; 931 default: 932 return -EINVAL; 933 } 934 935 vma->vm_ops = &mshv_vp_vm_ops; 936 return 0; 937 } 938 939 static int 940 mshv_vp_release(struct inode *inode, struct file *filp) 941 { 942 struct mshv_vp *vp = filp->private_data; 943 944 /* Rest of VP cleanup happens in destroy_partition() */ 945 mshv_partition_put(vp->vp_partition); 946 return 0; 947 } 948 949 void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index, 950 struct hv_stats_page *stats_pages[]) 951 { 952 union hv_stats_object_identity identity = { 953 .vp.partition_id = partition_id, 954 .vp.vp_index = vp_index, 955 }; 956 int err; 957 958 identity.vp.stats_area_type = HV_STATS_AREA_SELF; 959 err = hv_unmap_stats_page(HV_STATS_OBJECT_VP, 960 stats_pages[HV_STATS_AREA_SELF], 961 &identity); 962 if (err) 963 pr_err("%s: failed to unmap partition %llu vp %u self stats, err: %d\n", 964 __func__, partition_id, vp_index, err); 965 966 if 
(stats_pages[HV_STATS_AREA_PARENT] != stats_pages[HV_STATS_AREA_SELF]) { 967 identity.vp.stats_area_type = HV_STATS_AREA_PARENT; 968 err = hv_unmap_stats_page(HV_STATS_OBJECT_VP, 969 stats_pages[HV_STATS_AREA_PARENT], 970 &identity); 971 if (err) 972 pr_err("%s: failed to unmap partition %llu vp %u parent stats, err: %d\n", 973 __func__, partition_id, vp_index, err); 974 } 975 } 976 977 int mshv_vp_stats_map(u64 partition_id, u32 vp_index, 978 struct hv_stats_page *stats_pages[]) 979 { 980 union hv_stats_object_identity identity = { 981 .vp.partition_id = partition_id, 982 .vp.vp_index = vp_index, 983 }; 984 int err; 985 986 identity.vp.stats_area_type = HV_STATS_AREA_SELF; 987 err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, 988 &stats_pages[HV_STATS_AREA_SELF]); 989 if (err) { 990 pr_err("%s: failed to map partition %llu vp %u self stats, err: %d\n", 991 __func__, partition_id, vp_index, err); 992 return err; 993 } 994 995 /* 996 * L1VH partition cannot access its vp stats in parent area. 997 */ 998 if (is_l1vh_parent(partition_id)) { 999 stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF]; 1000 } else { 1001 identity.vp.stats_area_type = HV_STATS_AREA_PARENT; 1002 err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, 1003 &stats_pages[HV_STATS_AREA_PARENT]); 1004 if (err) { 1005 pr_err("%s: failed to map partition %llu vp %u parent stats, err: %d\n", 1006 __func__, partition_id, vp_index, err); 1007 goto unmap_self; 1008 } 1009 if (!stats_pages[HV_STATS_AREA_PARENT]) 1010 stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF]; 1011 } 1012 1013 return 0; 1014 1015 unmap_self: 1016 identity.vp.stats_area_type = HV_STATS_AREA_SELF; 1017 hv_unmap_stats_page(HV_STATS_OBJECT_VP, 1018 stats_pages[HV_STATS_AREA_SELF], 1019 &identity); 1020 return err; 1021 } 1022 1023 static long 1024 mshv_partition_ioctl_create_vp(struct mshv_partition *partition, 1025 void __user *arg) 1026 { 1027 struct mshv_create_vp args; 1028 struct mshv_vp *vp; 1029 struct page *intercept_msg_page, *register_page, *ghcb_page; 1030 struct hv_stats_page *stats_pages[2]; 1031 long ret; 1032 1033 if (copy_from_user(&args, arg, sizeof(args))) 1034 return -EFAULT; 1035 1036 if (args.vp_index >= MSHV_MAX_VPS) 1037 return -EINVAL; 1038 1039 if (partition->pt_vp_array[args.vp_index]) 1040 return -EEXIST; 1041 1042 ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index, 1043 0 /* Only valid for root partition VPs */); 1044 if (ret) 1045 return ret; 1046 1047 ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, 1048 HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, 1049 input_vtl_zero, &intercept_msg_page); 1050 if (ret) 1051 goto destroy_vp; 1052 1053 if (!mshv_partition_encrypted(partition)) { 1054 ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, 1055 HV_VP_STATE_PAGE_REGISTERS, 1056 input_vtl_zero, ®ister_page); 1057 if (ret) 1058 goto unmap_intercept_message_page; 1059 } 1060 1061 if (mshv_partition_encrypted(partition) && 1062 is_ghcb_mapping_available()) { 1063 ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, 1064 HV_VP_STATE_PAGE_GHCB, 1065 input_vtl_normal, &ghcb_page); 1066 if (ret) 1067 goto unmap_register_page; 1068 } 1069 1070 ret = mshv_vp_stats_map(partition->pt_id, args.vp_index, 1071 stats_pages); 1072 if (ret) 1073 goto unmap_ghcb_page; 1074 1075 vp = kzalloc(sizeof(*vp), GFP_KERNEL); 1076 if (!vp) 1077 goto unmap_stats_pages; 1078 1079 vp->vp_partition = mshv_partition_get(partition); 1080 if (!vp->vp_partition) { 1081 ret = -EBADF; 1082 goto 
free_vp; 1083 } 1084 1085 mutex_init(&vp->vp_mutex); 1086 init_waitqueue_head(&vp->run.vp_suspend_queue); 1087 atomic64_set(&vp->run.vp_signaled_count, 0); 1088 1089 vp->vp_index = args.vp_index; 1090 vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page); 1091 if (!mshv_partition_encrypted(partition)) 1092 vp->vp_register_page = page_to_virt(register_page); 1093 1094 if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) 1095 vp->vp_ghcb_page = page_to_virt(ghcb_page); 1096 1097 memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages)); 1098 1099 ret = mshv_debugfs_vp_create(vp); 1100 if (ret) 1101 goto put_partition; 1102 1103 /* 1104 * Keep anon_inode_getfd last: it installs fd in the file struct and 1105 * thus makes the state accessible in user space. 1106 */ 1107 ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp, 1108 O_RDWR | O_CLOEXEC); 1109 if (ret < 0) 1110 goto remove_debugfs_vp; 1111 1112 /* already exclusive with the partition mutex for all ioctls */ 1113 partition->pt_vp_count++; 1114 partition->pt_vp_array[args.vp_index] = vp; 1115 1116 return ret; 1117 1118 remove_debugfs_vp: 1119 mshv_debugfs_vp_remove(vp); 1120 put_partition: 1121 mshv_partition_put(partition); 1122 free_vp: 1123 kfree(vp); 1124 unmap_stats_pages: 1125 mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages); 1126 unmap_ghcb_page: 1127 if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) 1128 hv_unmap_vp_state_page(partition->pt_id, args.vp_index, 1129 HV_VP_STATE_PAGE_GHCB, ghcb_page, 1130 input_vtl_normal); 1131 unmap_register_page: 1132 if (!mshv_partition_encrypted(partition)) 1133 hv_unmap_vp_state_page(partition->pt_id, args.vp_index, 1134 HV_VP_STATE_PAGE_REGISTERS, 1135 register_page, input_vtl_zero); 1136 unmap_intercept_message_page: 1137 hv_unmap_vp_state_page(partition->pt_id, args.vp_index, 1138 HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, 1139 intercept_msg_page, input_vtl_zero); 1140 destroy_vp: 1141 hv_call_delete_vp(partition->pt_id, args.vp_index); 1142 return ret; 1143 } 1144 1145 static int mshv_init_async_handler(struct mshv_partition *partition) 1146 { 1147 if (completion_done(&partition->async_hypercall)) { 1148 pt_err(partition, 1149 "Cannot issue async hypercall while another one in progress!\n"); 1150 return -EPERM; 1151 } 1152 1153 reinit_completion(&partition->async_hypercall); 1154 return 0; 1155 } 1156 1157 static void mshv_async_hvcall_handler(void *data, u64 *status) 1158 { 1159 struct mshv_partition *partition = data; 1160 1161 wait_for_completion(&partition->async_hypercall); 1162 pt_dbg(partition, "Async hypercall completed!\n"); 1163 1164 *status = partition->async_hypercall_status; 1165 } 1166 1167 /* 1168 * NB: caller checks and makes sure mem->size is page aligned 1169 * Returns: 0 with regionpp updated on success, or -errno 1170 */ 1171 static int mshv_partition_create_region(struct mshv_partition *partition, 1172 struct mshv_user_mem_region *mem, 1173 struct mshv_mem_region **regionpp, 1174 bool is_mmio) 1175 { 1176 struct mshv_mem_region *rg; 1177 u64 nr_pages = HVPFN_DOWN(mem->size); 1178 1179 /* Reject overlapping regions */ 1180 spin_lock(&partition->pt_mem_regions_lock); 1181 hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) { 1182 if (mem->guest_pfn + nr_pages <= rg->start_gfn || 1183 rg->start_gfn + rg->nr_pages <= mem->guest_pfn) 1184 continue; 1185 spin_unlock(&partition->pt_mem_regions_lock); 1186 return -EEXIST; 1187 } 1188 spin_unlock(&partition->pt_mem_regions_lock); 1189 1190 rg = 
mshv_region_create(mem->guest_pfn, nr_pages, 1191 mem->userspace_addr, mem->flags); 1192 if (IS_ERR(rg)) 1193 return PTR_ERR(rg); 1194 1195 if (is_mmio) 1196 rg->mreg_type = MSHV_REGION_TYPE_MMIO; 1197 else if (mshv_partition_encrypted(partition) || 1198 !mshv_region_movable_init(rg)) 1199 rg->mreg_type = MSHV_REGION_TYPE_MEM_PINNED; 1200 else 1201 rg->mreg_type = MSHV_REGION_TYPE_MEM_MOVABLE; 1202 1203 rg->partition = partition; 1204 1205 *regionpp = rg; 1206 1207 return 0; 1208 } 1209 1210 /** 1211 * mshv_prepare_pinned_region - Pin and map memory regions 1212 * @region: Pointer to the memory region structure 1213 * 1214 * This function processes memory regions that are explicitly marked as pinned. 1215 * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based 1216 * population. The function ensures the region is properly populated, handles 1217 * encryption requirements for SNP partitions if applicable, maps the region, 1218 * and performs necessary sharing or eviction operations based on the mapping 1219 * result. 1220 * 1221 * Return: 0 on success, negative error code on failure. 1222 */ 1223 static int mshv_prepare_pinned_region(struct mshv_mem_region *region) 1224 { 1225 struct mshv_partition *partition = region->partition; 1226 int ret; 1227 1228 ret = mshv_region_pin(region); 1229 if (ret) { 1230 pt_err(partition, "Failed to pin memory region: %d\n", 1231 ret); 1232 goto err_out; 1233 } 1234 1235 /* 1236 * For an SNP partition it is a requirement that for every memory region 1237 * that we are going to map for this partition we should make sure that 1238 * host access to that region is released. This is ensured by doing an 1239 * additional hypercall which will update the SLAT to release host 1240 * access to guest memory regions. 1241 */ 1242 if (mshv_partition_encrypted(partition)) { 1243 ret = mshv_region_unshare(region); 1244 if (ret) { 1245 pt_err(partition, 1246 "Failed to unshare memory region (guest_pfn: %llu): %d\n", 1247 region->start_gfn, ret); 1248 goto invalidate_region; 1249 } 1250 } 1251 1252 ret = mshv_region_map(region); 1253 if (ret && mshv_partition_encrypted(partition)) { 1254 int shrc; 1255 1256 shrc = mshv_region_share(region); 1257 if (!shrc) 1258 goto invalidate_region; 1259 1260 pt_err(partition, 1261 "Failed to share memory region (guest_pfn: %llu): %d\n", 1262 region->start_gfn, shrc); 1263 /* 1264 * Don't unpin if marking shared failed because pages are no 1265 * longer mapped in the host, ie root, anymore. 1266 */ 1267 goto err_out; 1268 } 1269 1270 return 0; 1271 1272 invalidate_region: 1273 mshv_region_invalidate(region); 1274 err_out: 1275 return ret; 1276 } 1277 1278 /* 1279 * This maps two things: guest RAM and for pci passthru mmio space. 1280 * 1281 * mmio: 1282 * - vfio overloads vm_pgoff to store the mmio start pfn/spa. 1283 * - Two things need to happen for mapping mmio range: 1284 * 1. mapped in the uaddr so VMM can access it. 1285 * 2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it. 1286 * 1287 * This function takes care of the second. The first one is managed by vfio, 1288 * and hence is taken care of via vfio_pci_mmap_fault(). 
1289 */ 1290 static long 1291 mshv_map_user_memory(struct mshv_partition *partition, 1292 struct mshv_user_mem_region mem) 1293 { 1294 struct mshv_mem_region *region; 1295 struct vm_area_struct *vma; 1296 bool is_mmio; 1297 ulong mmio_pfn; 1298 long ret; 1299 1300 if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) || 1301 !access_ok((const void __user *)mem.userspace_addr, mem.size)) 1302 return -EINVAL; 1303 1304 mmap_read_lock(current->mm); 1305 vma = vma_lookup(current->mm, mem.userspace_addr); 1306 is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0; 1307 mmio_pfn = is_mmio ? vma->vm_pgoff : 0; 1308 mmap_read_unlock(current->mm); 1309 1310 if (!vma) 1311 return -EINVAL; 1312 1313 ret = mshv_partition_create_region(partition, &mem, ®ion, 1314 is_mmio); 1315 if (ret) 1316 return ret; 1317 1318 switch (region->mreg_type) { 1319 case MSHV_REGION_TYPE_MEM_PINNED: 1320 ret = mshv_prepare_pinned_region(region); 1321 break; 1322 case MSHV_REGION_TYPE_MEM_MOVABLE: 1323 /* 1324 * For movable memory regions, remap with no access to let 1325 * the hypervisor track dirty pages, enabling pre-copy live 1326 * migration. 1327 */ 1328 ret = hv_call_map_gpa_pages(partition->pt_id, 1329 region->start_gfn, 1330 region->nr_pages, 1331 HV_MAP_GPA_NO_ACCESS, NULL); 1332 break; 1333 case MSHV_REGION_TYPE_MMIO: 1334 ret = hv_call_map_mmio_pages(partition->pt_id, 1335 region->start_gfn, 1336 mmio_pfn, 1337 region->nr_pages); 1338 break; 1339 } 1340 1341 if (ret) 1342 goto errout; 1343 1344 spin_lock(&partition->pt_mem_regions_lock); 1345 hlist_add_head(®ion->hnode, &partition->pt_mem_regions); 1346 spin_unlock(&partition->pt_mem_regions_lock); 1347 1348 return 0; 1349 1350 errout: 1351 vfree(region); 1352 return ret; 1353 } 1354 1355 /* Called for unmapping both the guest ram and the mmio space */ 1356 static long 1357 mshv_unmap_user_memory(struct mshv_partition *partition, 1358 struct mshv_user_mem_region mem) 1359 { 1360 struct mshv_mem_region *region; 1361 1362 if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))) 1363 return -EINVAL; 1364 1365 spin_lock(&partition->pt_mem_regions_lock); 1366 1367 region = mshv_partition_region_by_gfn(partition, mem.guest_pfn); 1368 if (!region) { 1369 spin_unlock(&partition->pt_mem_regions_lock); 1370 return -ENOENT; 1371 } 1372 1373 /* Paranoia check */ 1374 if (region->start_uaddr != mem.userspace_addr || 1375 region->start_gfn != mem.guest_pfn || 1376 region->nr_pages != HVPFN_DOWN(mem.size)) { 1377 spin_unlock(&partition->pt_mem_regions_lock); 1378 return -EINVAL; 1379 } 1380 1381 hlist_del(®ion->hnode); 1382 1383 spin_unlock(&partition->pt_mem_regions_lock); 1384 1385 mshv_region_put(region); 1386 1387 return 0; 1388 } 1389 1390 static long 1391 mshv_partition_ioctl_set_memory(struct mshv_partition *partition, 1392 struct mshv_user_mem_region __user *user_mem) 1393 { 1394 struct mshv_user_mem_region mem; 1395 1396 if (copy_from_user(&mem, user_mem, sizeof(mem))) 1397 return -EFAULT; 1398 1399 if (!mem.size || 1400 !PAGE_ALIGNED(mem.size) || 1401 !PAGE_ALIGNED(mem.userspace_addr) || 1402 (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) || 1403 mshv_field_nonzero(mem, rsvd)) 1404 return -EINVAL; 1405 1406 if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)) 1407 return mshv_unmap_user_memory(partition, mem); 1408 1409 return mshv_map_user_memory(partition, mem); 1410 } 1411 1412 static long 1413 mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition, 1414 void __user *user_args) 1415 { 1416 struct mshv_user_ioeventfd args; 1417 1418 if (copy_from_user(&args, user_args, sizeof(args))) 
1419 return -EFAULT; 1420 1421 return mshv_set_unset_ioeventfd(partition, &args); 1422 } 1423 1424 static long 1425 mshv_partition_ioctl_irqfd(struct mshv_partition *partition, 1426 void __user *user_args) 1427 { 1428 struct mshv_user_irqfd args; 1429 1430 if (copy_from_user(&args, user_args, sizeof(args))) 1431 return -EFAULT; 1432 1433 return mshv_set_unset_irqfd(partition, &args); 1434 } 1435 1436 static long 1437 mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition, 1438 void __user *user_args) 1439 { 1440 struct mshv_gpap_access_bitmap args; 1441 union hv_gpa_page_access_state *states; 1442 long ret, i; 1443 union hv_gpa_page_access_state_flags hv_flags = {}; 1444 u8 hv_type_mask; 1445 ulong bitmap_buf_sz, states_buf_sz; 1446 int written = 0; 1447 1448 if (copy_from_user(&args, user_args, sizeof(args))) 1449 return -EFAULT; 1450 1451 if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT || 1452 args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT || 1453 mshv_field_nonzero(args, rsvd) || !args.page_count || 1454 !args.bitmap_ptr) 1455 return -EINVAL; 1456 1457 if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz)) 1458 return -E2BIG; 1459 1460 /* Num bytes needed to store bitmap; one bit per page rounded up */ 1461 bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8); 1462 1463 /* Sanity check */ 1464 if (bitmap_buf_sz > states_buf_sz) 1465 return -EBADFD; 1466 1467 switch (args.access_type) { 1468 case MSHV_GPAP_ACCESS_TYPE_ACCESSED: 1469 hv_type_mask = 1; 1470 if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) { 1471 hv_flags.clear_accessed = 1; 1472 /* not accessed implies not dirty */ 1473 hv_flags.clear_dirty = 1; 1474 } else { /* MSHV_GPAP_ACCESS_OP_SET */ 1475 hv_flags.set_accessed = 1; 1476 } 1477 break; 1478 case MSHV_GPAP_ACCESS_TYPE_DIRTY: 1479 hv_type_mask = 2; 1480 if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) { 1481 hv_flags.clear_dirty = 1; 1482 } else { /* MSHV_GPAP_ACCESS_OP_SET */ 1483 hv_flags.set_dirty = 1; 1484 /* dirty implies accessed */ 1485 hv_flags.set_accessed = 1; 1486 } 1487 break; 1488 } 1489 1490 states = vzalloc(states_buf_sz); 1491 if (!states) 1492 return -ENOMEM; 1493 1494 ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count, 1495 args.gpap_base, hv_flags, &written, 1496 states); 1497 if (ret) 1498 goto free_return; 1499 1500 /* 1501 * Overwrite states buffer with bitmap - the bits in hv_type_mask 1502 * correspond to bitfields in hv_gpa_page_access_state 1503 */ 1504 for (i = 0; i < written; ++i) 1505 __assign_bit(i, (ulong *)states, 1506 states[i].as_uint8 & hv_type_mask); 1507 1508 /* zero the unused bits in the last byte(s) of the returned bitmap */ 1509 for (i = written; i < bitmap_buf_sz * 8; ++i) 1510 __clear_bit(i, (ulong *)states); 1511 1512 if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz)) 1513 ret = -EFAULT; 1514 1515 free_return: 1516 vfree(states); 1517 return ret; 1518 } 1519 1520 static long 1521 mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition, 1522 void __user *user_args) 1523 { 1524 struct mshv_user_irq_entry *entries = NULL; 1525 struct mshv_user_irq_table args; 1526 long ret; 1527 1528 if (copy_from_user(&args, user_args, sizeof(args))) 1529 return -EFAULT; 1530 1531 if (args.nr > MSHV_MAX_GUEST_IRQS || 1532 mshv_field_nonzero(args, rsvd)) 1533 return -EINVAL; 1534 1535 if (args.nr) { 1536 struct mshv_user_irq_table __user *urouting = user_args; 1537 1538 entries = vmemdup_user(urouting->entries, 1539 array_size(sizeof(*entries), 1540 
args.nr)); 1541 if (IS_ERR(entries)) 1542 return PTR_ERR(entries); 1543 } 1544 ret = mshv_update_routing_table(partition, entries, args.nr); 1545 kvfree(entries); 1546 1547 return ret; 1548 } 1549 1550 static long 1551 mshv_partition_ioctl_initialize(struct mshv_partition *partition) 1552 { 1553 long ret; 1554 1555 if (partition->pt_initialized) 1556 return 0; 1557 1558 ret = hv_call_initialize_partition(partition->pt_id); 1559 if (ret) 1560 goto withdraw_mem; 1561 1562 ret = mshv_debugfs_partition_create(partition); 1563 if (ret) 1564 goto finalize_partition; 1565 1566 partition->pt_initialized = true; 1567 1568 return 0; 1569 1570 finalize_partition: 1571 hv_call_finalize_partition(partition->pt_id); 1572 withdraw_mem: 1573 hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); 1574 1575 return ret; 1576 } 1577 1578 static long 1579 mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) 1580 { 1581 struct mshv_partition *partition = filp->private_data; 1582 long ret; 1583 void __user *uarg = (void __user *)arg; 1584 1585 if (mutex_lock_killable(&partition->pt_mutex)) 1586 return -EINTR; 1587 1588 switch (ioctl) { 1589 case MSHV_INITIALIZE_PARTITION: 1590 ret = mshv_partition_ioctl_initialize(partition); 1591 break; 1592 case MSHV_SET_GUEST_MEMORY: 1593 ret = mshv_partition_ioctl_set_memory(partition, uarg); 1594 break; 1595 case MSHV_CREATE_VP: 1596 ret = mshv_partition_ioctl_create_vp(partition, uarg); 1597 break; 1598 case MSHV_IRQFD: 1599 ret = mshv_partition_ioctl_irqfd(partition, uarg); 1600 break; 1601 case MSHV_IOEVENTFD: 1602 ret = mshv_partition_ioctl_ioeventfd(partition, uarg); 1603 break; 1604 case MSHV_SET_MSI_ROUTING: 1605 ret = mshv_partition_ioctl_set_msi_routing(partition, uarg); 1606 break; 1607 case MSHV_GET_GPAP_ACCESS_BITMAP: 1608 ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition, 1609 uarg); 1610 break; 1611 case MSHV_ROOT_HVCALL: 1612 ret = mshv_ioctl_passthru_hvcall(partition, true, uarg); 1613 break; 1614 default: 1615 ret = -ENOTTY; 1616 } 1617 1618 mutex_unlock(&partition->pt_mutex); 1619 return ret; 1620 } 1621 1622 static int 1623 disable_vp_dispatch(struct mshv_vp *vp) 1624 { 1625 int ret; 1626 struct hv_register_assoc dispatch_suspend = { 1627 .name = HV_REGISTER_DISPATCH_SUSPEND, 1628 .value.dispatch_suspend.suspended = 1, 1629 }; 1630 1631 ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, 1632 1, &dispatch_suspend); 1633 if (ret) 1634 vp_err(vp, "failed to suspend\n"); 1635 1636 return ret; 1637 } 1638 1639 static int 1640 get_vp_signaled_count(struct mshv_vp *vp, u64 *count) 1641 { 1642 int ret; 1643 struct hv_register_assoc root_signal_count = { 1644 .name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT, 1645 }; 1646 1647 ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id, 1648 1, &root_signal_count); 1649 1650 if (ret) { 1651 vp_err(vp, "Failed to get root signal count"); 1652 *count = 0; 1653 return ret; 1654 } 1655 1656 *count = root_signal_count.value.reg64; 1657 1658 return ret; 1659 } 1660 1661 static void 1662 drain_vp_signals(struct mshv_vp *vp) 1663 { 1664 u64 hv_signal_count; 1665 u64 vp_signal_count; 1666 1667 get_vp_signaled_count(vp, &hv_signal_count); 1668 1669 vp_signal_count = atomic64_read(&vp->run.vp_signaled_count); 1670 1671 /* 1672 * There should be at most 1 outstanding notification, but be extra 1673 * careful anyway. 
1674 */ 1675 while (hv_signal_count != vp_signal_count) { 1676 WARN_ON(hv_signal_count - vp_signal_count != 1); 1677 1678 if (wait_event_interruptible(vp->run.vp_suspend_queue, 1679 vp->run.kicked_by_hv == 1)) 1680 break; 1681 vp->run.kicked_by_hv = 0; 1682 vp_signal_count = atomic64_read(&vp->run.vp_signaled_count); 1683 } 1684 } 1685 1686 static void drain_all_vps(const struct mshv_partition *partition) 1687 { 1688 int i; 1689 struct mshv_vp *vp; 1690 1691 /* 1692 * VPs are reachable from ISR. It is safe to not take the partition 1693 * lock because nobody else can enter this function and drop the 1694 * partition from the list. 1695 */ 1696 for (i = 0; i < MSHV_MAX_VPS; i++) { 1697 vp = partition->pt_vp_array[i]; 1698 if (!vp) 1699 continue; 1700 /* 1701 * Disable dispatching of the VP in the hypervisor. After this 1702 * the hypervisor guarantees it won't generate any signals for 1703 * the VP and the hypervisor's VP signal count won't change. 1704 */ 1705 disable_vp_dispatch(vp); 1706 drain_vp_signals(vp); 1707 } 1708 } 1709 1710 static void 1711 remove_partition(struct mshv_partition *partition) 1712 { 1713 spin_lock(&mshv_root.pt_ht_lock); 1714 hlist_del_rcu(&partition->pt_hnode); 1715 spin_unlock(&mshv_root.pt_ht_lock); 1716 1717 synchronize_rcu(); 1718 } 1719 1720 /* 1721 * Tear down a partition and remove it from the list. 1722 * Partition's refcount must be 0 1723 */ 1724 static void destroy_partition(struct mshv_partition *partition) 1725 { 1726 struct mshv_vp *vp; 1727 struct mshv_mem_region *region; 1728 struct hlist_node *n; 1729 int i; 1730 1731 if (refcount_read(&partition->pt_ref_count)) { 1732 pt_err(partition, 1733 "Attempt to destroy partition but refcount > 0\n"); 1734 return; 1735 } 1736 1737 if (partition->pt_initialized) { 1738 /* 1739 * We only need to drain signals for root scheduler. This should be 1740 * done before removing the partition from the partition list. 
1741 */ 1742 if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) 1743 drain_all_vps(partition); 1744 1745 /* Remove vps */ 1746 for (i = 0; i < MSHV_MAX_VPS; ++i) { 1747 vp = partition->pt_vp_array[i]; 1748 if (!vp) 1749 continue; 1750 1751 mshv_debugfs_vp_remove(vp); 1752 mshv_vp_stats_unmap(partition->pt_id, vp->vp_index, 1753 vp->vp_stats_pages); 1754 1755 if (vp->vp_register_page) { 1756 (void)hv_unmap_vp_state_page(partition->pt_id, 1757 vp->vp_index, 1758 HV_VP_STATE_PAGE_REGISTERS, 1759 virt_to_page(vp->vp_register_page), 1760 input_vtl_zero); 1761 vp->vp_register_page = NULL; 1762 } 1763 1764 (void)hv_unmap_vp_state_page(partition->pt_id, 1765 vp->vp_index, 1766 HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, 1767 virt_to_page(vp->vp_intercept_msg_page), 1768 input_vtl_zero); 1769 vp->vp_intercept_msg_page = NULL; 1770 1771 if (vp->vp_ghcb_page) { 1772 (void)hv_unmap_vp_state_page(partition->pt_id, 1773 vp->vp_index, 1774 HV_VP_STATE_PAGE_GHCB, 1775 virt_to_page(vp->vp_ghcb_page), 1776 input_vtl_normal); 1777 vp->vp_ghcb_page = NULL; 1778 } 1779 1780 kfree(vp); 1781 1782 partition->pt_vp_array[i] = NULL; 1783 } 1784 1785 mshv_debugfs_partition_remove(partition); 1786 1787 /* Deallocates and unmaps everything including vcpus, GPA mappings etc */ 1788 hv_call_finalize_partition(partition->pt_id); 1789 1790 partition->pt_initialized = false; 1791 } 1792 1793 remove_partition(partition); 1794 1795 hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions, 1796 hnode) { 1797 hlist_del(®ion->hnode); 1798 mshv_region_put(region); 1799 } 1800 1801 /* Withdraw and free all pages we deposited */ 1802 hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); 1803 hv_call_delete_partition(partition->pt_id); 1804 1805 mshv_free_routing_table(partition); 1806 kfree(partition); 1807 } 1808 1809 struct 1810 mshv_partition *mshv_partition_get(struct mshv_partition *partition) 1811 { 1812 if (refcount_inc_not_zero(&partition->pt_ref_count)) 1813 return partition; 1814 return NULL; 1815 } 1816 1817 struct 1818 mshv_partition *mshv_partition_find(u64 partition_id) 1819 __must_hold(RCU) 1820 { 1821 struct mshv_partition *p; 1822 1823 hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode, 1824 partition_id) 1825 if (p->pt_id == partition_id) 1826 return p; 1827 1828 return NULL; 1829 } 1830 1831 void 1832 mshv_partition_put(struct mshv_partition *partition) 1833 { 1834 if (refcount_dec_and_test(&partition->pt_ref_count)) 1835 destroy_partition(partition); 1836 } 1837 1838 static int 1839 mshv_partition_release(struct inode *inode, struct file *filp) 1840 { 1841 struct mshv_partition *partition = filp->private_data; 1842 1843 mshv_eventfd_release(partition); 1844 1845 cleanup_srcu_struct(&partition->pt_irq_srcu); 1846 1847 mshv_partition_put(partition); 1848 1849 return 0; 1850 } 1851 1852 static int 1853 add_partition(struct mshv_partition *partition) 1854 { 1855 spin_lock(&mshv_root.pt_ht_lock); 1856 1857 hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode, 1858 partition->pt_id); 1859 1860 spin_unlock(&mshv_root.pt_ht_lock); 1861 1862 return 0; 1863 } 1864 1865 static_assert(MSHV_NUM_CPU_FEATURES_BANKS == 1866 HV_PARTITION_PROCESSOR_FEATURES_BANKS); 1867 1868 static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags, 1869 struct hv_partition_creation_properties *cr_props, 1870 union hv_partition_isolation_properties *isol_props) 1871 { 1872 int i; 1873 struct mshv_create_partition_v2 args; 1874 union hv_partition_processor_features *disabled_procs; 1875 union 
hv_partition_processor_xsave_features *disabled_xsave; 1876 1877 /* First, copy v1 struct in case user is on previous versions */ 1878 if (copy_from_user(&args, user_arg, 1879 sizeof(struct mshv_create_partition))) 1880 return -EFAULT; 1881 1882 if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || 1883 args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) 1884 return -EINVAL; 1885 1886 disabled_procs = &cr_props->disabled_processor_features; 1887 disabled_xsave = &cr_props->disabled_processor_xsave_features; 1888 1889 /* Check if user provided newer struct with feature fields */ 1890 if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) { 1891 if (copy_from_user(&args, user_arg, sizeof(args))) 1892 return -EFAULT; 1893 1894 /* Re-validate v1 fields after second copy_from_user() */ 1895 if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || 1896 args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) 1897 return -EINVAL; 1898 1899 if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS || 1900 mshv_field_nonzero(args, pt_rsvd) || 1901 mshv_field_nonzero(args, pt_rsvd1)) 1902 return -EINVAL; 1903 1904 /* 1905 * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never 1906 * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS 1907 * (i.e. 2). 1908 * 1909 * Further banks (index >= 2) will be modifiable as 'early' 1910 * properties via the set partition property hypercall. 1911 */ 1912 for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) 1913 disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i]; 1914 1915 #if IS_ENABLED(CONFIG_X86_64) 1916 disabled_xsave->as_uint64 = args.pt_disabled_xsave; 1917 #else 1918 /* 1919 * In practice this field is ignored on arm64, but safer to 1920 * zero it in case it is ever used. 1921 */ 1922 disabled_xsave->as_uint64 = 0; 1923 1924 if (mshv_field_nonzero(args, pt_rsvd2)) 1925 return -EINVAL; 1926 #endif 1927 } else { 1928 /* 1929 * v1 behavior: try to enable everything. The hypervisor will 1930 * disable features that are not supported. The banks can be 1931 * queried via the get partition property hypercall. 
1932 */ 1933 for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) 1934 disabled_procs->as_uint64[i] = 0; 1935 1936 disabled_xsave->as_uint64 = 0; 1937 } 1938 1939 /* Only support EXO partitions */ 1940 *pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION | 1941 HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED; 1942 1943 if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC)) 1944 *pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED; 1945 if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC)) 1946 *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; 1947 if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES)) 1948 *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; 1949 if (args.pt_flags & BIT(MSHV_PT_BIT_NESTED_VIRTUALIZATION)) 1950 *pt_flags |= HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE; 1951 if (args.pt_flags & BIT(MSHV_PT_BIT_SMT_ENABLED_GUEST)) 1952 *pt_flags |= HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST; 1953 1954 isol_props->as_uint64 = 0; 1955 1956 switch (args.pt_isolation) { 1957 case MSHV_PT_ISOLATION_NONE: 1958 isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE; 1959 break; 1960 } 1961 1962 return 0; 1963 } 1964 1965 static long 1966 mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) 1967 { 1968 u64 creation_flags; 1969 struct hv_partition_creation_properties creation_properties; 1970 union hv_partition_isolation_properties isolation_properties; 1971 struct mshv_partition *partition; 1972 long ret; 1973 1974 ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags, 1975 &creation_properties, 1976 &isolation_properties); 1977 if (ret) 1978 return ret; 1979 1980 partition = kzalloc(sizeof(*partition), GFP_KERNEL); 1981 if (!partition) 1982 return -ENOMEM; 1983 1984 partition->pt_module_dev = module_dev; 1985 partition->isolation_type = isolation_properties.isolation_type; 1986 1987 refcount_set(&partition->pt_ref_count, 1); 1988 1989 mutex_init(&partition->pt_mutex); 1990 1991 mutex_init(&partition->pt_irq_lock); 1992 1993 init_completion(&partition->async_hypercall); 1994 1995 INIT_HLIST_HEAD(&partition->irq_ack_notifier_list); 1996 1997 INIT_HLIST_HEAD(&partition->pt_devices); 1998 1999 spin_lock_init(&partition->pt_mem_regions_lock); 2000 INIT_HLIST_HEAD(&partition->pt_mem_regions); 2001 2002 mshv_eventfd_init(partition); 2003 2004 ret = init_srcu_struct(&partition->pt_irq_srcu); 2005 if (ret) 2006 goto free_partition; 2007 2008 ret = hv_call_create_partition(creation_flags, 2009 creation_properties, 2010 isolation_properties, 2011 &partition->pt_id); 2012 if (ret) 2013 goto cleanup_irq_srcu; 2014 2015 ret = add_partition(partition); 2016 if (ret) 2017 goto delete_partition; 2018 2019 ret = mshv_init_async_handler(partition); 2020 if (!ret) { 2021 ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition", 2022 &mshv_partition_fops, 2023 partition, O_RDWR)); 2024 if (ret >= 0) 2025 return ret; 2026 } 2027 remove_partition(partition); 2028 delete_partition: 2029 hv_call_delete_partition(partition->pt_id); 2030 cleanup_irq_srcu: 2031 cleanup_srcu_struct(&partition->pt_irq_srcu); 2032 free_partition: 2033 kfree(partition); 2034 2035 return ret; 2036 } 2037 2038 static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, 2039 unsigned long arg) 2040 { 2041 struct miscdevice *misc = filp->private_data; 2042 2043 switch (ioctl) { 2044 case MSHV_CREATE_PARTITION: 2045 return mshv_ioctl_create_partition((void __user *)arg, 2046 misc->this_device); 2047 case MSHV_ROOT_HVCALL: 2048 return 
static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
			   unsigned long arg)
{
	struct miscdevice *misc = filp->private_data;

	switch (ioctl) {
	case MSHV_CREATE_PARTITION:
		return mshv_ioctl_create_partition((void __user *)arg,
						   misc->this_device);
	case MSHV_ROOT_HVCALL:
		return mshv_ioctl_passthru_hvcall(NULL, false,
						  (void __user *)arg);
	}

	return -ENOTTY;
}

static int
mshv_dev_open(struct inode *inode, struct file *filp)
{
	return 0;
}

static int
mshv_dev_release(struct inode *inode, struct file *filp)
{
	return 0;
}

static int mshv_cpuhp_online;
static int mshv_root_sched_online;

static const char *scheduler_type_to_string(enum hv_scheduler_type type)
{
	switch (type) {
	case HV_SCHEDULER_TYPE_LP:
		return "classic scheduler without SMT";
	case HV_SCHEDULER_TYPE_LP_SMT:
		return "classic scheduler with SMT";
	case HV_SCHEDULER_TYPE_CORE_SMT:
		return "core scheduler";
	case HV_SCHEDULER_TYPE_ROOT:
		return "root scheduler";
	default:
		return "unknown scheduler";
	}
}

static int __init l1vh_retrieve_scheduler_type(enum hv_scheduler_type *out)
{
	u64 integrated_sched_enabled;
	int ret;

	*out = HV_SCHEDULER_TYPE_CORE_SMT;

	if (!mshv_root.vmm_caps.vmm_enable_integrated_scheduler)
		return 0;

	ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
						HV_PARTITION_PROPERTY_INTEGRATED_SCHEDULER_ENABLED,
						0, &integrated_sched_enabled,
						sizeof(integrated_sched_enabled));
	if (ret)
		return ret;

	if (integrated_sched_enabled)
		*out = HV_SCHEDULER_TYPE_ROOT;

	return 0;
}

/* TODO move this to hv_common.c when needed outside */
static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out)
{
	struct hv_input_get_system_property *input;
	struct hv_output_get_system_property *output;
	unsigned long flags;
	u64 status;

	local_irq_save(flags);
	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	output = *this_cpu_ptr(hyperv_pcpu_output_arg);

	memset(input, 0, sizeof(*input));
	memset(output, 0, sizeof(*output));
	input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE;

	status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
	if (!hv_result_success(status)) {
		local_irq_restore(flags);
		pr_err("%s: %s\n", __func__, hv_result_to_string(status));
		return hv_result_to_errno(status);
	}

	*out = output->scheduler_type;
	local_irq_restore(flags);

	return 0;
}

/* Retrieve and stash the supported scheduler type */
static int __init mshv_retrieve_scheduler_type(struct device *dev)
{
	int ret;

	if (hv_l1vh_partition())
		ret = l1vh_retrieve_scheduler_type(&hv_scheduler_type);
	else
		ret = hv_retrieve_scheduler_type(&hv_scheduler_type);
	if (ret)
		return ret;

	dev_info(dev, "Hypervisor using %s\n",
		 scheduler_type_to_string(hv_scheduler_type));

	switch (hv_scheduler_type) {
	case HV_SCHEDULER_TYPE_CORE_SMT:
	case HV_SCHEDULER_TYPE_LP_SMT:
	case HV_SCHEDULER_TYPE_ROOT:
	case HV_SCHEDULER_TYPE_LP:
		/* Supported scheduler, nothing to do */
		break;
	default:
		dev_err(dev, "unsupported scheduler 0x%x, bailing.\n",
			hv_scheduler_type);
		return -EOPNOTSUPP;
	}

	return 0;
}

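/*
 * Per-cpu hypercall argument pages used when the hypervisor runs the root
 * scheduler.  mshv_root_scheduler_init()/mshv_root_scheduler_cleanup() are
 * the CPU hotplug callbacks that allocate and free one input page and one
 * output page per CPU; root_scheduler_init()/root_scheduler_deinit() set up
 * and tear down the per-cpu pointers and the hotplug state.
 */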
static int mshv_root_scheduler_init(unsigned int cpu)
{
	void **inputarg, **outputarg, *p;

	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
	outputarg = (void **)this_cpu_ptr(root_scheduler_output);

	/* Allocate two consecutive pages. One for input, one for output. */
	p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	*inputarg = p;
	*outputarg = (char *)p + HV_HYP_PAGE_SIZE;

	return 0;
}

static int mshv_root_scheduler_cleanup(unsigned int cpu)
{
	void *p, **inputarg, **outputarg;

	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
	outputarg = (void **)this_cpu_ptr(root_scheduler_output);

	p = *inputarg;

	*inputarg = NULL;
	*outputarg = NULL;

	kfree(p);

	return 0;
}

/* Must be called after retrieving the scheduler type */
static int
root_scheduler_init(struct device *dev)
{
	int ret;

	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
		return 0;

	root_scheduler_input = alloc_percpu(void *);
	root_scheduler_output = alloc_percpu(void *);

	if (!root_scheduler_input || !root_scheduler_output) {
		dev_err(dev, "Failed to allocate root scheduler buffers\n");
		ret = -ENOMEM;
		goto out;
	}

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched",
				mshv_root_scheduler_init,
				mshv_root_scheduler_cleanup);

	if (ret < 0) {
		dev_err(dev, "Failed to setup root scheduler state: %i\n", ret);
		goto out;
	}

	mshv_root_sched_online = ret;

	return 0;

out:
	free_percpu(root_scheduler_input);
	free_percpu(root_scheduler_output);
	return ret;
}

static void
root_scheduler_deinit(void)
{
	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
		return;

	cpuhp_remove_state(mshv_root_sched_online);
	free_percpu(root_scheduler_input);
	free_percpu(root_scheduler_output);
}

static int mshv_reboot_notify(struct notifier_block *nb,
			      unsigned long code, void *unused)
{
	cpuhp_remove_state(mshv_cpuhp_online);
	return 0;
}

struct notifier_block mshv_reboot_nb = {
	.notifier_call = mshv_reboot_notify,
};

static void mshv_root_partition_exit(void)
{
	unregister_reboot_notifier(&mshv_reboot_nb);
}

static int __init mshv_root_partition_init(struct device *dev)
{
	return register_reboot_notifier(&mshv_reboot_nb);
}

static int __init mshv_init_vmm_caps(struct device *dev)
{
	int ret;

	ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
						HV_PARTITION_PROPERTY_VMM_CAPABILITIES,
						0, &mshv_root.vmm_caps,
						sizeof(mshv_root.vmm_caps));
	if (ret && hv_l1vh_partition()) {
		dev_err(dev, "Failed to get VMM capabilities: %d\n", ret);
		return ret;
	}

	dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]);

	return 0;
}

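/*
 * Module entry point.  Ordering matters: register the misc device first so
 * its struct device is available for logging, then allocate the per-cpu
 * SynIC pages and hotplug callbacks, query VMM capabilities and the
 * scheduler type, and only then bring up the root scheduler buffers,
 * debugfs and the irqfd workqueue before installing mshv_isr via
 * hv_setup_mshv_handler().
 */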
static int __init mshv_parent_partition_init(void)
{
	int ret;
	struct device *dev;
	union hv_hypervisor_version_info version_info;

	if (!hv_parent_partition() || is_kdump_kernel())
		return -ENODEV;

	if (hv_get_hypervisor_version(&version_info))
		return -ENODEV;

	ret = misc_register(&mshv_dev);
	if (ret)
		return ret;

	dev = mshv_dev.this_device;

	if (version_info.build_number < MSHV_HV_MIN_VERSION ||
	    version_info.build_number > MSHV_HV_MAX_VERSION) {
		dev_err(dev, "Running on unvalidated Hyper-V version\n");
		dev_err(dev, "Versions: current: %u min: %u max: %u\n",
			version_info.build_number, MSHV_HV_MIN_VERSION,
			MSHV_HV_MAX_VERSION);
	}

	mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages);
	if (!mshv_root.synic_pages) {
		dev_err(dev, "Failed to allocate percpu synic page\n");
		ret = -ENOMEM;
		goto device_deregister;
	}

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
				mshv_synic_init,
				mshv_synic_cleanup);
	if (ret < 0) {
		dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret);
		goto free_synic_pages;
	}

	mshv_cpuhp_online = ret;

	ret = mshv_init_vmm_caps(dev);
	if (ret)
		goto remove_cpu_state;

	ret = mshv_retrieve_scheduler_type(dev);
	if (ret)
		goto remove_cpu_state;

	if (hv_root_partition())
		ret = mshv_root_partition_init(dev);
	if (ret)
		goto remove_cpu_state;

	ret = root_scheduler_init(dev);
	if (ret)
		goto exit_partition;

	ret = mshv_debugfs_init();
	if (ret)
		goto deinit_root_scheduler;

	ret = mshv_irqfd_wq_init();
	if (ret)
		goto exit_debugfs;

	spin_lock_init(&mshv_root.pt_ht_lock);
	hash_init(mshv_root.pt_htable);

	hv_setup_mshv_handler(mshv_isr);

	return 0;

exit_debugfs:
	mshv_debugfs_exit();
deinit_root_scheduler:
	root_scheduler_deinit();
exit_partition:
	if (hv_root_partition())
		mshv_root_partition_exit();
remove_cpu_state:
	cpuhp_remove_state(mshv_cpuhp_online);
free_synic_pages:
	free_percpu(mshv_root.synic_pages);
device_deregister:
	misc_deregister(&mshv_dev);
	return ret;
}

static void __exit mshv_parent_partition_exit(void)
{
	hv_setup_mshv_handler(NULL);
	mshv_port_table_fini();
	mshv_debugfs_exit();
	misc_deregister(&mshv_dev);
	mshv_irqfd_wq_cleanup();
	root_scheduler_deinit();
	if (hv_root_partition())
		mshv_root_partition_exit();
	cpuhp_remove_state(mshv_cpuhp_online);
	free_percpu(mshv_root.synic_pages);
}

module_init(mshv_parent_partition_init);
module_exit(mshv_parent_partition_exit);