// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024, Microsoft Corporation.
 *
 * The main part of the mshv_root module, providing APIs to create
 * and manage guest partitions.
 *
 * Authors: Microsoft Linux virtualization team
 */

#include <linux/entry-virt.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/cpuhotplug.h>
#include <linux/random.h>
#include <asm/mshyperv.h>
#include <linux/hyperv.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/kexec.h>
#include <linux/page-flags.h>
#include <linux/crash_dump.h>
#include <linux/panic_notifier.h>
#include <linux/vmalloc.h>
#include <linux/rseq.h>

#include "mshv_eventfd.h"
#include "mshv.h"
#include "mshv_root.h"

MODULE_AUTHOR("Microsoft");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");

/*
 * HV_THREAD_COUNTER
 * Index of the "dispatch thread blocked" counter within the hypervisor's
 * per-VP stats page data; the index is architecture specific.
 */
#if defined(CONFIG_X86_64)
#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 202
#elif defined(CONFIG_ARM64)
#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 95
#endif

struct mshv_root mshv_root;

enum hv_scheduler_type hv_scheduler_type;

/*
 * Per-cpu hypercall input/output pages used by the root scheduler path.
 * Once we implement the fast extended hypercall ABI they can go away.
 */
static void * __percpu *root_scheduler_input;
static void * __percpu *root_scheduler_output;

/* Forward declarations for the file_operations tables below. */
static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_dev_open(struct inode *inode, struct file *filp);
static int mshv_dev_release(struct inode *inode, struct file *filp);
static int mshv_vp_release(struct inode *inode, struct file *filp);
static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_partition_release(struct inode *inode, struct file *filp);
static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma);
static vm_fault_t mshv_vp_fault(struct vm_fault *vmf);
static int mshv_init_async_handler(struct mshv_partition *partition);
static void mshv_async_hvcall_handler(void *data, u64 *status);

/* VTL0 target (all zeroes) and the "normal" VTL, for hypercall inputs. */
static const union hv_input_vtl input_vtl_zero;
static const union hv_input_vtl input_vtl_normal = {
	.target_vtl = HV_NORMAL_VTL,
	.use_target_vtl = 1,
};

static const struct vm_operations_struct mshv_vp_vm_ops = {
	.fault = mshv_vp_fault,
};

/* fops for the per-VP fd returned by MSHV_CREATE_VP */
static const struct file_operations mshv_vp_fops = {
	.owner = THIS_MODULE,
	.release = mshv_vp_release,
	.unlocked_ioctl = mshv_vp_ioctl,
	.llseek = noop_llseek,
	.mmap = mshv_vp_mmap,
};

/* fops for the per-partition fd */
static const struct file_operations mshv_partition_fops = {
	.owner = THIS_MODULE,
	.release = mshv_partition_release,
	.unlocked_ioctl = mshv_partition_ioctl,
	.llseek = noop_llseek,
};

/* fops for /dev/mshv itself */
static const struct file_operations mshv_dev_fops = {
	.owner = THIS_MODULE,
	.open = mshv_dev_open,
	.release = mshv_dev_release,
	.unlocked_ioctl = mshv_dev_ioctl,
	.llseek = noop_llseek,
};

static struct miscdevice mshv_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "mshv",
	.fops = &mshv_dev_fops,
	.mode = 0600,
};

/*
 * Only allow hypercalls that have a u64 partition id as the first member of
 * the input structure.
 * These are sorted by value.
 */
static u16 mshv_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_GET_PARTITION_PROPERTY_EX,
	HVCALL_SET_PARTITION_PROPERTY,
	HVCALL_INSTALL_INTERCEPT,
	HVCALL_GET_VP_REGISTERS,
	HVCALL_SET_VP_REGISTERS,
	HVCALL_TRANSLATE_VIRTUAL_ADDRESS,
	HVCALL_CLEAR_VIRTUAL_INTERRUPT,
	HVCALL_REGISTER_INTERCEPT_RESULT,
	HVCALL_ASSERT_VIRTUAL_INTERRUPT,
	HVCALL_GET_GPA_PAGES_ACCESS_STATES,
	HVCALL_SIGNAL_EVENT_DIRECT,
	HVCALL_POST_MESSAGE_DIRECT,
	HVCALL_GET_VP_CPUID_VALUES,
};

/*
 * Only allow hypercalls that are safe to be called by the VMM with the host
 * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a
 * hypercall cannot be misused by the VMM before adding it to this list.
 */
static u16 mshv_self_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_GET_PARTITION_PROPERTY_EX,
};

/* Hypercalls that may complete asynchronously (HV_STATUS_CALL_PENDING). */
static bool mshv_hvcall_is_async(u16 code)
{
	switch (code) {
	case HVCALL_SET_PARTITION_PROPERTY:
		return true;
	default:
		break;
	}
	return false;
}

/*
 * Check @code against the allow-list for the target: the (smaller) self
 * list when targeting HV_PARTITION_ID_SELF, the full list otherwise.
 */
static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id)
{
	int i;
	int n = ARRAY_SIZE(mshv_passthru_hvcalls);
	u16 *allowed_hvcalls = mshv_passthru_hvcalls;

	if (pt_id == HV_PARTITION_ID_SELF) {
		n = ARRAY_SIZE(mshv_self_passthru_hvcalls);
		allowed_hvcalls = mshv_self_passthru_hvcalls;
	}

	for (i = 0; i < n; ++i)
		if (allowed_hvcalls[i] == code)
			return true;

	return false;
}

/*
 * Forward an allow-listed hypercall from the VMM to the hypervisor on
 * behalf of @partition (or the host partition when @partition is NULL).
 * Input/output are bounced through freshly allocated pages; the first u64
 * of the input is overwritten with the target partition id.
 *
 * Returns 0 on success or a negative errno; the raw hypervisor status and
 * completed rep count are reported back through @user_args.
 */
static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
				      bool partition_locked,
				      void __user *user_args)
{
	u64 status;
	int ret = 0;
	bool is_async;
	struct mshv_root_hvcall args;
	struct page *page;
	unsigned int pages_order;
	void *input_pg = NULL;
	void *output_pg = NULL;
	u16 reps_completed;
	u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) ||
	    mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE)
		return -EINVAL;

	if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
		return -EINVAL;

	if (!mshv_passthru_hvcall_allowed(args.code, pt_id))
		return -EINVAL;

	is_async = mshv_hvcall_is_async(args.code);
	if (is_async) {
		/* async hypercalls can only be called from partition fd */
		if (!partition || !partition_locked)
			return -EINVAL;
		ret = mshv_init_async_handler(partition);
		if (ret)
			return ret;
	}

	/* One page for input only; two (order 1) when output is requested. */
	pages_order = args.out_ptr ? 1 : 0;
	page = alloc_pages(GFP_KERNEL, pages_order);
	if (!page)
		return -ENOMEM;
	input_pg = page_address(page);

	if (args.out_ptr)
		output_pg = (char *)input_pg + PAGE_SIZE;
	else
		output_pg = NULL;

	if (copy_from_user(input_pg, (void __user *)args.in_ptr,
			   args.in_sz)) {
		ret = -EFAULT;
		goto free_pages_out;
	}

	/*
	 * NOTE: This only works because all the allowed hypercalls' input
	 * structs begin with a u64 partition_id field.
	 */
	*(u64 *)input_pg = pt_id;

	reps_completed = 0;
	do {
		if (args.reps) {
			status = hv_do_rep_hypercall_ex(args.code, args.reps,
							0, reps_completed,
							input_pg, output_pg);
			reps_completed = hv_repcomp(status);
		} else {
			status = hv_do_hypercall(args.code, input_pg, output_pg);
		}

		if (hv_result(status) == HV_STATUS_CALL_PENDING) {
			if (is_async) {
				mshv_async_hvcall_handler(partition, &status);
			} else { /* Paranoia check. This shouldn't happen! */
				ret = -EBADFD;
				goto free_pages_out;
			}
		}

		if (hv_result_success(status))
			break;

		/* Deposit memory and retry when the hypervisor asks for it. */
		if (!hv_result_needs_memory(status))
			ret = hv_result_to_errno(status);
		else
			ret = hv_deposit_memory(pt_id, status);
	} while (!ret);

	/* Always report the final status/rep count back to the VMM. */
	args.status = hv_result(status);
	args.reps = reps_completed;
	if (copy_to_user(user_args, &args, sizeof(args)))
		ret = -EFAULT;

	if (!ret && output_pg &&
	    copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
		ret = -EFAULT;

free_pages_out:
	free_pages((unsigned long)input_pg, pages_order);

	return ret;
}

/* GHCB page mapping is an x86-only extended feature. */
static inline bool is_ghcb_mapping_available(void)
{
#if IS_ENABLED(CONFIG_X86_64)
	return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE;
#else
	return 0;
#endif
}

/* Get VP registers, always targeting VTL0. */
static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_get_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}

/* Set VP registers, always targeting VTL0. */
static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_set_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}

/*
 * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by
 * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend,
 * done by the hypervisor.
 * "Intercept" suspend leads to asynchronous message delivery to dom0 which
 * should be awaited to keep the VP loop consistent (i.e. no message pending
 * upon VP resume).
 * VP intercept suspend can't be done when the VP is explicitly suspended
 * already, and thus can be only two possible race scenarios:
 *   1. implicit suspend bit set -> explicit suspend bit set -> message sent
 *   2.
implicit suspend bit set -> message sent -> explicit suspend bit set
 * Checking for implicit suspend bit set after explicit suspend request has
 * succeeded in either case allows us to reliably identify, if there is a
 * message to receive and deliver to VMM.
 */
static int
mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
{
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND
	};
	struct hv_register_assoc intercept_suspend = {
		.name = HV_REGISTER_INTERCEPT_SUSPEND
	};
	union hv_explicit_suspend_register *es =
		&explicit_suspend.value.explicit_suspend;
	union hv_intercept_suspend_register *is =
		&intercept_suspend.value.intercept_suspend;
	int ret;

	/* Request the explicit suspend first... */
	es->suspended = 1;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &explicit_suspend);
	if (ret) {
		vp_err(vp, "Failed to explicitly suspend vCPU\n");
		return ret;
	}

	/* ...then check whether an intercept suspend raced with it. */
	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &intercept_suspend);
	if (ret) {
		vp_err(vp, "Failed to get intercept suspend state\n");
		return ret;
	}

	*message_in_flight = is->suspended;

	return 0;
}

/*
 * This function is used when VPs are scheduled by the hypervisor's
 * scheduler.
 *
 * Caller has to make sure the registers contain cleared
 * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers
 * exactly in this order (the hypervisor clears them sequentially) to avoid
 * potential invalid clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND
 * after VP is released from HV_REGISTER_EXPLICIT_SUSPEND in case of the
 * opposite order.
 */
static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
{
	long ret;
	struct hv_register_assoc suspend_regs[2] = {
		{ .name = HV_REGISTER_INTERCEPT_SUSPEND },
		{ .name = HV_REGISTER_EXPLICIT_SUSPEND }
	};
	size_t count = ARRAY_SIZE(suspend_regs);

	/* Resume VP execution */
	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    count, suspend_regs);
	if (ret) {
		vp_err(vp, "Failed to resume vp execution. %lx\n", ret);
		return ret;
	}

	/* Block until the hypervisor kicks us or a signal arrives. */
	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       vp->run.kicked_by_hv == 1);
	if (ret) {
		bool message_in_flight;

		/*
		 * Otherwise the waiting was interrupted by a signal: suspend
		 * the vCPU explicitly and copy message in flight (if any).
		 */
		ret = mshv_suspend_vp(vp, &message_in_flight);
		if (ret)
			return ret;

		/* Return if no message in flight */
		if (!message_in_flight)
			return -EINTR;

		/* Wait for the message in flight. */
		wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1);
	}

	/*
	 * Reset the flag to make the wait_event call above work
	 * next time.
	 */
	vp->run.kicked_by_hv = 0;

	return 0;
}

/*
 * Dispatch the VP via HVCALL_DISPATCH_VP using the per-cpu hypercall
 * input/output pages; preemption is disabled while those pages are in
 * use. The hypervisor's dispatch output is copied to @res.
 */
static int
mshv_vp_dispatch(struct mshv_vp *vp, u32 flags,
		 struct hv_output_dispatch_vp *res)
{
	struct hv_input_dispatch_vp *input;
	struct hv_output_dispatch_vp *output;
	u64 status;

	preempt_disable();
	input = *this_cpu_ptr(root_scheduler_input);
	output = *this_cpu_ptr(root_scheduler_output);

	memset(input, 0, sizeof(*input));
	memset(output, 0, sizeof(*output));

	input->partition_id = vp->vp_partition->pt_id;
	input->vp_index = vp->vp_index;
	input->time_slice = 0; /* Run forever until something happens */
	input->spec_ctrl = 0; /* TODO: set sensible flags */
	input->flags = flags;

	vp->run.flags.root_sched_dispatched = 1;
	status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output);
	vp->run.flags.root_sched_dispatched = 0;

	*res = *output;
	preempt_enable();

	if (!hv_result_success(status))
		vp_err(vp, "%s: status %s\n", __func__,
		       hv_result_to_string(status));

	return hv_result_to_errno(status);
}

/* Clear the explicit-suspend register so the VP can be dispatched again. */
static int
mshv_vp_clear_explicit_suspend(struct mshv_vp *vp)
{
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND,
		.value.explicit_suspend.suspended = 0,
	};
	int ret;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &explicit_suspend);

	if (ret)
		vp_err(vp, "Failed to unsuspend\n");

	return ret;
}

#if IS_ENABLED(CONFIG_X86_64)
/* Pending interrupt vectors, read from the shared VP register page (if mapped). */
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	if (!vp->vp_register_page)
		return 0;
	return vp->vp_register_page->interrupt_vectors.as_uint64;
}
#else
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	return 0;
}
#endif

/* True if either stats area reports this VP's dispatch thread as blocked. */
static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp)
{
	struct hv_stats_page **stats = vp->vp_stats_pages;
	u64
*self_vp_cntrs = stats[HV_STATS_AREA_SELF]->data;
	u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->data;

	return parent_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED] ||
	       self_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED];
}

/*
 * Sleep until the hypervisor kicks the VP (and its dispatch thread is no
 * longer reported blocked) or an interrupt is pending for it.
 * Returns 0 on wakeup, -EINTR when interrupted by a signal.
 */
static int
mshv_vp_wait_for_hv_kick(struct mshv_vp *vp)
{
	int ret;

	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       (vp->run.kicked_by_hv == 1 &&
					!mshv_vp_dispatch_thread_blocked(vp)) ||
				       mshv_vp_interrupt_pending(vp));
	if (ret)
		return -EINTR;

	vp->run.flags.root_sched_blocked = 0;
	vp->run.kicked_by_hv = 0;

	return 0;
}

/* Must be called with interrupts enabled */
static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
{
	long ret;

	if (vp->run.flags.root_sched_blocked) {
		/*
		 * Dispatch state of this VP is blocked. Need to wait
		 * for the hypervisor to clear the blocked state before
		 * dispatching it.
		 */
		ret = mshv_vp_wait_for_hv_kick(vp);
		if (ret)
			return ret;
	}

	/* Keep dispatching until an intercept must be delivered to the VMM. */
	do {
		u32 flags = 0;
		struct hv_output_dispatch_vp output;

		if (__xfer_to_guest_mode_work_pending()) {
			ret = xfer_to_guest_mode_handle_work();
			if (ret)
				break;
		}

		if (vp->run.flags.intercept_suspend)
			flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND;

		if (mshv_vp_interrupt_pending(vp))
			flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION;

		ret = mshv_vp_dispatch(vp, flags, &output);
		if (ret)
			break;

		vp->run.flags.intercept_suspend = 0;

		if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) {
			if (output.dispatch_event ==
			    HV_VP_DISPATCH_EVENT_SUSPEND) {
				/*
				 * TODO: remove the warning once VP canceling
				 * is supported
				 */
				WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count),
					  "%s: vp#%d: unexpected explicit suspend\n",
					  __func__, vp->vp_index);
				/*
				 * Need to clear explicit suspend before
				 * dispatching.
				 * Explicit suspend is either:
				 * - set right after the first VP dispatch or
				 * - set explicitly via hypercall
				 * Since the latter case is not yet supported,
				 * simply clear it here.
				 */
				ret = mshv_vp_clear_explicit_suspend(vp);
				if (ret)
					break;

				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			} else {
				vp->run.flags.root_sched_blocked = 1;
				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			}
		} else {
			/* HV_VP_DISPATCH_STATE_READY */
			if (output.dispatch_event ==
			    HV_VP_DISPATCH_EVENT_INTERCEPT)
				vp->run.flags.intercept_suspend = 1;
		}
	} while (!vp->run.flags.intercept_suspend);

	rseq_virt_userspace_exit();

	return ret;
}

static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
	      "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");

/* Linear scan of the partition's region list; caller holds the region lock. */
static struct mshv_mem_region *
mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
{
	struct mshv_mem_region *region;

	hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
		if (gfn >= region->start_gfn &&
		    gfn < region->start_gfn + region->nr_pages)
			return region;
	}

	return NULL;
}

/*
 * Like mshv_partition_region_by_gfn(), but takes the lock and acquires a
 * reference on the returned region (caller must mshv_region_put() it).
 */
static struct mshv_mem_region *
mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
{
	struct mshv_mem_region *region;

	spin_lock(&p->pt_mem_regions_lock);
	region = mshv_partition_region_by_gfn(p, gfn);
	if (!region || !mshv_region_get(region)) {
		spin_unlock(&p->pt_mem_regions_lock);
		return NULL;
	}
	spin_unlock(&p->pt_mem_regions_lock);

	return region;
}

/**
 * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts.
 * @vp: Pointer to the virtual processor structure.
621 * 622 * This function processes GPA intercepts by identifying the memory region 623 * corresponding to the intercepted GPA, aligning the page offset, and 624 * mapping the required pages. It ensures that the region is valid and 625 * handles faults efficiently by mapping multiple pages at once. 626 * 627 * Return: true if the intercept was handled successfully, false otherwise. 628 */ 629 static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) 630 { 631 struct mshv_partition *p = vp->vp_partition; 632 struct mshv_mem_region *region; 633 bool ret = false; 634 u64 gfn; 635 #if defined(CONFIG_X86_64) 636 struct hv_x64_memory_intercept_message *msg = 637 (struct hv_x64_memory_intercept_message *) 638 vp->vp_intercept_msg_page->u.payload; 639 #elif defined(CONFIG_ARM64) 640 struct hv_arm64_memory_intercept_message *msg = 641 (struct hv_arm64_memory_intercept_message *) 642 vp->vp_intercept_msg_page->u.payload; 643 #endif 644 enum hv_intercept_access_type access_type = 645 msg->header.intercept_access_type; 646 647 gfn = HVPFN_DOWN(msg->guest_physical_address); 648 649 region = mshv_partition_region_by_gfn_get(p, gfn); 650 if (!region) 651 return false; 652 653 if (access_type == HV_INTERCEPT_ACCESS_WRITE && 654 !(region->hv_map_flags & HV_MAP_GPA_WRITABLE)) 655 goto put_region; 656 657 if (access_type == HV_INTERCEPT_ACCESS_EXECUTE && 658 !(region->hv_map_flags & HV_MAP_GPA_EXECUTABLE)) 659 goto put_region; 660 661 /* Only movable memory ranges are supported for GPA intercepts */ 662 if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE) 663 ret = mshv_region_handle_gfn_fault(region, gfn); 664 665 put_region: 666 mshv_region_put(region); 667 668 return ret; 669 } 670 671 static bool mshv_vp_handle_intercept(struct mshv_vp *vp) 672 { 673 switch (vp->vp_intercept_msg_page->header.message_type) { 674 case HVMSG_GPA_INTERCEPT: 675 return mshv_handle_gpa_intercept(vp); 676 } 677 return false; 678 } 679 680 static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void 
__user *ret_msg) 681 { 682 long rc; 683 684 do { 685 if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) 686 rc = mshv_run_vp_with_root_scheduler(vp); 687 else 688 rc = mshv_run_vp_with_hyp_scheduler(vp); 689 } while (rc == 0 && mshv_vp_handle_intercept(vp)); 690 691 if (rc) 692 return rc; 693 694 if (copy_to_user(ret_msg, vp->vp_intercept_msg_page, 695 sizeof(struct hv_message))) 696 rc = -EFAULT; 697 698 return rc; 699 } 700 701 static int 702 mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp, 703 struct hv_vp_state_data state_data, 704 unsigned long user_pfn, size_t page_count, 705 bool is_set) 706 { 707 int completed, ret = 0; 708 unsigned long check; 709 struct page **pages; 710 711 if (page_count > INT_MAX) 712 return -EINVAL; 713 /* 714 * Check the arithmetic for wraparound/overflow. 715 * The last page address in the buffer is: 716 * (user_pfn + (page_count - 1)) * PAGE_SIZE 717 */ 718 if (check_add_overflow(user_pfn, (page_count - 1), &check)) 719 return -EOVERFLOW; 720 if (check_mul_overflow(check, PAGE_SIZE, &check)) 721 return -EOVERFLOW; 722 723 /* Pin user pages so hypervisor can copy directly to them */ 724 pages = kzalloc_objs(struct page *, page_count); 725 if (!pages) 726 return -ENOMEM; 727 728 for (completed = 0; completed < page_count; completed += ret) { 729 unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE; 730 int remaining = page_count - completed; 731 732 ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE, 733 &pages[completed]); 734 if (ret < 0) { 735 vp_err(vp, "%s: Failed to pin user pages error %i\n", 736 __func__, ret); 737 goto unpin_pages; 738 } 739 } 740 741 if (is_set) 742 ret = hv_call_set_vp_state(vp->vp_index, 743 vp->vp_partition->pt_id, 744 state_data, page_count, pages, 745 0, NULL); 746 else 747 ret = hv_call_get_vp_state(vp->vp_index, 748 vp->vp_partition->pt_id, 749 state_data, page_count, pages, 750 NULL); 751 752 unpin_pages: 753 unpin_user_pages(pages, completed); 754 kfree(pages); 755 return ret; 756 
}

/*
 * Get or set one unit of VP state (LAPIC, XSAVE, SIMP, SIEFP or synthetic
 * timers) on behalf of the VMM. The required buffer size for the chosen
 * state type is reported back through user_args->buf_sz before any
 * transfer; PFN-transferred state types are delegated to
 * mshv_vp_ioctl_get_set_state_pfn().
 */
static long
mshv_vp_ioctl_get_set_state(struct mshv_vp *vp,
			    struct mshv_get_set_vp_state __user *user_args,
			    bool is_set)
{
	struct mshv_get_set_vp_state args;
	long ret = 0;
	union hv_output_get_vp_state vp_state;
	u32 data_sz;
	struct hv_vp_state_data state_data = {};

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	/* Buffer must be non-empty and page aligned in size and address. */
	if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) ||
	    !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) ||
	    !PAGE_ALIGNED(args.buf_ptr))
		return -EINVAL;

	if (!access_ok((void __user *)args.buf_ptr, args.buf_sz))
		return -EFAULT;

	switch (args.type) {
	case MSHV_VP_STATE_LAPIC:
		state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_XSAVE:
	{
		u64 data_sz_64;

		/* XSAVE size/feature set is a partition property. */
		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_XSAVE_STATES,
						     &state_data.xsave.states.as_uint64);
		if (ret)
			return ret;

		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE,
						     &data_sz_64);
		if (ret)
			return ret;

		data_sz = (u32)data_sz_64;
		state_data.xsave.flags = 0;
		/* Always request legacy states */
		state_data.xsave.states.legacy_x87 = 1;
		state_data.xsave.states.legacy_sse = 1;
		state_data.type = HV_GET_SET_VP_STATE_XSAVE;
		break;
	}
	case MSHV_VP_STATE_SIMP:
		state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SIEFP:
		state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SYNTHETIC_TIMERS:
		state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS;
		data_sz = sizeof(vp_state.synthetic_timers_state);
		break;
	default:
		return -EINVAL;
	}

	/* Tell the VMM how large this state type actually is. */
	if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz)))
		return -EFAULT;

	if (data_sz > args.buf_sz)
		return -EINVAL;

	/* If the data is transmitted via pfns, delegate to helper */
	if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) {
		unsigned long user_pfn = PFN_DOWN(args.buf_ptr);
		size_t page_count = PFN_DOWN(args.buf_sz);

		return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn,
						       page_count, is_set);
	}

	/* Paranoia check - this shouldn't happen! */
	if (data_sz > sizeof(vp_state)) {
		vp_err(vp, "Invalid vp state data size!\n");
		return -EINVAL;
	}

	if (is_set) {
		if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz))
			return -EFAULT;

		return hv_call_set_vp_state(vp->vp_index,
					    vp->vp_partition->pt_id,
					    state_data, 0, NULL,
					    sizeof(vp_state), (u8 *)&vp_state);
	}

	ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id,
				   state_data, 0, NULL, &vp_state);
	if (ret)
		return ret;

	if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz))
		return -EFAULT;

	return 0;
}

/* ioctl dispatch for a VP fd; all VP ioctls serialize on the VP mutex. */
static long
mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct mshv_vp *vp = filp->private_data;
	long r = -ENOTTY;

	if (mutex_lock_killable(&vp->vp_mutex))
		return -EINTR;

	switch (ioctl) {
	case MSHV_RUN_VP:
		r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg);
		break;
	case MSHV_GET_VP_STATE:
		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false);
		break;
	case MSHV_SET_VP_STATE:
		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true);
		break;
	case MSHV_ROOT_HVCALL:
		r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false,
					       (void __user *)arg);
		break;
	default:
		vp_warn(vp, "Invalid ioctl: %#x\n", ioctl);
		break;
	}
	mutex_unlock(&vp->vp_mutex);

	return r;
}

static vm_fault_t mshv_vp_fault(struct
vm_fault *vmf)
{
	struct mshv_vp *vp = vmf->vma->vm_file->private_data;

	/* vm_pgoff selects which shared VP page backs this mapping. */
	switch (vmf->vma->vm_pgoff) {
	case MSHV_VP_MMAP_OFFSET_REGISTERS:
		vmf->page = virt_to_page(vp->vp_register_page);
		break;
	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
		vmf->page = virt_to_page(vp->vp_intercept_msg_page);
		break;
	case MSHV_VP_MMAP_OFFSET_GHCB:
		vmf->page = virt_to_page(vp->vp_ghcb_page);
		break;
	default:
		return VM_FAULT_SIGBUS;
	}

	get_page(vmf->page);

	return 0;
}

/* Validate that the requested VP page exists before allowing the mapping. */
static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct mshv_vp *vp = file->private_data;

	switch (vma->vm_pgoff) {
	case MSHV_VP_MMAP_OFFSET_REGISTERS:
		if (!vp->vp_register_page)
			return -ENODEV;
		break;
	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
		if (!vp->vp_intercept_msg_page)
			return -ENODEV;
		break;
	case MSHV_VP_MMAP_OFFSET_GHCB:
		if (!vp->vp_ghcb_page)
			return -ENODEV;
		break;
	default:
		return -EINVAL;
	}

	vma->vm_ops = &mshv_vp_vm_ops;
	return 0;
}

static int
mshv_vp_release(struct inode *inode, struct file *filp)
{
	struct mshv_vp *vp = filp->private_data;

	/* Rest of VP cleanup happens in destroy_partition() */
	mshv_partition_put(vp->vp_partition);
	return 0;
}

/*
 * Unmap a VP's hypervisor stats pages. The parent page is unmapped only
 * when it is distinct from the self page (see mshv_vp_stats_map(), which
 * may alias the two).
 */
void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
			 struct hv_stats_page *stats_pages[])
{
	union hv_stats_object_identity identity = {
		.vp.partition_id = partition_id,
		.vp.vp_index = vp_index,
	};
	int err;

	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
				  stats_pages[HV_STATS_AREA_SELF],
				  &identity);
	if (err)
		pr_err("%s: failed to unmap partition %llu vp %u self stats, err: %d\n",
		       __func__, partition_id, vp_index, err);

	if (stats_pages[HV_STATS_AREA_PARENT] != stats_pages[HV_STATS_AREA_SELF]) {
		identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
		err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
					  stats_pages[HV_STATS_AREA_PARENT],
					  &identity);
		if (err)
			pr_err("%s: failed to unmap partition %llu vp %u parent stats, err: %d\n",
			       __func__, partition_id, vp_index, err);
	}
}

/*
 * Map a VP's hypervisor stats pages (self and parent areas). When the
 * parent area is unavailable, the self page is aliased into the parent
 * slot so that callers can always read both entries.
 */
int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
		      struct hv_stats_page *stats_pages[])
{
	union hv_stats_object_identity identity = {
		.vp.partition_id = partition_id,
		.vp.vp_index = vp_index,
	};
	int err;

	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
				&stats_pages[HV_STATS_AREA_SELF]);
	if (err) {
		pr_err("%s: failed to map partition %llu vp %u self stats, err: %d\n",
		       __func__, partition_id, vp_index, err);
		return err;
	}

	/*
	 * L1VH partition cannot access its vp stats in parent area.
	 */
	if (is_l1vh_parent(partition_id)) {
		stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
	} else {
		identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
		err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
					&stats_pages[HV_STATS_AREA_PARENT]);
		if (err) {
			pr_err("%s: failed to map partition %llu vp %u parent stats, err: %d\n",
			       __func__, partition_id, vp_index, err);
			goto unmap_self;
		}
		if (!stats_pages[HV_STATS_AREA_PARENT])
			stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
	}

	return 0;

unmap_self:
	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	hv_unmap_stats_page(HV_STATS_OBJECT_VP,
			    stats_pages[HV_STATS_AREA_SELF],
			    &identity);
	return err;
}

/*
 * MSHV_CREATE_VP: create a VP in the partition, map its shared state and
 * stats pages, and return a new VP fd to the VMM.
 */
static long
mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
			       void __user *arg)
{
	struct mshv_create_vp args;
	struct mshv_vp *vp;
	struct page *intercept_msg_page, *register_page, *ghcb_page;
1038 struct hv_stats_page *stats_pages[2]; 1039 long ret; 1040 1041 if (copy_from_user(&args, arg, sizeof(args))) 1042 return -EFAULT; 1043 1044 if (args.vp_index >= MSHV_MAX_VPS) 1045 return -EINVAL; 1046 1047 if (partition->pt_vp_array[args.vp_index]) 1048 return -EEXIST; 1049 1050 ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index, 1051 0 /* Only valid for root partition VPs */); 1052 if (ret) 1053 return ret; 1054 1055 ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, 1056 HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, 1057 input_vtl_zero, &intercept_msg_page); 1058 if (ret) 1059 goto destroy_vp; 1060 1061 if (!mshv_partition_encrypted(partition)) { 1062 ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, 1063 HV_VP_STATE_PAGE_REGISTERS, 1064 input_vtl_zero, ®ister_page); 1065 if (ret) 1066 goto unmap_intercept_message_page; 1067 } 1068 1069 if (mshv_partition_encrypted(partition) && 1070 is_ghcb_mapping_available()) { 1071 ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, 1072 HV_VP_STATE_PAGE_GHCB, 1073 input_vtl_normal, &ghcb_page); 1074 if (ret) 1075 goto unmap_register_page; 1076 } 1077 1078 ret = mshv_vp_stats_map(partition->pt_id, args.vp_index, 1079 stats_pages); 1080 if (ret) 1081 goto unmap_ghcb_page; 1082 1083 vp = kzalloc_obj(*vp); 1084 if (!vp) 1085 goto unmap_stats_pages; 1086 1087 vp->vp_partition = mshv_partition_get(partition); 1088 if (!vp->vp_partition) { 1089 ret = -EBADF; 1090 goto free_vp; 1091 } 1092 1093 mutex_init(&vp->vp_mutex); 1094 init_waitqueue_head(&vp->run.vp_suspend_queue); 1095 atomic64_set(&vp->run.vp_signaled_count, 0); 1096 1097 vp->vp_index = args.vp_index; 1098 vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page); 1099 if (!mshv_partition_encrypted(partition)) 1100 vp->vp_register_page = page_to_virt(register_page); 1101 1102 if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) 1103 vp->vp_ghcb_page = page_to_virt(ghcb_page); 1104 1105 
memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages)); 1106 1107 ret = mshv_debugfs_vp_create(vp); 1108 if (ret) 1109 goto put_partition; 1110 1111 /* 1112 * Keep anon_inode_getfd last: it installs fd in the file struct and 1113 * thus makes the state accessible in user space. 1114 */ 1115 ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp, 1116 O_RDWR | O_CLOEXEC); 1117 if (ret < 0) 1118 goto remove_debugfs_vp; 1119 1120 /* already exclusive with the partition mutex for all ioctls */ 1121 partition->pt_vp_count++; 1122 partition->pt_vp_array[args.vp_index] = vp; 1123 1124 return ret; 1125 1126 remove_debugfs_vp: 1127 mshv_debugfs_vp_remove(vp); 1128 put_partition: 1129 mshv_partition_put(partition); 1130 free_vp: 1131 kfree(vp); 1132 unmap_stats_pages: 1133 mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages); 1134 unmap_ghcb_page: 1135 if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) 1136 hv_unmap_vp_state_page(partition->pt_id, args.vp_index, 1137 HV_VP_STATE_PAGE_GHCB, ghcb_page, 1138 input_vtl_normal); 1139 unmap_register_page: 1140 if (!mshv_partition_encrypted(partition)) 1141 hv_unmap_vp_state_page(partition->pt_id, args.vp_index, 1142 HV_VP_STATE_PAGE_REGISTERS, 1143 register_page, input_vtl_zero); 1144 unmap_intercept_message_page: 1145 hv_unmap_vp_state_page(partition->pt_id, args.vp_index, 1146 HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, 1147 intercept_msg_page, input_vtl_zero); 1148 destroy_vp: 1149 hv_call_delete_vp(partition->pt_id, args.vp_index); 1150 return ret; 1151 } 1152 1153 static int mshv_init_async_handler(struct mshv_partition *partition) 1154 { 1155 if (completion_done(&partition->async_hypercall)) { 1156 pt_err(partition, 1157 "Cannot issue async hypercall while another one in progress!\n"); 1158 return -EPERM; 1159 } 1160 1161 reinit_completion(&partition->async_hypercall); 1162 return 0; 1163 } 1164 1165 static void mshv_async_hvcall_handler(void *data, u64 *status) 1166 { 1167 struct mshv_partition 
*partition = data; 1168 1169 wait_for_completion(&partition->async_hypercall); 1170 pt_dbg(partition, "Async hypercall completed!\n"); 1171 1172 *status = partition->async_hypercall_status; 1173 } 1174 1175 /* 1176 * NB: caller checks and makes sure mem->size is page aligned 1177 * Returns: 0 with regionpp updated on success, or -errno 1178 */ 1179 static int mshv_partition_create_region(struct mshv_partition *partition, 1180 struct mshv_user_mem_region *mem, 1181 struct mshv_mem_region **regionpp, 1182 bool is_mmio) 1183 { 1184 struct mshv_mem_region *rg; 1185 u64 nr_pages = HVPFN_DOWN(mem->size); 1186 1187 /* Reject overlapping regions */ 1188 spin_lock(&partition->pt_mem_regions_lock); 1189 hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) { 1190 if (mem->guest_pfn + nr_pages <= rg->start_gfn || 1191 rg->start_gfn + rg->nr_pages <= mem->guest_pfn) 1192 continue; 1193 spin_unlock(&partition->pt_mem_regions_lock); 1194 return -EEXIST; 1195 } 1196 spin_unlock(&partition->pt_mem_regions_lock); 1197 1198 rg = mshv_region_create(mem->guest_pfn, nr_pages, 1199 mem->userspace_addr, mem->flags); 1200 if (IS_ERR(rg)) 1201 return PTR_ERR(rg); 1202 1203 if (is_mmio) 1204 rg->mreg_type = MSHV_REGION_TYPE_MMIO; 1205 else if (mshv_partition_encrypted(partition) || 1206 !mshv_region_movable_init(rg)) 1207 rg->mreg_type = MSHV_REGION_TYPE_MEM_PINNED; 1208 else 1209 rg->mreg_type = MSHV_REGION_TYPE_MEM_MOVABLE; 1210 1211 rg->partition = partition; 1212 1213 *regionpp = rg; 1214 1215 return 0; 1216 } 1217 1218 /** 1219 * mshv_prepare_pinned_region - Pin and map memory regions 1220 * @region: Pointer to the memory region structure 1221 * 1222 * This function processes memory regions that are explicitly marked as pinned. 1223 * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based 1224 * population. 
The function ensures the region is properly populated, handles 1225 * encryption requirements for SNP partitions if applicable, maps the region, 1226 * and performs necessary sharing or eviction operations based on the mapping 1227 * result. 1228 * 1229 * Return: 0 on success, negative error code on failure. 1230 */ 1231 static int mshv_prepare_pinned_region(struct mshv_mem_region *region) 1232 { 1233 struct mshv_partition *partition = region->partition; 1234 int ret; 1235 1236 ret = mshv_region_pin(region); 1237 if (ret) { 1238 pt_err(partition, "Failed to pin memory region: %d\n", 1239 ret); 1240 goto err_out; 1241 } 1242 1243 /* 1244 * For an SNP partition it is a requirement that for every memory region 1245 * that we are going to map for this partition we should make sure that 1246 * host access to that region is released. This is ensured by doing an 1247 * additional hypercall which will update the SLAT to release host 1248 * access to guest memory regions. 1249 */ 1250 if (mshv_partition_encrypted(partition)) { 1251 ret = mshv_region_unshare(region); 1252 if (ret) { 1253 pt_err(partition, 1254 "Failed to unshare memory region (guest_pfn: %llu): %d\n", 1255 region->start_gfn, ret); 1256 goto invalidate_region; 1257 } 1258 } 1259 1260 ret = mshv_region_map(region); 1261 if (ret && mshv_partition_encrypted(partition)) { 1262 int shrc; 1263 1264 shrc = mshv_region_share(region); 1265 if (!shrc) 1266 goto invalidate_region; 1267 1268 pt_err(partition, 1269 "Failed to share memory region (guest_pfn: %llu): %d\n", 1270 region->start_gfn, shrc); 1271 /* 1272 * Don't unpin if marking shared failed because pages are no 1273 * longer mapped in the host, ie root, anymore. 1274 */ 1275 goto err_out; 1276 } 1277 1278 return 0; 1279 1280 invalidate_region: 1281 mshv_region_invalidate(region); 1282 err_out: 1283 return ret; 1284 } 1285 1286 /* 1287 * This maps two things: guest RAM and for pci passthru mmio space. 
1288 * 1289 * mmio: 1290 * - vfio overloads vm_pgoff to store the mmio start pfn/spa. 1291 * - Two things need to happen for mapping mmio range: 1292 * 1. mapped in the uaddr so VMM can access it. 1293 * 2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it. 1294 * 1295 * This function takes care of the second. The first one is managed by vfio, 1296 * and hence is taken care of via vfio_pci_mmap_fault(). 1297 */ 1298 static long 1299 mshv_map_user_memory(struct mshv_partition *partition, 1300 struct mshv_user_mem_region *mem) 1301 { 1302 struct mshv_mem_region *region; 1303 struct vm_area_struct *vma; 1304 bool is_mmio; 1305 ulong mmio_pfn; 1306 long ret; 1307 1308 if (mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP) || 1309 !access_ok((const void __user *)mem->userspace_addr, mem->size)) 1310 return -EINVAL; 1311 1312 mmap_read_lock(current->mm); 1313 vma = vma_lookup(current->mm, mem->userspace_addr); 1314 is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0; 1315 mmio_pfn = is_mmio ? vma->vm_pgoff : 0; 1316 mmap_read_unlock(current->mm); 1317 1318 if (!vma) 1319 return -EINVAL; 1320 1321 ret = mshv_partition_create_region(partition, mem, ®ion, 1322 is_mmio); 1323 if (ret) 1324 return ret; 1325 1326 switch (region->mreg_type) { 1327 case MSHV_REGION_TYPE_MEM_PINNED: 1328 ret = mshv_prepare_pinned_region(region); 1329 break; 1330 case MSHV_REGION_TYPE_MEM_MOVABLE: 1331 /* 1332 * For movable memory regions, remap with no access to let 1333 * the hypervisor track dirty pages, enabling pre-copy live 1334 * migration. 
1335 */ 1336 ret = hv_call_map_gpa_pages(partition->pt_id, 1337 region->start_gfn, 1338 region->nr_pages, 1339 HV_MAP_GPA_NO_ACCESS, NULL); 1340 break; 1341 case MSHV_REGION_TYPE_MMIO: 1342 ret = hv_call_map_mmio_pages(partition->pt_id, 1343 region->start_gfn, 1344 mmio_pfn, 1345 region->nr_pages); 1346 break; 1347 } 1348 1349 if (ret) 1350 goto errout; 1351 1352 spin_lock(&partition->pt_mem_regions_lock); 1353 hlist_add_head(®ion->hnode, &partition->pt_mem_regions); 1354 spin_unlock(&partition->pt_mem_regions_lock); 1355 1356 return 0; 1357 1358 errout: 1359 mshv_region_put(region); 1360 return ret; 1361 } 1362 1363 /* Called for unmapping both the guest ram and the mmio space */ 1364 static long 1365 mshv_unmap_user_memory(struct mshv_partition *partition, 1366 struct mshv_user_mem_region *mem) 1367 { 1368 struct mshv_mem_region *region; 1369 1370 if (!(mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP))) 1371 return -EINVAL; 1372 1373 spin_lock(&partition->pt_mem_regions_lock); 1374 1375 region = mshv_partition_region_by_gfn(partition, mem->guest_pfn); 1376 if (!region) { 1377 spin_unlock(&partition->pt_mem_regions_lock); 1378 return -ENOENT; 1379 } 1380 1381 /* Paranoia check */ 1382 if (region->start_uaddr != mem->userspace_addr || 1383 region->start_gfn != mem->guest_pfn || 1384 region->nr_pages != HVPFN_DOWN(mem->size)) { 1385 spin_unlock(&partition->pt_mem_regions_lock); 1386 return -EINVAL; 1387 } 1388 1389 hlist_del(®ion->hnode); 1390 1391 spin_unlock(&partition->pt_mem_regions_lock); 1392 1393 mshv_region_put(region); 1394 1395 return 0; 1396 } 1397 1398 static long 1399 mshv_partition_ioctl_set_memory(struct mshv_partition *partition, 1400 struct mshv_user_mem_region __user *user_mem) 1401 { 1402 struct mshv_user_mem_region mem; 1403 1404 if (copy_from_user(&mem, user_mem, sizeof(mem))) 1405 return -EFAULT; 1406 1407 if (!mem.size || 1408 !PAGE_ALIGNED(mem.size) || 1409 !PAGE_ALIGNED(mem.userspace_addr) || 1410 (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) || 1411 
mshv_field_nonzero(mem, rsvd)) 1412 return -EINVAL; 1413 1414 if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)) 1415 return mshv_unmap_user_memory(partition, &mem); 1416 1417 return mshv_map_user_memory(partition, &mem); 1418 } 1419 1420 static long 1421 mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition, 1422 void __user *user_args) 1423 { 1424 struct mshv_user_ioeventfd args; 1425 1426 if (copy_from_user(&args, user_args, sizeof(args))) 1427 return -EFAULT; 1428 1429 return mshv_set_unset_ioeventfd(partition, &args); 1430 } 1431 1432 static long 1433 mshv_partition_ioctl_irqfd(struct mshv_partition *partition, 1434 void __user *user_args) 1435 { 1436 struct mshv_user_irqfd args; 1437 1438 if (copy_from_user(&args, user_args, sizeof(args))) 1439 return -EFAULT; 1440 1441 return mshv_set_unset_irqfd(partition, &args); 1442 } 1443 1444 static long 1445 mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition, 1446 void __user *user_args) 1447 { 1448 struct mshv_gpap_access_bitmap args; 1449 union hv_gpa_page_access_state *states; 1450 long ret, i; 1451 union hv_gpa_page_access_state_flags hv_flags = {}; 1452 u8 hv_type_mask; 1453 ulong bitmap_buf_sz, states_buf_sz; 1454 int written = 0; 1455 1456 if (copy_from_user(&args, user_args, sizeof(args))) 1457 return -EFAULT; 1458 1459 if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT || 1460 args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT || 1461 mshv_field_nonzero(args, rsvd) || !args.page_count || 1462 !args.bitmap_ptr) 1463 return -EINVAL; 1464 1465 if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz)) 1466 return -E2BIG; 1467 1468 /* Num bytes needed to store bitmap; one bit per page rounded up */ 1469 bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8); 1470 1471 /* Sanity check */ 1472 if (bitmap_buf_sz > states_buf_sz) 1473 return -EBADFD; 1474 1475 switch (args.access_type) { 1476 case MSHV_GPAP_ACCESS_TYPE_ACCESSED: 1477 hv_type_mask = 1; 1478 if (args.access_op == 
		    MSHV_GPAP_ACCESS_OP_CLEAR) {
			hv_flags.clear_accessed = 1;
			/* not accessed implies not dirty */
			hv_flags.clear_dirty = 1;
		} else { /* MSHV_GPAP_ACCESS_OP_SET */
			hv_flags.set_accessed = 1;
		}
		break;
	case MSHV_GPAP_ACCESS_TYPE_DIRTY:
		hv_type_mask = 2;
		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
			hv_flags.clear_dirty = 1;
		} else { /* MSHV_GPAP_ACCESS_OP_SET */
			hv_flags.set_dirty = 1;
			/* dirty implies accessed */
			hv_flags.set_accessed = 1;
		}
		break;
	}

	/* One hv_gpa_page_access_state per page; later reused as the bitmap */
	states = vzalloc(states_buf_sz);
	if (!states)
		return -ENOMEM;

	ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count,
					    args.gpap_base, hv_flags, &written,
					    states);
	if (ret)
		goto free_return;

	/*
	 * Overwrite states buffer with bitmap - the bits in hv_type_mask
	 * correspond to bitfields in hv_gpa_page_access_state
	 */
	for (i = 0; i < written; ++i)
		__assign_bit(i, (ulong *)states,
			     states[i].as_uint8 & hv_type_mask);

	/* zero the unused bits in the last byte(s) of the returned bitmap */
	for (i = written; i < bitmap_buf_sz * 8; ++i)
		__clear_bit(i, (ulong *)states);

	if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz))
		ret = -EFAULT;

free_return:
	vfree(states);
	return ret;
}

/*
 * Handle MSHV_SET_MSI_ROUTING: copy the user routing table (if any) and
 * install it on the partition. An empty table (nr == 0) is allowed.
 */
static long
mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition,
				     void __user *user_args)
{
	struct mshv_user_irq_entry *entries = NULL;
	struct mshv_user_irq_table args;
	long ret;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.nr > MSHV_MAX_GUEST_IRQS ||
	    mshv_field_nonzero(args, rsvd))
		return -EINVAL;

	if (args.nr) {
		struct mshv_user_irq_table __user *urouting = user_args;

		entries = vmemdup_user(urouting->entries,
				       array_size(sizeof(*entries),
						  args.nr));
		if (IS_ERR(entries))
			return PTR_ERR(entries);
	}
	ret = mshv_update_routing_table(partition, entries, args.nr);
	kvfree(entries);

	return ret;
}

/*
 * Handle MSHV_INITIALIZE_PARTITION. Idempotent: returns 0 immediately if
 * the partition is already initialized. On failure the partition is
 * finalized and deposited memory withdrawn.
 */
static long
mshv_partition_ioctl_initialize(struct mshv_partition *partition)
{
	long ret;

	if (partition->pt_initialized)
		return 0;

	ret = hv_call_initialize_partition(partition->pt_id);
	if (ret)
		goto withdraw_mem;

	ret = mshv_debugfs_partition_create(partition);
	if (ret)
		goto finalize_partition;

	partition->pt_initialized = true;

	return 0;

finalize_partition:
	hv_call_finalize_partition(partition->pt_id);
withdraw_mem:
	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);

	return ret;
}

/*
 * Top-level dispatcher for partition-fd ioctls. All handlers run under
 * the partition mutex, taken killably here.
 */
static long
mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct mshv_partition *partition = filp->private_data;
	long ret;
	void __user *uarg = (void __user *)arg;

	if (mutex_lock_killable(&partition->pt_mutex))
		return -EINTR;

	switch (ioctl) {
	case MSHV_INITIALIZE_PARTITION:
		ret = mshv_partition_ioctl_initialize(partition);
		break;
	case MSHV_SET_GUEST_MEMORY:
		ret = mshv_partition_ioctl_set_memory(partition, uarg);
		break;
	case MSHV_CREATE_VP:
		ret = mshv_partition_ioctl_create_vp(partition, uarg);
		break;
	case MSHV_IRQFD:
		ret = mshv_partition_ioctl_irqfd(partition, uarg);
		break;
	case MSHV_IOEVENTFD:
		ret = mshv_partition_ioctl_ioeventfd(partition, uarg);
		break;
	case MSHV_SET_MSI_ROUTING:
		ret = mshv_partition_ioctl_set_msi_routing(partition, uarg);
		break;
	case MSHV_GET_GPAP_ACCESS_BITMAP:
		ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition,
								  uarg);
		break;
	case MSHV_ROOT_HVCALL:
		ret = mshv_ioctl_passthru_hvcall(partition, true, uarg);
		break;
	default:
		ret = -ENOTTY;
	}

	mutex_unlock(&partition->pt_mutex);
	return ret;
}

/* Suspend dispatching of a VP via its HV_REGISTER_DISPATCH_SUSPEND register */
static int
disable_vp_dispatch(struct mshv_vp *vp)
{
	int ret;
	struct hv_register_assoc dispatch_suspend = {
		.name = HV_REGISTER_DISPATCH_SUSPEND,
		.value.dispatch_suspend.suspended = 1,
	};

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &dispatch_suspend);
	if (ret)
		vp_err(vp, "failed to suspend\n");

	return ret;
}

/*
 * Read the hypervisor's root signal count for a VP.
 * On failure *count is set to 0 and the error is returned.
 */
static int
get_vp_signaled_count(struct mshv_vp *vp, u64 *count)
{
	int ret;
	struct hv_register_assoc root_signal_count = {
		.name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT,
	};

	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &root_signal_count);

	if (ret) {
		vp_err(vp, "Failed to get root signal count");
		*count = 0;
		return ret;
	}

	*count = root_signal_count.value.reg64;

	return ret;
}

/*
 * Wait until the locally observed signal count matches the hypervisor's,
 * i.e. no VP notification remains in flight. Bails out early if the wait
 * is interrupted by a signal.
 */
static void
drain_vp_signals(struct mshv_vp *vp)
{
	u64 hv_signal_count;
	u64 vp_signal_count;

	get_vp_signaled_count(vp, &hv_signal_count);

	vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);

	/*
	 * There should be at most 1 outstanding notification, but be extra
	 * careful anyway.
	 */
	while (hv_signal_count != vp_signal_count) {
		WARN_ON(hv_signal_count - vp_signal_count != 1);

		if (wait_event_interruptible(vp->run.vp_suspend_queue,
					     vp->run.kicked_by_hv == 1))
			break;
		vp->run.kicked_by_hv = 0;
		vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
	}
}

static void drain_all_vps(const struct mshv_partition *partition)
{
	int i;
	struct mshv_vp *vp;

	/*
	 * VPs are reachable from ISR.
	 * It is safe to not take the partition
	 * lock because nobody else can enter this function and drop the
	 * partition from the list.
	 */
	for (i = 0; i < MSHV_MAX_VPS; i++) {
		vp = partition->pt_vp_array[i];
		if (!vp)
			continue;
		/*
		 * Disable dispatching of the VP in the hypervisor. After this
		 * the hypervisor guarantees it won't generate any signals for
		 * the VP and the hypervisor's VP signal count won't change.
		 */
		disable_vp_dispatch(vp);
		drain_vp_signals(vp);
	}
}

/* Unlink the partition from the hash table and wait out RCU readers */
static void
remove_partition(struct mshv_partition *partition)
{
	spin_lock(&mshv_root.pt_ht_lock);
	hlist_del_rcu(&partition->pt_hnode);
	spin_unlock(&mshv_root.pt_ht_lock);

	/* Readers that found us via mshv_partition_find() may still be active */
	synchronize_rcu();
}

/*
 * Tear down a partition and remove it from the list.
 * Partition's refcount must be 0
 */
static void destroy_partition(struct mshv_partition *partition)
{
	struct mshv_vp *vp;
	struct mshv_mem_region *region;
	struct hlist_node *n;
	int i;

	if (refcount_read(&partition->pt_ref_count)) {
		pt_err(partition,
		       "Attempt to destroy partition but refcount > 0\n");
		return;
	}

	if (partition->pt_initialized) {
		/*
		 * We only need to drain signals for root scheduler. This should be
		 * done before removing the partition from the partition list.
1749 */ 1750 if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) 1751 drain_all_vps(partition); 1752 1753 /* Remove vps */ 1754 for (i = 0; i < MSHV_MAX_VPS; ++i) { 1755 vp = partition->pt_vp_array[i]; 1756 if (!vp) 1757 continue; 1758 1759 mshv_debugfs_vp_remove(vp); 1760 mshv_vp_stats_unmap(partition->pt_id, vp->vp_index, 1761 vp->vp_stats_pages); 1762 1763 if (vp->vp_register_page) { 1764 (void)hv_unmap_vp_state_page(partition->pt_id, 1765 vp->vp_index, 1766 HV_VP_STATE_PAGE_REGISTERS, 1767 virt_to_page(vp->vp_register_page), 1768 input_vtl_zero); 1769 vp->vp_register_page = NULL; 1770 } 1771 1772 (void)hv_unmap_vp_state_page(partition->pt_id, 1773 vp->vp_index, 1774 HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, 1775 virt_to_page(vp->vp_intercept_msg_page), 1776 input_vtl_zero); 1777 vp->vp_intercept_msg_page = NULL; 1778 1779 if (vp->vp_ghcb_page) { 1780 (void)hv_unmap_vp_state_page(partition->pt_id, 1781 vp->vp_index, 1782 HV_VP_STATE_PAGE_GHCB, 1783 virt_to_page(vp->vp_ghcb_page), 1784 input_vtl_normal); 1785 vp->vp_ghcb_page = NULL; 1786 } 1787 1788 kfree(vp); 1789 1790 partition->pt_vp_array[i] = NULL; 1791 } 1792 1793 mshv_debugfs_partition_remove(partition); 1794 1795 /* Deallocates and unmaps everything including vcpus, GPA mappings etc */ 1796 hv_call_finalize_partition(partition->pt_id); 1797 1798 partition->pt_initialized = false; 1799 } 1800 1801 remove_partition(partition); 1802 1803 hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions, 1804 hnode) { 1805 hlist_del(®ion->hnode); 1806 mshv_region_put(region); 1807 } 1808 1809 /* Withdraw and free all pages we deposited */ 1810 hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); 1811 hv_call_delete_partition(partition->pt_id); 1812 1813 mshv_free_routing_table(partition); 1814 kfree(partition); 1815 } 1816 1817 struct 1818 mshv_partition *mshv_partition_get(struct mshv_partition *partition) 1819 { 1820 if (refcount_inc_not_zero(&partition->pt_ref_count)) 1821 return partition; 1822 return 
NULL; 1823 } 1824 1825 struct 1826 mshv_partition *mshv_partition_find(u64 partition_id) 1827 __must_hold(RCU) 1828 { 1829 struct mshv_partition *p; 1830 1831 hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode, 1832 partition_id) 1833 if (p->pt_id == partition_id) 1834 return p; 1835 1836 return NULL; 1837 } 1838 1839 void 1840 mshv_partition_put(struct mshv_partition *partition) 1841 { 1842 if (refcount_dec_and_test(&partition->pt_ref_count)) 1843 destroy_partition(partition); 1844 } 1845 1846 static int 1847 mshv_partition_release(struct inode *inode, struct file *filp) 1848 { 1849 struct mshv_partition *partition = filp->private_data; 1850 1851 mshv_eventfd_release(partition); 1852 1853 cleanup_srcu_struct(&partition->pt_irq_srcu); 1854 1855 mshv_partition_put(partition); 1856 1857 return 0; 1858 } 1859 1860 static int 1861 add_partition(struct mshv_partition *partition) 1862 { 1863 spin_lock(&mshv_root.pt_ht_lock); 1864 1865 hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode, 1866 partition->pt_id); 1867 1868 spin_unlock(&mshv_root.pt_ht_lock); 1869 1870 return 0; 1871 } 1872 1873 static_assert(MSHV_NUM_CPU_FEATURES_BANKS == 1874 HV_PARTITION_PROCESSOR_FEATURES_BANKS); 1875 1876 static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags, 1877 struct hv_partition_creation_properties *cr_props, 1878 union hv_partition_isolation_properties *isol_props) 1879 { 1880 int i; 1881 struct mshv_create_partition_v2 args; 1882 union hv_partition_processor_features *disabled_procs; 1883 union hv_partition_processor_xsave_features *disabled_xsave; 1884 1885 /* First, copy v1 struct in case user is on previous versions */ 1886 if (copy_from_user(&args, user_arg, 1887 sizeof(struct mshv_create_partition))) 1888 return -EFAULT; 1889 1890 if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || 1891 args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) 1892 return -EINVAL; 1893 1894 disabled_procs = &cr_props->disabled_processor_features; 1895 disabled_xsave = 
&cr_props->disabled_processor_xsave_features;

	/* Check if user provided newer struct with feature fields */
	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) {
		/* v2: re-copy the full struct including the feature banks */
		if (copy_from_user(&args, user_arg, sizeof(args)))
			return -EFAULT;

		/* Re-validate v1 fields after second copy_from_user() */
		if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
		    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
			return -EINVAL;

		if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS ||
		    mshv_field_nonzero(args, pt_rsvd) ||
		    mshv_field_nonzero(args, pt_rsvd1))
			return -EINVAL;

		/*
		 * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never
		 * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS
		 * (i.e. 2).
		 *
		 * Further banks (index >= 2) will be modifiable as 'early'
		 * properties via the set partition property hypercall.
		 */
		for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
			disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i];

#if IS_ENABLED(CONFIG_X86_64)
		disabled_xsave->as_uint64 = args.pt_disabled_xsave;
#else
		/*
		 * In practice this field is ignored on arm64, but safer to
		 * zero it in case it is ever used.
		 */
		disabled_xsave->as_uint64 = 0;

		if (mshv_field_nonzero(args, pt_rsvd2))
			return -EINVAL;
#endif
	} else {
		/*
		 * v1 behavior: try to enable everything. The hypervisor will
		 * disable features that are not supported. The banks can be
		 * queried via the get partition property hypercall.
1940 */ 1941 for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) 1942 disabled_procs->as_uint64[i] = 0; 1943 1944 disabled_xsave->as_uint64 = 0; 1945 } 1946 1947 /* Only support EXO partitions */ 1948 *pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION | 1949 HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED; 1950 1951 if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC)) 1952 *pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED; 1953 if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC)) 1954 *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; 1955 if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES)) 1956 *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; 1957 if (args.pt_flags & BIT(MSHV_PT_BIT_NESTED_VIRTUALIZATION)) 1958 *pt_flags |= HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE; 1959 if (args.pt_flags & BIT(MSHV_PT_BIT_SMT_ENABLED_GUEST)) 1960 *pt_flags |= HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST; 1961 1962 isol_props->as_uint64 = 0; 1963 1964 switch (args.pt_isolation) { 1965 case MSHV_PT_ISOLATION_NONE: 1966 isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE; 1967 break; 1968 } 1969 1970 return 0; 1971 } 1972 1973 static long 1974 mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) 1975 { 1976 u64 creation_flags; 1977 struct hv_partition_creation_properties creation_properties; 1978 union hv_partition_isolation_properties isolation_properties; 1979 struct mshv_partition *partition; 1980 long ret; 1981 1982 ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags, 1983 &creation_properties, 1984 &isolation_properties); 1985 if (ret) 1986 return ret; 1987 1988 partition = kzalloc_obj(*partition); 1989 if (!partition) 1990 return -ENOMEM; 1991 1992 partition->pt_module_dev = module_dev; 1993 partition->isolation_type = isolation_properties.isolation_type; 1994 1995 refcount_set(&partition->pt_ref_count, 1); 1996 1997 mutex_init(&partition->pt_mutex); 1998 1999 
	mutex_init(&partition->pt_irq_lock);

	init_completion(&partition->async_hypercall);

	INIT_HLIST_HEAD(&partition->irq_ack_notifier_list);

	INIT_HLIST_HEAD(&partition->pt_devices);

	spin_lock_init(&partition->pt_mem_regions_lock);
	INIT_HLIST_HEAD(&partition->pt_mem_regions);

	mshv_eventfd_init(partition);

	ret = init_srcu_struct(&partition->pt_irq_srcu);
	if (ret)
		goto free_partition;

	ret = hv_call_create_partition(creation_flags,
				       creation_properties,
				       isolation_properties,
				       &partition->pt_id);
	if (ret)
		goto cleanup_irq_srcu;

	ret = add_partition(partition);
	if (ret)
		goto delete_partition;

	/* Installing the fd publishes the partition to user space */
	ret = mshv_init_async_handler(partition);
	if (!ret) {
		ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition",
							   &mshv_partition_fops,
							   partition, O_RDWR));
		if (ret >= 0)
			return ret;
	}
	/* fd installation or async-handler setup failed: unwind fully */
	remove_partition(partition);
delete_partition:
	hv_call_delete_partition(partition->pt_id);
cleanup_irq_srcu:
	cleanup_srcu_struct(&partition->pt_irq_srcu);
free_partition:
	kfree(partition);

	return ret;
}

/* ioctls on /dev/mshv itself (not on a partition or VP fd) */
static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
			   unsigned long arg)
{
	struct miscdevice *misc = filp->private_data;

	switch (ioctl) {
	case MSHV_CREATE_PARTITION:
		return mshv_ioctl_create_partition((void __user *)arg,
						   misc->this_device);
	case MSHV_ROOT_HVCALL:
		return mshv_ioctl_passthru_hvcall(NULL, false,
						  (void __user *)arg);
	}

	return -ENOTTY;
}

/* No per-open state is kept for /dev/mshv */
static int
mshv_dev_open(struct inode *inode, struct file *filp)
{
	return 0;
}

static int
mshv_dev_release(struct inode *inode, struct file *filp)
{
	return 0;
}

/* cpuhp dynamic state returned by cpuhp_setup_state() for the root scheduler */
static int mshv_root_sched_online;

/* Human-readable name for a hypervisor scheduler type (for logging) */
static const char *scheduler_type_to_string(enum hv_scheduler_type type)
2078 { 2079 switch (type) { 2080 case HV_SCHEDULER_TYPE_LP: 2081 return "classic scheduler without SMT"; 2082 case HV_SCHEDULER_TYPE_LP_SMT: 2083 return "classic scheduler with SMT"; 2084 case HV_SCHEDULER_TYPE_CORE_SMT: 2085 return "core scheduler"; 2086 case HV_SCHEDULER_TYPE_ROOT: 2087 return "root scheduler"; 2088 default: 2089 return "unknown scheduler"; 2090 }; 2091 } 2092 2093 static int __init l1vh_retrieve_scheduler_type(enum hv_scheduler_type *out) 2094 { 2095 u64 integrated_sched_enabled; 2096 int ret; 2097 2098 *out = HV_SCHEDULER_TYPE_CORE_SMT; 2099 2100 if (!mshv_root.vmm_caps.vmm_enable_integrated_scheduler) 2101 return 0; 2102 2103 ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF, 2104 HV_PARTITION_PROPERTY_INTEGRATED_SCHEDULER_ENABLED, 2105 0, &integrated_sched_enabled, 2106 sizeof(integrated_sched_enabled)); 2107 if (ret) 2108 return ret; 2109 2110 if (integrated_sched_enabled) 2111 *out = HV_SCHEDULER_TYPE_ROOT; 2112 2113 return 0; 2114 } 2115 2116 /* TODO move this to hv_common.c when needed outside */ 2117 static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out) 2118 { 2119 struct hv_input_get_system_property *input; 2120 struct hv_output_get_system_property *output; 2121 unsigned long flags; 2122 u64 status; 2123 2124 local_irq_save(flags); 2125 input = *this_cpu_ptr(hyperv_pcpu_input_arg); 2126 output = *this_cpu_ptr(hyperv_pcpu_output_arg); 2127 2128 memset(input, 0, sizeof(*input)); 2129 memset(output, 0, sizeof(*output)); 2130 input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE; 2131 2132 status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output); 2133 if (!hv_result_success(status)) { 2134 local_irq_restore(flags); 2135 pr_err("%s: %s\n", __func__, hv_result_to_string(status)); 2136 return hv_result_to_errno(status); 2137 } 2138 2139 *out = output->scheduler_type; 2140 local_irq_restore(flags); 2141 2142 return 0; 2143 } 2144 2145 /* Retrieve and stash the supported scheduler type */ 2146 static 
int __init mshv_retrieve_scheduler_type(struct device *dev) 2147 { 2148 int ret; 2149 2150 if (hv_l1vh_partition()) 2151 ret = l1vh_retrieve_scheduler_type(&hv_scheduler_type); 2152 else 2153 ret = hv_retrieve_scheduler_type(&hv_scheduler_type); 2154 if (ret) 2155 return ret; 2156 2157 dev_info(dev, "Hypervisor using %s\n", 2158 scheduler_type_to_string(hv_scheduler_type)); 2159 2160 switch (hv_scheduler_type) { 2161 case HV_SCHEDULER_TYPE_CORE_SMT: 2162 case HV_SCHEDULER_TYPE_LP_SMT: 2163 case HV_SCHEDULER_TYPE_ROOT: 2164 case HV_SCHEDULER_TYPE_LP: 2165 /* Supported scheduler, nothing to do */ 2166 break; 2167 default: 2168 dev_err(dev, "unsupported scheduler 0x%x, bailing.\n", 2169 hv_scheduler_type); 2170 return -EOPNOTSUPP; 2171 } 2172 2173 return 0; 2174 } 2175 2176 static int mshv_root_scheduler_init(unsigned int cpu) 2177 { 2178 void **inputarg, **outputarg, *p; 2179 2180 inputarg = (void **)this_cpu_ptr(root_scheduler_input); 2181 outputarg = (void **)this_cpu_ptr(root_scheduler_output); 2182 2183 /* Allocate two consecutive pages. One for input, one for output. 
*/ 2184 p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL); 2185 if (!p) 2186 return -ENOMEM; 2187 2188 *inputarg = p; 2189 *outputarg = (char *)p + HV_HYP_PAGE_SIZE; 2190 2191 return 0; 2192 } 2193 2194 static int mshv_root_scheduler_cleanup(unsigned int cpu) 2195 { 2196 void *p, **inputarg, **outputarg; 2197 2198 inputarg = (void **)this_cpu_ptr(root_scheduler_input); 2199 outputarg = (void **)this_cpu_ptr(root_scheduler_output); 2200 2201 p = *inputarg; 2202 2203 *inputarg = NULL; 2204 *outputarg = NULL; 2205 2206 kfree(p); 2207 2208 return 0; 2209 } 2210 2211 /* Must be called after retrieving the scheduler type */ 2212 static int 2213 root_scheduler_init(struct device *dev) 2214 { 2215 int ret; 2216 2217 if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) 2218 return 0; 2219 2220 root_scheduler_input = alloc_percpu(void *); 2221 root_scheduler_output = alloc_percpu(void *); 2222 2223 if (!root_scheduler_input || !root_scheduler_output) { 2224 dev_err(dev, "Failed to allocate root scheduler buffers\n"); 2225 ret = -ENOMEM; 2226 goto out; 2227 } 2228 2229 ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched", 2230 mshv_root_scheduler_init, 2231 mshv_root_scheduler_cleanup); 2232 2233 if (ret < 0) { 2234 dev_err(dev, "Failed to setup root scheduler state: %i\n", ret); 2235 goto out; 2236 } 2237 2238 mshv_root_sched_online = ret; 2239 2240 return 0; 2241 2242 out: 2243 free_percpu(root_scheduler_input); 2244 free_percpu(root_scheduler_output); 2245 return ret; 2246 } 2247 2248 static void 2249 root_scheduler_deinit(void) 2250 { 2251 if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) 2252 return; 2253 2254 cpuhp_remove_state(mshv_root_sched_online); 2255 free_percpu(root_scheduler_input); 2256 free_percpu(root_scheduler_output); 2257 } 2258 2259 static int __init mshv_init_vmm_caps(struct device *dev) 2260 { 2261 int ret; 2262 2263 ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF, 2264 HV_PARTITION_PROPERTY_VMM_CAPABILITIES, 2265 0, &mshv_root.vmm_caps, 
2266 sizeof(mshv_root.vmm_caps)); 2267 if (ret && hv_l1vh_partition()) { 2268 dev_err(dev, "Failed to get VMM capabilities: %d\n", ret); 2269 return ret; 2270 } 2271 2272 dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]); 2273 2274 return 0; 2275 } 2276 2277 static int __init mshv_parent_partition_init(void) 2278 { 2279 int ret; 2280 struct device *dev; 2281 union hv_hypervisor_version_info version_info; 2282 2283 if (!hv_parent_partition() || is_kdump_kernel()) 2284 return -ENODEV; 2285 2286 if (hv_get_hypervisor_version(&version_info)) 2287 return -ENODEV; 2288 2289 ret = misc_register(&mshv_dev); 2290 if (ret) 2291 return ret; 2292 2293 dev = mshv_dev.this_device; 2294 2295 if (version_info.build_number < MSHV_HV_MIN_VERSION || 2296 version_info.build_number > MSHV_HV_MAX_VERSION) { 2297 dev_err(dev, "Running on unvalidated Hyper-V version\n"); 2298 dev_err(dev, "Versions: current: %u min: %u max: %u\n", 2299 version_info.build_number, MSHV_HV_MIN_VERSION, 2300 MSHV_HV_MAX_VERSION); 2301 } 2302 2303 ret = mshv_synic_init(dev); 2304 if (ret) 2305 goto device_deregister; 2306 2307 ret = mshv_init_vmm_caps(dev); 2308 if (ret) 2309 goto synic_cleanup; 2310 2311 ret = mshv_retrieve_scheduler_type(dev); 2312 if (ret) 2313 goto synic_cleanup; 2314 2315 ret = root_scheduler_init(dev); 2316 if (ret) 2317 goto synic_cleanup; 2318 2319 ret = mshv_debugfs_init(); 2320 if (ret) 2321 goto deinit_root_scheduler; 2322 2323 ret = mshv_irqfd_wq_init(); 2324 if (ret) 2325 goto exit_debugfs; 2326 2327 spin_lock_init(&mshv_root.pt_ht_lock); 2328 hash_init(mshv_root.pt_htable); 2329 2330 hv_setup_mshv_handler(mshv_isr); 2331 2332 return 0; 2333 2334 exit_debugfs: 2335 mshv_debugfs_exit(); 2336 deinit_root_scheduler: 2337 root_scheduler_deinit(); 2338 synic_cleanup: 2339 mshv_synic_exit(); 2340 device_deregister: 2341 misc_deregister(&mshv_dev); 2342 return ret; 2343 } 2344 2345 static void __exit mshv_parent_partition_exit(void) 2346 { 2347 
hv_setup_mshv_handler(NULL); 2348 mshv_port_table_fini(); 2349 mshv_debugfs_exit(); 2350 misc_deregister(&mshv_dev); 2351 mshv_irqfd_wq_cleanup(); 2352 root_scheduler_deinit(); 2353 mshv_synic_exit(); 2354 } 2355 2356 module_init(mshv_parent_partition_init); 2357 module_exit(mshv_parent_partition_exit); 2358