// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024, Microsoft Corporation.
 *
 * The main part of the mshv_root module, providing APIs to create
 * and manage guest partitions.
 *
 * Authors: Microsoft Linux virtualization team
 */

#include <linux/entry-virt.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/cpuhotplug.h>
#include <linux/random.h>
#include <asm/mshyperv.h>
#include <linux/hyperv.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/kexec.h>
#include <linux/page-flags.h>
#include <linux/crash_dump.h>
#include <linux/panic_notifier.h>
#include <linux/vmalloc.h>

#include "mshv_eventfd.h"
#include "mshv.h"
#include "mshv_root.h"

MODULE_AUTHOR("Microsoft");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");

/* TODO move this to another file when debugfs code is added */
enum hv_stats_vp_counters {	/* HV_THREAD_COUNTER */
#if defined(CONFIG_X86)
	VpRootDispatchThreadBlocked	= 201,
#elif defined(CONFIG_ARM64)
	VpRootDispatchThreadBlocked	= 94,
#endif
	VpStatsMaxCounter
};

struct hv_stats_page {
	union {
		u64 vp_cntrs[VpStatsMaxCounter];	/* VP counters */
		u8 data[HV_HYP_PAGE_SIZE];
	};
} __packed;

struct mshv_root mshv_root;

enum hv_scheduler_type hv_scheduler_type;

/* Once we implement the fast extended hypercall ABI they can go away. */
static void * __percpu *root_scheduler_input;
static void * __percpu *root_scheduler_output;

static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_dev_open(struct inode *inode, struct file *filp);
static int mshv_dev_release(struct inode *inode, struct file *filp);
static int mshv_vp_release(struct inode *inode, struct file *filp);
static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_partition_release(struct inode *inode, struct file *filp);
static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma);
static vm_fault_t mshv_vp_fault(struct vm_fault *vmf);
static int mshv_init_async_handler(struct mshv_partition *partition);
static void mshv_async_hvcall_handler(void *data, u64 *status);

static const union hv_input_vtl input_vtl_zero;
static const union hv_input_vtl input_vtl_normal = {
	.target_vtl = HV_NORMAL_VTL,
	.use_target_vtl = 1,
};

static const struct vm_operations_struct mshv_vp_vm_ops = {
	.fault = mshv_vp_fault,
};

static const struct file_operations mshv_vp_fops = {
	.owner = THIS_MODULE,
	.release = mshv_vp_release,
	.unlocked_ioctl = mshv_vp_ioctl,
	.llseek = noop_llseek,
	.mmap = mshv_vp_mmap,
};

static const struct file_operations mshv_partition_fops = {
	.owner = THIS_MODULE,
	.release = mshv_partition_release,
	.unlocked_ioctl = mshv_partition_ioctl,
	.llseek = noop_llseek,
};

static const struct file_operations mshv_dev_fops = {
	.owner = THIS_MODULE,
	.open = mshv_dev_open,
	.release = mshv_dev_release,
	.unlocked_ioctl = mshv_dev_ioctl,
	.llseek = noop_llseek,
};
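
/*
 * Illustrative userspace flow (a sketch only, not part of this driver):
 * the argument structs come from the mshv uapi header and the variable
 * names below are placeholders; error handling is omitted.
 *
 *	int mshv = open("/dev/mshv", O_RDWR);
 *	int pt = ioctl(mshv, MSHV_CREATE_PARTITION, &create_args);
 *	ioctl(pt, MSHV_SET_GUEST_MEMORY, &mem);
 *	ioctl(pt, MSHV_INITIALIZE_PARTITION);
 *	int vp = ioctl(pt, MSHV_CREATE_VP, &vp_args);
 *	ioctl(vp, MSHV_RUN_VP, &intercept_msg);
 */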

static struct miscdevice mshv_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "mshv",
	.fops = &mshv_dev_fops,
	.mode = 0600,
};

/*
 * Only allow hypercalls that have a u64 partition id as the first member of
 * the input structure.
 * These are sorted by value.
 */
static u16 mshv_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_SET_PARTITION_PROPERTY,
	HVCALL_INSTALL_INTERCEPT,
	HVCALL_GET_VP_REGISTERS,
	HVCALL_SET_VP_REGISTERS,
	HVCALL_TRANSLATE_VIRTUAL_ADDRESS,
	HVCALL_CLEAR_VIRTUAL_INTERRUPT,
	HVCALL_REGISTER_INTERCEPT_RESULT,
	HVCALL_ASSERT_VIRTUAL_INTERRUPT,
	HVCALL_GET_GPA_PAGES_ACCESS_STATES,
	HVCALL_SIGNAL_EVENT_DIRECT,
	HVCALL_POST_MESSAGE_DIRECT,
	HVCALL_GET_VP_CPUID_VALUES,
};

static bool mshv_hvcall_is_async(u16 code)
{
	switch (code) {
	case HVCALL_SET_PARTITION_PROPERTY:
		return true;
	default:
		break;
	}
	return false;
}

static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
				      bool partition_locked,
				      void __user *user_args)
{
	u64 status;
	int ret = 0, i;
	bool is_async;
	struct mshv_root_hvcall args;
	struct page *page;
	unsigned int pages_order;
	void *input_pg = NULL;
	void *output_pg = NULL;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) ||
	    mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE)
		return -EINVAL;

	if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
		return -EINVAL;

	for (i = 0; i < ARRAY_SIZE(mshv_passthru_hvcalls); ++i)
		if (args.code == mshv_passthru_hvcalls[i])
			break;

	if (i >= ARRAY_SIZE(mshv_passthru_hvcalls))
		return -EINVAL;

	is_async = mshv_hvcall_is_async(args.code);
	if (is_async) {
		/* async hypercalls can only be called from partition fd */
		if (!partition_locked)
			return -EINVAL;
		ret = mshv_init_async_handler(partition);
		if (ret)
			return ret;
	}

	pages_order = args.out_ptr ? 1 : 0;
	page = alloc_pages(GFP_KERNEL, pages_order);
	if (!page)
		return -ENOMEM;
	input_pg = page_address(page);

	if (args.out_ptr)
		output_pg = (char *)input_pg + PAGE_SIZE;
	else
		output_pg = NULL;

	if (copy_from_user(input_pg, (void __user *)args.in_ptr,
			   args.in_sz)) {
		ret = -EFAULT;
		goto free_pages_out;
	}

	/*
	 * NOTE: This only works because all the allowed hypercalls' input
	 * structs begin with a u64 partition_id field.
	 */
	*(u64 *)input_pg = partition->pt_id;

	if (args.reps)
		status = hv_do_rep_hypercall(args.code, args.reps, 0,
					     input_pg, output_pg);
	else
		status = hv_do_hypercall(args.code, input_pg, output_pg);

	if (hv_result(status) == HV_STATUS_CALL_PENDING) {
		if (is_async) {
			mshv_async_hvcall_handler(partition, &status);
		} else { /* Paranoia check. This shouldn't happen! */
			ret = -EBADFD;
			goto free_pages_out;
		}
	}

	if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
		ret = hv_call_deposit_pages(NUMA_NO_NODE, partition->pt_id, 1);
		if (!ret)
			ret = -EAGAIN;
	} else if (!hv_result_success(status)) {
		ret = hv_result_to_errno(status);
	}

	/*
	 * Always return the status and output data regardless of result.
	 * The VMM may need it to determine how to proceed. E.g.
	 * the status may contain the number of reps completed if a rep
	 * hypercall partially succeeded.
	 */
	args.status = hv_result(status);
	args.reps = args.reps ? hv_repcomp(status) : 0;
	if (copy_to_user(user_args, &args, sizeof(args)))
		ret = -EFAULT;

	if (output_pg &&
	    copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
		ret = -EFAULT;

free_pages_out:
	free_pages((unsigned long)input_pg, pages_order);

	return ret;
}

static inline bool is_ghcb_mapping_available(void)
{
#if IS_ENABLED(CONFIG_X86_64)
	return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE;
#else
	return 0;
#endif
}

static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_get_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}

static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_set_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}

/*
 * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by
 * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend,
 * done by the hypervisor.
 * "Intercept" suspend leads to asynchronous message delivery to dom0 which
 * should be awaited to keep the VP loop consistent (i.e. no message pending
 * upon VP resume).
 * VP intercept suspend can't be done when the VP is explicitly suspended
 * already, so there are only two possible race scenarios:
 *   1. implicit suspend bit set -> explicit suspend bit set -> message sent
 *   2. implicit suspend bit set -> message sent -> explicit suspend bit set
 * Checking for the implicit suspend bit after the explicit suspend request
 * has succeeded covers both cases and lets us reliably determine whether
 * there is a message to receive and deliver to the VMM.
 */
static int
mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
{
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND
	};
	struct hv_register_assoc intercept_suspend = {
		.name = HV_REGISTER_INTERCEPT_SUSPEND
	};
	union hv_explicit_suspend_register *es =
		&explicit_suspend.value.explicit_suspend;
	union hv_intercept_suspend_register *is =
		&intercept_suspend.value.intercept_suspend;
	int ret;

	es->suspended = 1;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &explicit_suspend);
	if (ret) {
		vp_err(vp, "Failed to explicitly suspend vCPU\n");
		return ret;
	}

	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &intercept_suspend);
	if (ret) {
		vp_err(vp, "Failed to get intercept suspend state\n");
		return ret;
	}

	*message_in_flight = is->suspended;

	return 0;
}

/*
 * This function is used when VPs are scheduled by the hypervisor's
 * scheduler.
 *
 * Caller has to make sure the registers contain cleared
 * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers
 * exactly in this order (the hypervisor clears them sequentially) to avoid
 * potentially clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND after
 * the VP is released from HV_REGISTER_EXPLICIT_SUSPEND, which could happen
 * with the opposite order.
 */
static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
{
	long ret;
	struct hv_register_assoc suspend_regs[2] = {
		{ .name = HV_REGISTER_INTERCEPT_SUSPEND },
		{ .name = HV_REGISTER_EXPLICIT_SUSPEND }
	};
	size_t count = ARRAY_SIZE(suspend_regs);

	/* Resume VP execution */
	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    count, suspend_regs);
	if (ret) {
		vp_err(vp, "Failed to resume vp execution. %lx\n", ret);
		return ret;
	}

	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       vp->run.kicked_by_hv == 1);
	if (ret) {
		bool message_in_flight;

		/*
		 * Otherwise the waiting was interrupted by a signal: suspend
		 * the vCPU explicitly and copy message in flight (if any).
		 */
		ret = mshv_suspend_vp(vp, &message_in_flight);
		if (ret)
			return ret;

		/* Return if no message in flight */
		if (!message_in_flight)
			return -EINTR;

		/* Wait for the message in flight. */
		wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1);
	}

	/*
	 * Reset the flag to make the wait_event call above work
	 * next time.
	 */
	vp->run.kicked_by_hv = 0;

	return 0;
}

static int
mshv_vp_dispatch(struct mshv_vp *vp, u32 flags,
		 struct hv_output_dispatch_vp *res)
{
	struct hv_input_dispatch_vp *input;
	struct hv_output_dispatch_vp *output;
	u64 status;

	preempt_disable();
	input = *this_cpu_ptr(root_scheduler_input);
	output = *this_cpu_ptr(root_scheduler_output);

	memset(input, 0, sizeof(*input));
	memset(output, 0, sizeof(*output));

	input->partition_id = vp->vp_partition->pt_id;
	input->vp_index = vp->vp_index;
	input->time_slice = 0; /* Run forever until something happens */
	input->spec_ctrl = 0; /* TODO: set sensible flags */
	input->flags = flags;

	vp->run.flags.root_sched_dispatched = 1;
	status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output);
	vp->run.flags.root_sched_dispatched = 0;

	*res = *output;
	preempt_enable();

	if (!hv_result_success(status))
		vp_err(vp, "%s: status %s\n", __func__,
		       hv_result_to_string(status));

	return hv_result_to_errno(status);
}

static int
mshv_vp_clear_explicit_suspend(struct mshv_vp *vp)
{
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND,
		.value.explicit_suspend.suspended = 0,
	};
	int ret;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &explicit_suspend);

	if (ret)
		vp_err(vp, "Failed to unsuspend\n");

	return ret;
}

#if IS_ENABLED(CONFIG_X86_64)
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	if (!vp->vp_register_page)
		return 0;
	return vp->vp_register_page->interrupt_vectors.as_uint64;
}
#else
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	return 0;
}
#endif

static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp)
{
	struct hv_stats_page **stats = vp->vp_stats_pages;
	u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->vp_cntrs;
	u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->vp_cntrs;

	if (self_vp_cntrs[VpRootDispatchThreadBlocked])
		return self_vp_cntrs[VpRootDispatchThreadBlocked];
	return parent_vp_cntrs[VpRootDispatchThreadBlocked];
}

static int
mshv_vp_wait_for_hv_kick(struct mshv_vp *vp)
{
	int ret;
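
	/*
	 * Sleep until the hypervisor kicks this VP and its dispatch thread
	 * is no longer reported as blocked, or an interrupt is pending for
	 * it. A signal interrupts the wait and is reported as -EINTR.
	 */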
	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       (vp->run.kicked_by_hv == 1 &&
					!mshv_vp_dispatch_thread_blocked(vp)) ||
				       mshv_vp_interrupt_pending(vp));
	if (ret)
		return -EINTR;

	vp->run.flags.root_sched_blocked = 0;
	vp->run.kicked_by_hv = 0;

	return 0;
}

/* Must be called with interrupts enabled */
static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
{
	long ret;

	if (vp->run.flags.root_sched_blocked) {
		/*
		 * Dispatch state of this VP is blocked. Need to wait
		 * for the hypervisor to clear the blocked state before
		 * dispatching it.
		 */
		ret = mshv_vp_wait_for_hv_kick(vp);
		if (ret)
			return ret;
	}

	do {
		u32 flags = 0;
		struct hv_output_dispatch_vp output;

		if (__xfer_to_guest_mode_work_pending()) {
			ret = xfer_to_guest_mode_handle_work();
			if (ret)
				break;
		}

		if (vp->run.flags.intercept_suspend)
			flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND;

		if (mshv_vp_interrupt_pending(vp))
			flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION;

		ret = mshv_vp_dispatch(vp, flags, &output);
		if (ret)
			break;

		vp->run.flags.intercept_suspend = 0;

		if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) {
			if (output.dispatch_event ==
			    HV_VP_DISPATCH_EVENT_SUSPEND) {
				/*
				 * TODO: remove the warning once VP canceling
				 * is supported
				 */
				WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count),
					  "%s: vp#%d: unexpected explicit suspend\n",
					  __func__, vp->vp_index);
				/*
				 * Need to clear explicit suspend before
				 * dispatching.
				 * Explicit suspend is either:
				 * - set right after the first VP dispatch or
				 * - set explicitly via hypercall
				 * Since the latter case is not yet supported,
				 * simply clear it here.
				 */
				ret = mshv_vp_clear_explicit_suspend(vp);
				if (ret)
					break;

				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			} else {
				vp->run.flags.root_sched_blocked = 1;
				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			}
		} else {
			/* HV_VP_DISPATCH_STATE_READY */
			if (output.dispatch_event ==
			    HV_VP_DISPATCH_EVENT_INTERCEPT)
				vp->run.flags.intercept_suspend = 1;
		}
	} while (!vp->run.flags.intercept_suspend);

	return ret;
}

static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
	      "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");

static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
{
	long rc;

	if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
		rc = mshv_run_vp_with_root_scheduler(vp);
	else
		rc = mshv_run_vp_with_hyp_scheduler(vp);

	if (rc)
		return rc;

	if (copy_to_user(ret_msg, vp->vp_intercept_msg_page,
			 sizeof(struct hv_message)))
		rc = -EFAULT;

	return rc;
}

static int
mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp,
				struct hv_vp_state_data state_data,
				unsigned long user_pfn, size_t page_count,
				bool is_set)
{
	int completed, ret = 0;
	unsigned long check;
	struct page **pages;

	if (page_count > INT_MAX)
		return -EINVAL;
	/*
	 * Check the arithmetic for wraparound/overflow.
	 * The last page address in the buffer is:
	 * (user_pfn + (page_count - 1)) * PAGE_SIZE
	 */
	if (check_add_overflow(user_pfn, (page_count - 1), &check))
		return -EOVERFLOW;
	if (check_mul_overflow(check, PAGE_SIZE, &check))
		return -EOVERFLOW;

	/* Pin user pages so hypervisor can copy directly to them */
	pages = kcalloc(page_count, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	for (completed = 0; completed < page_count; completed += ret) {
		unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE;
		int remaining = page_count - completed;

		ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE,
					  &pages[completed]);
		if (ret < 0) {
			vp_err(vp, "%s: Failed to pin user pages error %i\n",
			       __func__, ret);
			goto unpin_pages;
		}
	}

	if (is_set)
		ret = hv_call_set_vp_state(vp->vp_index,
					   vp->vp_partition->pt_id,
					   state_data, page_count, pages,
					   0, NULL);
	else
		ret = hv_call_get_vp_state(vp->vp_index,
					   vp->vp_partition->pt_id,
					   state_data, page_count, pages,
					   NULL);

unpin_pages:
	unpin_user_pages(pages, completed);
	kfree(pages);
	return ret;
}

static long
mshv_vp_ioctl_get_set_state(struct mshv_vp *vp,
			    struct mshv_get_set_vp_state __user *user_args,
			    bool is_set)
{
	struct mshv_get_set_vp_state args;
	long ret = 0;
	union hv_output_get_vp_state vp_state;
	u32 data_sz;
	struct hv_vp_state_data state_data = {};

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) ||
	    !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) ||
	    !PAGE_ALIGNED(args.buf_ptr))
		return -EINVAL;

	if (!access_ok((void __user *)args.buf_ptr, args.buf_sz))
		return -EFAULT;

	switch (args.type) {
	case MSHV_VP_STATE_LAPIC:
		state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_XSAVE:
	{
		u64 data_sz_64;

		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_XSAVE_STATES,
						     &state_data.xsave.states.as_uint64);
		if (ret)
			return ret;

		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE,
						     &data_sz_64);
		if (ret)
			return ret;

		data_sz = (u32)data_sz_64;
		state_data.xsave.flags = 0;
		/* Always request legacy states */
		state_data.xsave.states.legacy_x87 = 1;
		state_data.xsave.states.legacy_sse = 1;
		state_data.type = HV_GET_SET_VP_STATE_XSAVE;
		break;
	}
	case MSHV_VP_STATE_SIMP:
		state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SIEFP:
		state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SYNTHETIC_TIMERS:
		state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS;
		data_sz = sizeof(vp_state.synthetic_timers_state);
		break;
	default:
		return -EINVAL;
	}

	if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz)))
		return -EFAULT;

	if (data_sz > args.buf_sz)
		return -EINVAL;

	/* If the data is transmitted via pfns, delegate to helper */
	if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) {
		unsigned long user_pfn = PFN_DOWN(args.buf_ptr);
		size_t page_count = PFN_DOWN(args.buf_sz);

		return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn,
						       page_count, is_set);
	}

	/* Paranoia check - this shouldn't happen! */
	if (data_sz > sizeof(vp_state)) {
		vp_err(vp, "Invalid vp state data size!\n");
		return -EINVAL;
	}

	if (is_set) {
		if (copy_from_user(&vp_state, (void __user *)args.buf_ptr, data_sz))
			return -EFAULT;

		return hv_call_set_vp_state(vp->vp_index,
					    vp->vp_partition->pt_id,
					    state_data, 0, NULL,
					    sizeof(vp_state), (u8 *)&vp_state);
	}

	ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id,
				   state_data, 0, NULL, &vp_state);
	if (ret)
		return ret;

	if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz))
		return -EFAULT;

	return 0;
}

static long
mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct mshv_vp *vp = filp->private_data;
	long r = -ENOTTY;

	if (mutex_lock_killable(&vp->vp_mutex))
		return -EINTR;

	switch (ioctl) {
	case MSHV_RUN_VP:
		r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg);
		break;
	case MSHV_GET_VP_STATE:
		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false);
		break;
	case MSHV_SET_VP_STATE:
		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true);
		break;
	case MSHV_ROOT_HVCALL:
		r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false,
					       (void __user *)arg);
		break;
	default:
		vp_warn(vp, "Invalid ioctl: %#x\n", ioctl);
		break;
	}
	mutex_unlock(&vp->vp_mutex);

	return r;
}

static vm_fault_t mshv_vp_fault(struct vm_fault *vmf)
{
	struct mshv_vp *vp = vmf->vma->vm_file->private_data;

	switch (vmf->vma->vm_pgoff) {
	case MSHV_VP_MMAP_OFFSET_REGISTERS:
		vmf->page = virt_to_page(vp->vp_register_page);
		break;
	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
		vmf->page = virt_to_page(vp->vp_intercept_msg_page);
		break;
	case MSHV_VP_MMAP_OFFSET_GHCB:
		vmf->page = virt_to_page(vp->vp_ghcb_page);
		break;
	default:
		return VM_FAULT_SIGBUS;
	}

	get_page(vmf->page);

	return 0;
}

static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct mshv_vp *vp = file->private_data;

	switch (vma->vm_pgoff) {
	case MSHV_VP_MMAP_OFFSET_REGISTERS:
		if (!vp->vp_register_page)
			return -ENODEV;
		break;
	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
		if (!vp->vp_intercept_msg_page)
			return -ENODEV;
		break;
	case MSHV_VP_MMAP_OFFSET_GHCB:
		if (!vp->vp_ghcb_page)
			return -ENODEV;
		break;
	default:
		return -EINVAL;
	}

	vma->vm_ops = &mshv_vp_vm_ops;
	return 0;
}

static int
mshv_vp_release(struct inode *inode, struct file *filp)
{
	struct mshv_vp *vp = filp->private_data;

	/* Rest of VP cleanup happens in destroy_partition() */
	mshv_partition_put(vp->vp_partition);
	return 0;
}

static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index)
{
	union hv_stats_object_identity identity = {
		.vp.partition_id = partition_id,
		.vp.vp_index = vp_index,
	};

	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);

	identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
	hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
}

static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
			     void *stats_pages[])
{
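	/*
	 * Map both the SELF and the PARENT stats areas for this VP. The
	 * dispatch-thread-blocked counter consulted by the run loop may be
	 * reported in either area, so both mappings are kept around.
	 */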
	union hv_stats_object_identity identity = {
		.vp.partition_id = partition_id,
		.vp.vp_index = vp_index,
	};
	int err;

	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity,
				    &stats_pages[HV_STATS_AREA_SELF]);
	if (err)
		return err;

	identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
	err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity,
				    &stats_pages[HV_STATS_AREA_PARENT]);
	if (err)
		goto unmap_self;

	return 0;

unmap_self:
	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
	return err;
}

static long
mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
			       void __user *arg)
{
	struct mshv_create_vp args;
	struct mshv_vp *vp;
	struct page *intercept_message_page, *register_page, *ghcb_page;
	void *stats_pages[2];
	long ret;

	if (copy_from_user(&args, arg, sizeof(args)))
		return -EFAULT;

	if (args.vp_index >= MSHV_MAX_VPS)
		return -EINVAL;

	if (partition->pt_vp_array[args.vp_index])
		return -EEXIST;

	ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index,
				0 /* Only valid for root partition VPs */);
	if (ret)
		return ret;

	ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
					HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
					input_vtl_zero,
					&intercept_message_page);
	if (ret)
		goto destroy_vp;

	if (!mshv_partition_encrypted(partition)) {
		ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
						HV_VP_STATE_PAGE_REGISTERS,
						input_vtl_zero,
						&register_page);
		if (ret)
			goto unmap_intercept_message_page;
	}

	if (mshv_partition_encrypted(partition) &&
	    is_ghcb_mapping_available()) {
		ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
						HV_VP_STATE_PAGE_GHCB,
						input_vtl_normal,
						&ghcb_page);
		if (ret)
			goto unmap_register_page;
	}

	if (hv_parent_partition()) {
		ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
					stats_pages);
		if (ret)
			goto unmap_ghcb_page;
	}

	vp = kzalloc(sizeof(*vp), GFP_KERNEL);
	if (!vp) {
		ret = -ENOMEM;
		goto unmap_stats_pages;
	}

	vp->vp_partition = mshv_partition_get(partition);
	if (!vp->vp_partition) {
		ret = -EBADF;
		goto free_vp;
	}

	mutex_init(&vp->vp_mutex);
	init_waitqueue_head(&vp->run.vp_suspend_queue);
	atomic64_set(&vp->run.vp_signaled_count, 0);

	vp->vp_index = args.vp_index;
	vp->vp_intercept_msg_page = page_to_virt(intercept_message_page);
	if (!mshv_partition_encrypted(partition))
		vp->vp_register_page = page_to_virt(register_page);

	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
		vp->vp_ghcb_page = page_to_virt(ghcb_page);

	if (hv_parent_partition())
		memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));

	/*
	 * Keep anon_inode_getfd last: it installs fd in the file struct and
	 * thus makes the state accessible in user space.
	 */
	ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp,
			       O_RDWR | O_CLOEXEC);
	if (ret < 0)
		goto put_partition;

	/* already exclusive with the partition mutex for all ioctls */
	partition->pt_vp_count++;
	partition->pt_vp_array[args.vp_index] = vp;

	return ret;

put_partition:
	mshv_partition_put(partition);
free_vp:
	kfree(vp);
unmap_stats_pages:
	if (hv_parent_partition())
		mshv_vp_stats_unmap(partition->pt_id, args.vp_index);
unmap_ghcb_page:
	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) {
		hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index,
					    HV_VP_STATE_PAGE_GHCB,
					    input_vtl_normal);
	}
unmap_register_page:
	if (!mshv_partition_encrypted(partition)) {
		hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index,
					    HV_VP_STATE_PAGE_REGISTERS,
					    input_vtl_zero);
	}
unmap_intercept_message_page:
	hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index,
				    HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
				    input_vtl_zero);
destroy_vp:
	hv_call_delete_vp(partition->pt_id, args.vp_index);
	return ret;
}

static int mshv_init_async_handler(struct mshv_partition *partition)
{
	if (completion_done(&partition->async_hypercall)) {
		pt_err(partition,
		       "Cannot issue async hypercall while another one in progress!\n");
		return -EPERM;
	}

	reinit_completion(&partition->async_hypercall);
	return 0;
}

static void mshv_async_hvcall_handler(void *data, u64 *status)
{
	struct mshv_partition *partition = data;

	wait_for_completion(&partition->async_hypercall);
	pt_dbg(partition, "Async hypercall completed!\n");

	*status = partition->async_hypercall_status;
}

static int
mshv_partition_region_share(struct mshv_mem_region *region)
{
	u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED;

	if (region->flags.large_pages)
		flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;

	return hv_call_modify_spa_host_access(region->partition->pt_id,
					      region->pages, region->nr_pages,
					      HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE,
					      flags, true);
}

static int
mshv_partition_region_unshare(struct mshv_mem_region *region)
{
	u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE;

	if (region->flags.large_pages)
		flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;

	return hv_call_modify_spa_host_access(region->partition->pt_id,
					      region->pages, region->nr_pages,
					      0,
					      flags, false);
}

static int
mshv_region_remap_pages(struct mshv_mem_region *region, u32 map_flags,
			u64 page_offset, u64 page_count)
{
	if (page_offset + page_count > region->nr_pages)
		return -EINVAL;

	if (region->flags.large_pages)
		map_flags |= HV_MAP_GPA_LARGE_PAGE;

	/* ask the hypervisor to map guest ram */
	return hv_call_map_gpa_pages(region->partition->pt_id,
				     region->start_gfn + page_offset,
				     page_count, map_flags,
				     region->pages + page_offset);
}

static int
mshv_region_map(struct mshv_mem_region *region)
{
	u32 map_flags = region->hv_map_flags;

	return mshv_region_remap_pages(region, map_flags,
				       0, region->nr_pages);
}

static void
mshv_region_evict_pages(struct mshv_mem_region *region,
			u64 page_offset, u64 page_count)
{
	if (region->flags.range_pinned)
		unpin_user_pages(region->pages + page_offset, page_count);

	memset(region->pages + page_offset, 0,
	       page_count * sizeof(struct page *));
}

static void
mshv_region_evict(struct mshv_mem_region *region)
{
	mshv_region_evict_pages(region, 0, region->nr_pages);
}

static int
mshv_region_populate_pages(struct mshv_mem_region *region,
			   u64 page_offset, u64 page_count)
{
	u64 done_count, nr_pages;
	struct page **pages;
	__u64 userspace_addr;
	int ret;

	if (page_offset + page_count > region->nr_pages)
		return -EINVAL;

	for (done_count = 0; done_count < page_count; done_count += ret) {
		pages = region->pages + page_offset + done_count;
		userspace_addr = region->start_uaddr +
				 (page_offset + done_count) *
				 HV_HYP_PAGE_SIZE;
		nr_pages = min(page_count - done_count,
			       MSHV_PIN_PAGES_BATCH_SIZE);

		/*
		 * Pinning assuming 4k pages works for large pages too.
		 * All page structs within the large page are returned.
		 *
		 * Pin requests are batched because pin_user_pages_fast
		 * with the FOLL_LONGTERM flag does a large temporary
		 * allocation of contiguous memory.
		 */
		if (region->flags.range_pinned)
			ret = pin_user_pages_fast(userspace_addr,
						  nr_pages,
						  FOLL_WRITE | FOLL_LONGTERM,
						  pages);
		else
			ret = -EOPNOTSUPP;

		if (ret < 0)
			goto release_pages;
	}

	if (PageHuge(region->pages[page_offset]))
		region->flags.large_pages = true;

	return 0;

release_pages:
	mshv_region_evict_pages(region, page_offset, done_count);
	return ret;
}

static int
mshv_region_populate(struct mshv_mem_region *region)
{
	return mshv_region_populate_pages(region, 0, region->nr_pages);
}

static struct mshv_mem_region *
mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
{
	struct mshv_mem_region *region;

	hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
		if (gfn >= region->start_gfn &&
		    gfn < region->start_gfn + region->nr_pages)
			return region;
	}

	return NULL;
}

static struct mshv_mem_region *
mshv_partition_region_by_uaddr(struct mshv_partition *partition, u64 uaddr)
{
	struct mshv_mem_region *region;

	hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
		if (uaddr >= region->start_uaddr &&
		    uaddr < region->start_uaddr +
			    (region->nr_pages << HV_HYP_PAGE_SHIFT))
			return region;
	}

	return NULL;
}

/*
 * NB: caller checks and makes sure mem->size is page aligned
 * Returns: 0 with regionpp updated on success, or -errno
 */
static int mshv_partition_create_region(struct mshv_partition *partition,
					struct mshv_user_mem_region *mem,
					struct mshv_mem_region **regionpp,
					bool is_mmio)
{
	struct mshv_mem_region *region;
	u64 nr_pages = HVPFN_DOWN(mem->size);

	/* Reject overlapping regions */
	if (mshv_partition_region_by_gfn(partition, mem->guest_pfn) ||
	    mshv_partition_region_by_gfn(partition, mem->guest_pfn + nr_pages - 1) ||
	    mshv_partition_region_by_uaddr(partition, mem->userspace_addr) ||
	    mshv_partition_region_by_uaddr(partition, mem->userspace_addr + mem->size - 1))
		return -EEXIST;

	region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages);
	if (!region)
		return -ENOMEM;

	region->nr_pages = nr_pages;
	region->start_gfn = mem->guest_pfn;
	region->start_uaddr = mem->userspace_addr;
	region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE;
	if (mem->flags & BIT(MSHV_SET_MEM_BIT_WRITABLE))
		region->hv_map_flags |= HV_MAP_GPA_WRITABLE;
	if (mem->flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE))
		region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE;

	/* Note: large_pages flag populated when we pin the pages */
	if (!is_mmio)
		region->flags.range_pinned = true;

	region->partition = partition;

	*regionpp = region;

	return 0;
}

/*
 * Map guest ram. If SNP, make sure to release it from the host first.
 * Side Effects: In case of failure, pages are unpinned when feasible.
 */
static int
mshv_partition_mem_region_map(struct mshv_mem_region *region)
{
	struct mshv_partition *partition = region->partition;
	int ret;

	ret = mshv_region_populate(region);
	if (ret) {
		pt_err(partition, "Failed to populate memory region: %d\n",
		       ret);
		goto err_out;
	}

	/*
	 * For an SNP partition it is a requirement that for every memory region
	 * that we are going to map for this partition we should make sure that
	 * host access to that region is released. This is ensured by doing an
	 * additional hypercall which will update the SLAT to release host
	 * access to guest memory regions.
	 */
	if (mshv_partition_encrypted(partition)) {
		ret = mshv_partition_region_unshare(region);
		if (ret) {
			pt_err(partition,
			       "Failed to unshare memory region (guest_pfn: %llu): %d\n",
			       region->start_gfn, ret);
			goto evict_region;
		}
	}

	ret = mshv_region_map(region);
	if (ret && mshv_partition_encrypted(partition)) {
		int shrc;

		shrc = mshv_partition_region_share(region);
		if (!shrc)
			goto evict_region;

		pt_err(partition,
		       "Failed to share memory region (guest_pfn: %llu): %d\n",
		       region->start_gfn, shrc);
		/*
		 * Don't unpin if marking shared failed because pages are no
		 * longer mapped in the host, ie root, anymore.
		 */
		goto err_out;
	}

	return 0;

evict_region:
	mshv_region_evict(region);
err_out:
	return ret;
}

/*
 * This maps two things: guest RAM and, for pci passthru, mmio space.
 *
 * mmio:
 * - vfio overloads vm_pgoff to store the mmio start pfn/spa.
 * - Two things need to happen for mapping mmio range:
 *   1. mapped in the uaddr so VMM can access it.
 *   2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it.
 *
 * This function takes care of the second. The first one is managed by vfio,
 * and hence is taken care of via vfio_pci_mmap_fault().
 */
static long
mshv_map_user_memory(struct mshv_partition *partition,
		     struct mshv_user_mem_region mem)
{
	struct mshv_mem_region *region;
	struct vm_area_struct *vma;
	bool is_mmio;
	ulong mmio_pfn;
	long ret;

	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
	    !access_ok((const void *)mem.userspace_addr, mem.size))
		return -EINVAL;

	mmap_read_lock(current->mm);
	vma = vma_lookup(current->mm, mem.userspace_addr);
	is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
	mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
	mmap_read_unlock(current->mm);

	if (!vma)
		return -EINVAL;

	ret = mshv_partition_create_region(partition, &mem, &region,
					   is_mmio);
	if (ret)
		return ret;

	if (is_mmio)
		ret = hv_call_map_mmio_pages(partition->pt_id, mem.guest_pfn,
					     mmio_pfn, HVPFN_DOWN(mem.size));
	else
		ret = mshv_partition_mem_region_map(region);

	if (ret)
		goto errout;

	/* Install the new region */
	hlist_add_head(&region->hnode, &partition->pt_mem_regions);

	return 0;

errout:
	vfree(region);
	return ret;
}

/* Called for unmapping both the guest ram and the mmio space */
static long
mshv_unmap_user_memory(struct mshv_partition *partition,
		       struct mshv_user_mem_region mem)
{
	struct mshv_mem_region *region;
	u32 unmap_flags = 0;

	if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
		return -EINVAL;

	region = mshv_partition_region_by_gfn(partition, mem.guest_pfn);
	if (!region)
		return -EINVAL;

	/* Paranoia check */
	if (region->start_uaddr != mem.userspace_addr ||
	    region->start_gfn != mem.guest_pfn ||
	    region->nr_pages != HVPFN_DOWN(mem.size))
		return -EINVAL;

	hlist_del(&region->hnode);

	if (region->flags.large_pages)
		unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;

	/* ignore unmap failures and continue as process may be exiting */
	hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn,
				region->nr_pages, unmap_flags);

	mshv_region_evict(region);

	vfree(region);
	return 0;
}

static long
mshv_partition_ioctl_set_memory(struct mshv_partition *partition,
				struct mshv_user_mem_region __user *user_mem)
{
	struct mshv_user_mem_region mem;

	if (copy_from_user(&mem, user_mem, sizeof(mem)))
		return -EFAULT;

	if (!mem.size ||
	    !PAGE_ALIGNED(mem.size) ||
	    !PAGE_ALIGNED(mem.userspace_addr) ||
	    (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) ||
	    mshv_field_nonzero(mem, rsvd))
		return -EINVAL;

	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))
		return mshv_unmap_user_memory(partition, mem);

	return mshv_map_user_memory(partition, mem);
}

static long
mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition,
			       void __user *user_args)
{
	struct mshv_user_ioeventfd args;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	return mshv_set_unset_ioeventfd(partition, &args);
}

static long
mshv_partition_ioctl_irqfd(struct mshv_partition *partition,
			   void __user *user_args)
{
	struct mshv_user_irqfd args;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	return mshv_set_unset_irqfd(partition, &args);
}

static long
mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition,
					    void __user *user_args)
{
	struct mshv_gpap_access_bitmap args;
	union hv_gpa_page_access_state *states;
	long ret, i;
	union hv_gpa_page_access_state_flags hv_flags = {};
	u8 hv_type_mask;
	ulong bitmap_buf_sz, states_buf_sz;
	int written = 0;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT ||
	    args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT ||
	    mshv_field_nonzero(args, rsvd) ||
	    !args.page_count || !args.bitmap_ptr)
		return -EINVAL;

	if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz))
		return -E2BIG;

	/* Num bytes needed to store bitmap; one bit per page rounded up */
	bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8);

	/* Sanity check */
	if (bitmap_buf_sz > states_buf_sz)
		return -EBADFD;

	switch (args.access_type) {
	case MSHV_GPAP_ACCESS_TYPE_ACCESSED:
		hv_type_mask = 1;
		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
			hv_flags.clear_accessed = 1;
			/* not accessed implies not dirty */
			hv_flags.clear_dirty = 1;
		} else { /* MSHV_GPAP_ACCESS_OP_SET */
			hv_flags.set_accessed = 1;
		}
		break;
	case MSHV_GPAP_ACCESS_TYPE_DIRTY:
		hv_type_mask = 2;
		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
			hv_flags.clear_dirty = 1;
		} else { /* MSHV_GPAP_ACCESS_OP_SET */
			hv_flags.set_dirty = 1;
			/* dirty implies accessed */
			hv_flags.set_accessed = 1;
		}
		break;
	}

	states = vzalloc(states_buf_sz);
	if (!states)
		return -ENOMEM;

	ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count,
					    args.gpap_base, hv_flags, &written,
					    states);
	if (ret)
		goto free_return;

	/*
	 * Overwrite states buffer with bitmap - the bits in hv_type_mask
	 * correspond to bitfields in hv_gpa_page_access_state
	 */
	for (i = 0; i < written; ++i)
		__assign_bit(i, (ulong *)states,
			     states[i].as_uint8 & hv_type_mask);

	/* zero the unused bits in the last byte(s) of the returned bitmap */
	for (i = written; i < bitmap_buf_sz * 8; ++i)
		__clear_bit(i, (ulong *)states);

	if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz))
		ret = -EFAULT;

free_return:
	vfree(states);
	return ret;
}

static long
mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition,
				     void __user *user_args)
{
	struct mshv_user_irq_entry *entries = NULL;
	struct mshv_user_irq_table args;
	long ret;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.nr > MSHV_MAX_GUEST_IRQS ||
	    mshv_field_nonzero(args, rsvd))
		return -EINVAL;

	if (args.nr) {
		struct mshv_user_irq_table __user *urouting = user_args;

		entries = vmemdup_user(urouting->entries,
				       array_size(sizeof(*entries),
						  args.nr));
		if (IS_ERR(entries))
			return PTR_ERR(entries);
	}
	ret = mshv_update_routing_table(partition, entries, args.nr);
	kvfree(entries);

	return ret;
}

static long
mshv_partition_ioctl_initialize(struct mshv_partition *partition)
{
	long ret;

	if (partition->pt_initialized)
		return 0;

	ret = hv_call_initialize_partition(partition->pt_id);
	if (ret)
		goto withdraw_mem;

	partition->pt_initialized = true;

	return 0;

withdraw_mem:
	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);

	return ret;
}

static long
mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct mshv_partition *partition = filp->private_data;
	long ret;
	void __user *uarg = (void __user *)arg;

	if (mutex_lock_killable(&partition->pt_mutex))
		return -EINTR;

	switch (ioctl) {
	case MSHV_INITIALIZE_PARTITION:
		ret = mshv_partition_ioctl_initialize(partition);
		break;
	case MSHV_SET_GUEST_MEMORY:
		ret = mshv_partition_ioctl_set_memory(partition, uarg);
		break;
	case MSHV_CREATE_VP:
		ret = mshv_partition_ioctl_create_vp(partition, uarg);
		break;
	case MSHV_IRQFD:
		ret = mshv_partition_ioctl_irqfd(partition, uarg);
		break;
	case MSHV_IOEVENTFD:
		ret = mshv_partition_ioctl_ioeventfd(partition, uarg);
		break;
	case MSHV_SET_MSI_ROUTING:
		ret = mshv_partition_ioctl_set_msi_routing(partition, uarg);
		break;
	case MSHV_GET_GPAP_ACCESS_BITMAP:
		ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition,
								  uarg);
		break;
	case MSHV_ROOT_HVCALL:
		ret = mshv_ioctl_passthru_hvcall(partition, true, uarg);
		break;
	default:
		ret = -ENOTTY;
	}

	mutex_unlock(&partition->pt_mutex);
	return ret;
}

static int
disable_vp_dispatch(struct mshv_vp *vp)
{
	int ret;
	struct hv_register_assoc dispatch_suspend = {
		.name = HV_REGISTER_DISPATCH_SUSPEND,
		.value.dispatch_suspend.suspended = 1,
	};

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &dispatch_suspend);
	if (ret)
		vp_err(vp, "failed to suspend\n");

	return ret;
}

static int
get_vp_signaled_count(struct mshv_vp *vp, u64 *count)
{
	int ret;
	struct hv_register_assoc root_signal_count = {
		.name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT,
	};

	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &root_signal_count);

	if (ret) {
		vp_err(vp, "Failed to get root signal count");
		*count = 0;
		return ret;
	}

	*count = root_signal_count.value.reg64;

	return ret;
}

static void
drain_vp_signals(struct mshv_vp *vp)
{
	u64 hv_signal_count;
	u64 vp_signal_count;

	get_vp_signaled_count(vp, &hv_signal_count);

	vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);

	/*
	 * There should be at most 1 outstanding notification, but be extra
	 * careful anyway.
	 */
	while (hv_signal_count != vp_signal_count) {
		WARN_ON(hv_signal_count - vp_signal_count != 1);

		if (wait_event_interruptible(vp->run.vp_suspend_queue,
					     vp->run.kicked_by_hv == 1))
			break;
		vp->run.kicked_by_hv = 0;
		vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
	}
}

static void drain_all_vps(const struct mshv_partition *partition)
{
	int i;
	struct mshv_vp *vp;

	/*
	 * VPs are reachable from ISR. It is safe to not take the partition
	 * lock because nobody else can enter this function and drop the
	 * partition from the list.
	 */
	for (i = 0; i < MSHV_MAX_VPS; i++) {
		vp = partition->pt_vp_array[i];
		if (!vp)
			continue;
		/*
		 * Disable dispatching of the VP in the hypervisor. After this
		 * the hypervisor guarantees it won't generate any signals for
		 * the VP and the hypervisor's VP signal count won't change.
		 */
		disable_vp_dispatch(vp);
		drain_vp_signals(vp);
	}
}

static void
remove_partition(struct mshv_partition *partition)
{
	spin_lock(&mshv_root.pt_ht_lock);
	hlist_del_rcu(&partition->pt_hnode);
	spin_unlock(&mshv_root.pt_ht_lock);

	synchronize_rcu();
}

/*
 * Tear down a partition and remove it from the list.
 * Partition's refcount must be 0
 */
static void destroy_partition(struct mshv_partition *partition)
{
	struct mshv_vp *vp;
	struct mshv_mem_region *region;
	int i, ret;
	struct hlist_node *n;

	if (refcount_read(&partition->pt_ref_count)) {
		pt_err(partition,
		       "Attempt to destroy partition but refcount > 0\n");
		return;
	}

	if (partition->pt_initialized) {
		/*
		 * We only need to drain signals for root scheduler. This should be
		 * done before removing the partition from the partition list.
		 */
		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
			drain_all_vps(partition);

		/* Remove vps */
		for (i = 0; i < MSHV_MAX_VPS; ++i) {
			vp = partition->pt_vp_array[i];
			if (!vp)
				continue;

			if (hv_parent_partition())
				mshv_vp_stats_unmap(partition->pt_id, vp->vp_index);

			if (vp->vp_register_page) {
				(void)hv_call_unmap_vp_state_page(partition->pt_id,
								  vp->vp_index,
								  HV_VP_STATE_PAGE_REGISTERS,
								  input_vtl_zero);
				vp->vp_register_page = NULL;
			}

			(void)hv_call_unmap_vp_state_page(partition->pt_id,
							  vp->vp_index,
							  HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
							  input_vtl_zero);
			vp->vp_intercept_msg_page = NULL;

			if (vp->vp_ghcb_page) {
				(void)hv_call_unmap_vp_state_page(partition->pt_id,
								  vp->vp_index,
								  HV_VP_STATE_PAGE_GHCB,
								  input_vtl_normal);
				vp->vp_ghcb_page = NULL;
			}

			kfree(vp);

			partition->pt_vp_array[i] = NULL;
		}

		/* Deallocates and unmaps everything including vcpus, GPA mappings etc */
		hv_call_finalize_partition(partition->pt_id);

		partition->pt_initialized = false;
	}

	remove_partition(partition);

	/* Remove regions, regain access to the memory and unpin the pages */
	hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
				  hnode) {
		hlist_del(&region->hnode);

		if (mshv_partition_encrypted(partition)) {
			ret = mshv_partition_region_share(region);
			if (ret) {
				pt_err(partition,
				       "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n",
				       ret);
				return;
			}
		}

		mshv_region_evict(region);

		vfree(region);
	}

	/* Withdraw and free all pages we deposited */
	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
	hv_call_delete_partition(partition->pt_id);

	mshv_free_routing_table(partition);
	kfree(partition);
}

struct
mshv_partition *mshv_partition_get(struct mshv_partition *partition)
{
	if (refcount_inc_not_zero(&partition->pt_ref_count))
		return partition;
	return NULL;
}

struct
mshv_partition *mshv_partition_find(u64 partition_id)
	__must_hold(RCU)
{
	struct mshv_partition *p;

	hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode,
				   partition_id)
		if (p->pt_id == partition_id)
			return p;

	return NULL;
}

void
mshv_partition_put(struct mshv_partition *partition)
{
	if (refcount_dec_and_test(&partition->pt_ref_count))
		destroy_partition(partition);
}

static int
mshv_partition_release(struct inode *inode, struct file *filp)
{
	struct mshv_partition *partition = filp->private_data;

	mshv_eventfd_release(partition);

	cleanup_srcu_struct(&partition->pt_irq_srcu);

	mshv_partition_put(partition);

	return 0;
}

static int
add_partition(struct mshv_partition *partition)
{
	spin_lock(&mshv_root.pt_ht_lock);

	hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode,
		     partition->pt_id);

	spin_unlock(&mshv_root.pt_ht_lock);

	return 0;
}

static long
mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
{
	struct mshv_create_partition args;
	u64 creation_flags;
	struct hv_partition_creation_properties creation_properties = {};
	union hv_partition_isolation_properties isolation_properties = {};
	struct mshv_partition *partition;
	struct file *file;
	int fd;
	long ret;

	if (copy_from_user(&args, user_arg, sizeof(args)))
		return -EFAULT;

	if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
	    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
		return -EINVAL;

	/* Only support EXO partitions */
	creation_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
			 HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;

	if (args.pt_flags & BIT(MSHV_PT_BIT_LAPIC))
		creation_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
	if (args.pt_flags & BIT(MSHV_PT_BIT_X2APIC))
		creation_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
	if (args.pt_flags & BIT(MSHV_PT_BIT_GPA_SUPER_PAGES))
		creation_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;

	switch (args.pt_isolation) {
	case MSHV_PT_ISOLATION_NONE:
		isolation_properties.isolation_type =
			HV_PARTITION_ISOLATION_TYPE_NONE;
		break;
	}

	partition = kzalloc(sizeof(*partition), GFP_KERNEL);
	if (!partition)
		return -ENOMEM;

	partition->pt_module_dev = module_dev;
	partition->isolation_type = isolation_properties.isolation_type;

	refcount_set(&partition->pt_ref_count, 1);

	mutex_init(&partition->pt_mutex);

	mutex_init(&partition->pt_irq_lock);

	init_completion(&partition->async_hypercall);

	INIT_HLIST_HEAD(&partition->irq_ack_notifier_list);

	INIT_HLIST_HEAD(&partition->pt_devices);

	INIT_HLIST_HEAD(&partition->pt_mem_regions);

	mshv_eventfd_init(partition);

	ret = init_srcu_struct(&partition->pt_irq_srcu);
	if (ret)
		goto free_partition;

	ret = hv_call_create_partition(creation_flags,
				       creation_properties,
				       isolation_properties,
				       &partition->pt_id);
	if (ret)
		goto cleanup_irq_srcu;

	ret = add_partition(partition);
	if (ret)
		goto delete_partition;

	ret = mshv_init_async_handler(partition);
	if (ret)
		goto remove_partition;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto remove_partition;
	}

	file = anon_inode_getfile("mshv_partition", &mshv_partition_fops,
				  partition, O_RDWR);
	if (IS_ERR(file)) {
		ret = PTR_ERR(file);
		goto put_fd;
	}

	fd_install(fd, file);

	return fd;

put_fd:
	put_unused_fd(fd);
remove_partition:
	remove_partition(partition);
delete_partition:
	hv_call_delete_partition(partition->pt_id);
cleanup_irq_srcu:
	cleanup_srcu_struct(&partition->pt_irq_srcu);
free_partition:
	kfree(partition);

	return ret;
}

static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
			   unsigned long arg)
{
	struct miscdevice *misc = filp->private_data;

	switch (ioctl) {
	case MSHV_CREATE_PARTITION:
		return mshv_ioctl_create_partition((void __user *)arg,
						   misc->this_device);
	}

	return -ENOTTY;
}

static int
mshv_dev_open(struct inode *inode, struct file *filp)
{
	return 0;
}

static int
mshv_dev_release(struct inode *inode, struct file *filp)
{
	return 0;
}

static int mshv_cpuhp_online;
static int mshv_root_sched_online;

static const char *scheduler_type_to_string(enum hv_scheduler_type type)
{
	switch (type) {
	case HV_SCHEDULER_TYPE_LP:
		return "classic scheduler without SMT";
	case HV_SCHEDULER_TYPE_LP_SMT:
		return "classic scheduler with SMT";
	case HV_SCHEDULER_TYPE_CORE_SMT:
		return "core scheduler";
	case HV_SCHEDULER_TYPE_ROOT:
		return "root scheduler";
	default:
		return "unknown scheduler";
	}
}

/* TODO move this to hv_common.c when needed outside */
static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out)
{
	struct hv_input_get_system_property *input;
	struct hv_output_get_system_property *output;
	unsigned long flags;
	u64 status;

	local_irq_save(flags);
	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	output = *this_cpu_ptr(hyperv_pcpu_output_arg);

	memset(input, 0, sizeof(*input));
	memset(output, 0, sizeof(*output));
	input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE;

	status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
	if (!hv_result_success(status)) {
		local_irq_restore(flags);
		pr_err("%s: %s\n", __func__, hv_result_to_string(status));
		return hv_result_to_errno(status);
	}

	*out = output->scheduler_type;
	local_irq_restore(flags);

	return 0;
}

/* Retrieve and stash the supported scheduler type */
static int __init mshv_retrieve_scheduler_type(struct device *dev)
{
	int ret = 0;

	if (hv_l1vh_partition())
		hv_scheduler_type = HV_SCHEDULER_TYPE_CORE_SMT;
	else
		ret = hv_retrieve_scheduler_type(&hv_scheduler_type);

	if (ret)
		return ret;

	dev_info(dev, "Hypervisor using %s\n",
		 scheduler_type_to_string(hv_scheduler_type));

	switch (hv_scheduler_type) {
	case HV_SCHEDULER_TYPE_CORE_SMT:
	case HV_SCHEDULER_TYPE_LP_SMT:
	case HV_SCHEDULER_TYPE_ROOT:
	case HV_SCHEDULER_TYPE_LP:
		/* Supported scheduler, nothing to do */
		break;
	default:
		dev_err(dev, "unsupported scheduler 0x%x, bailing.\n",
			hv_scheduler_type);
		return -EOPNOTSUPP;
	}

	return 0;
}

static int mshv_root_scheduler_init(unsigned int cpu)
{
	void **inputarg, **outputarg, *p;

	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
	outputarg = (void **)this_cpu_ptr(root_scheduler_output);

	/* Allocate two consecutive pages. One for input, one for output. */
static int mshv_root_scheduler_init(unsigned int cpu)
{
	void **inputarg, **outputarg, *p;

	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
	outputarg = (void **)this_cpu_ptr(root_scheduler_output);

	/*
	 * Allocate two consecutive pages. One for input, one for output.
	 */
	p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	*inputarg = p;
	*outputarg = (char *)p + HV_HYP_PAGE_SIZE;

	return 0;
}

static int mshv_root_scheduler_cleanup(unsigned int cpu)
{
	void *p, **inputarg, **outputarg;

	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
	outputarg = (void **)this_cpu_ptr(root_scheduler_output);

	p = *inputarg;

	*inputarg = NULL;
	*outputarg = NULL;

	kfree(p);

	return 0;
}

/* Must be called after retrieving the scheduler type */
static int
root_scheduler_init(struct device *dev)
{
	int ret;

	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
		return 0;

	root_scheduler_input = alloc_percpu(void *);
	root_scheduler_output = alloc_percpu(void *);

	if (!root_scheduler_input || !root_scheduler_output) {
		dev_err(dev, "Failed to allocate root scheduler buffers\n");
		ret = -ENOMEM;
		goto out;
	}

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched",
				mshv_root_scheduler_init,
				mshv_root_scheduler_cleanup);

	if (ret < 0) {
		dev_err(dev, "Failed to setup root scheduler state: %i\n", ret);
		goto out;
	}

	mshv_root_sched_online = ret;

	return 0;

out:
	free_percpu(root_scheduler_input);
	free_percpu(root_scheduler_output);
	return ret;
}

static void
root_scheduler_deinit(void)
{
	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
		return;

	cpuhp_remove_state(mshv_root_sched_online);
	free_percpu(root_scheduler_input);
	free_percpu(root_scheduler_output);
}

static int mshv_reboot_notify(struct notifier_block *nb,
			      unsigned long code, void *unused)
{
	cpuhp_remove_state(mshv_cpuhp_online);
	return 0;
}

struct notifier_block mshv_reboot_nb = {
	.notifier_call = mshv_reboot_notify,
};

static void mshv_root_partition_exit(void)
{
	unregister_reboot_notifier(&mshv_reboot_nb);
	root_scheduler_deinit();
}

static int __init mshv_root_partition_init(struct device *dev)
{
	int err;

	err = root_scheduler_init(dev);
	if (err)
		return err;

	err = register_reboot_notifier(&mshv_reboot_nb);
	if (err)
		goto root_sched_deinit;

	return 0;

root_sched_deinit:
	root_scheduler_deinit();
	return err;
}
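
/*
 * Bring-up for /dev/mshv: register the misc device, then set up, in order,
 * the pieces it depends on: per-CPU SynIC pages plus their CPU hotplug
 * callbacks, the scheduler type query, root-partition-only state (root
 * scheduler buffers and the reboot notifier), the irqfd workqueue, the
 * partition hash table, and finally the mshv interrupt handler.  The error
 * labels unwind the steps that already succeeded, in reverse order.
 */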
static int __init mshv_parent_partition_init(void)
{
	int ret;
	struct device *dev;
	union hv_hypervisor_version_info version_info;

	if (!hv_parent_partition() || is_kdump_kernel())
		return -ENODEV;

	if (hv_get_hypervisor_version(&version_info))
		return -ENODEV;

	ret = misc_register(&mshv_dev);
	if (ret)
		return ret;

	dev = mshv_dev.this_device;

	if (version_info.build_number < MSHV_HV_MIN_VERSION ||
	    version_info.build_number > MSHV_HV_MAX_VERSION) {
		dev_err(dev, "Running on unvalidated Hyper-V version\n");
		dev_err(dev, "Versions: current: %u min: %u max: %u\n",
			version_info.build_number, MSHV_HV_MIN_VERSION,
			MSHV_HV_MAX_VERSION);
	}

	mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages);
	if (!mshv_root.synic_pages) {
		dev_err(dev, "Failed to allocate percpu synic page\n");
		ret = -ENOMEM;
		goto device_deregister;
	}

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
				mshv_synic_init,
				mshv_synic_cleanup);
	if (ret < 0) {
		dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret);
		goto free_synic_pages;
	}

	mshv_cpuhp_online = ret;

	ret = mshv_retrieve_scheduler_type(dev);
	if (ret)
		goto remove_cpu_state;

	if (hv_root_partition())
		ret = mshv_root_partition_init(dev);
	if (ret)
		goto remove_cpu_state;

	ret = mshv_irqfd_wq_init();
	if (ret)
		goto exit_partition;

	spin_lock_init(&mshv_root.pt_ht_lock);
	hash_init(mshv_root.pt_htable);

	hv_setup_mshv_handler(mshv_isr);

	return 0;

exit_partition:
	if (hv_root_partition())
		mshv_root_partition_exit();
remove_cpu_state:
	cpuhp_remove_state(mshv_cpuhp_online);
free_synic_pages:
	free_percpu(mshv_root.synic_pages);
device_deregister:
	misc_deregister(&mshv_dev);
	return ret;
}

static void __exit mshv_parent_partition_exit(void)
{
	hv_setup_mshv_handler(NULL);
	mshv_port_table_fini();
	misc_deregister(&mshv_dev);
	mshv_irqfd_wq_cleanup();
	if (hv_root_partition())
		mshv_root_partition_exit();
	cpuhp_remove_state(mshv_cpuhp_online);
	free_percpu(mshv_root.synic_pages);
}

module_init(mshv_parent_partition_init);
module_exit(mshv_parent_partition_exit);