1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (c) 2024, Microsoft Corporation.
4 *
5 * The main part of the mshv_root module, providing APIs to create
6 * and manage guest partitions.
7 *
8 * Authors: Microsoft Linux virtualization team
9 */
10
11 #include <linux/entry-virt.h>
12 #include <linux/kernel.h>
13 #include <linux/module.h>
14 #include <linux/fs.h>
15 #include <linux/miscdevice.h>
16 #include <linux/slab.h>
17 #include <linux/file.h>
18 #include <linux/anon_inodes.h>
19 #include <linux/mm.h>
20 #include <linux/io.h>
21 #include <linux/cpuhotplug.h>
22 #include <linux/random.h>
23 #include <asm/mshyperv.h>
24 #include <linux/hyperv.h>
25 #include <linux/notifier.h>
26 #include <linux/reboot.h>
27 #include <linux/kexec.h>
28 #include <linux/page-flags.h>
29 #include <linux/crash_dump.h>
30 #include <linux/panic_notifier.h>
31 #include <linux/vmalloc.h>
32 #include <linux/rseq.h>
33
34 #include "mshv_eventfd.h"
35 #include "mshv.h"
36 #include "mshv_root.h"
37
38 MODULE_AUTHOR("Microsoft");
39 MODULE_LICENSE("GPL");
40 MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");
41
42 /* TODO move this to another file when debugfs code is added */
43 enum hv_stats_vp_counters { /* HV_THREAD_COUNTER */
44 #if defined(CONFIG_X86)
45 VpRootDispatchThreadBlocked = 202,
46 #elif defined(CONFIG_ARM64)
47 VpRootDispatchThreadBlocked = 94,
48 #endif
49 VpStatsMaxCounter
50 };
51
52 struct hv_stats_page {
53 union {
54 u64 vp_cntrs[VpStatsMaxCounter]; /* VP counters */
55 u8 data[HV_HYP_PAGE_SIZE];
56 };
57 } __packed;
58
59 struct mshv_root mshv_root;
60
61 enum hv_scheduler_type hv_scheduler_type;
62
63 /* These can go away once the fast extended hypercall ABI is implemented. */
64 static void * __percpu *root_scheduler_input;
65 static void * __percpu *root_scheduler_output;
66
67 static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
68 static int mshv_dev_open(struct inode *inode, struct file *filp);
69 static int mshv_dev_release(struct inode *inode, struct file *filp);
70 static int mshv_vp_release(struct inode *inode, struct file *filp);
71 static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
72 static int mshv_partition_release(struct inode *inode, struct file *filp);
73 static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
74 static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma);
75 static vm_fault_t mshv_vp_fault(struct vm_fault *vmf);
76 static int mshv_init_async_handler(struct mshv_partition *partition);
77 static void mshv_async_hvcall_handler(void *data, u64 *status);
78
79 static const union hv_input_vtl input_vtl_zero;
80 static const union hv_input_vtl input_vtl_normal = {
81 .target_vtl = HV_NORMAL_VTL,
82 .use_target_vtl = 1,
83 };
84
85 static const struct vm_operations_struct mshv_vp_vm_ops = {
86 .fault = mshv_vp_fault,
87 };
88
89 static const struct file_operations mshv_vp_fops = {
90 .owner = THIS_MODULE,
91 .release = mshv_vp_release,
92 .unlocked_ioctl = mshv_vp_ioctl,
93 .llseek = noop_llseek,
94 .mmap = mshv_vp_mmap,
95 };
96
97 static const struct file_operations mshv_partition_fops = {
98 .owner = THIS_MODULE,
99 .release = mshv_partition_release,
100 .unlocked_ioctl = mshv_partition_ioctl,
101 .llseek = noop_llseek,
102 };
103
104 static const struct file_operations mshv_dev_fops = {
105 .owner = THIS_MODULE,
106 .open = mshv_dev_open,
107 .release = mshv_dev_release,
108 .unlocked_ioctl = mshv_dev_ioctl,
109 .llseek = noop_llseek,
110 };
111
112 static struct miscdevice mshv_dev = {
113 .minor = MISC_DYNAMIC_MINOR,
114 .name = "mshv",
115 .fops = &mshv_dev_fops,
116 .mode = 0600,
117 };
118
119 /*
120 * Only allow hypercalls that have a u64 partition id as the first member of
121 * the input structure.
122 * These are sorted by value.
123 */
124 static u16 mshv_passthru_hvcalls[] = {
125 HVCALL_GET_PARTITION_PROPERTY,
126 HVCALL_GET_PARTITION_PROPERTY_EX,
127 HVCALL_SET_PARTITION_PROPERTY,
128 HVCALL_INSTALL_INTERCEPT,
129 HVCALL_GET_VP_REGISTERS,
130 HVCALL_SET_VP_REGISTERS,
131 HVCALL_TRANSLATE_VIRTUAL_ADDRESS,
132 HVCALL_CLEAR_VIRTUAL_INTERRUPT,
133 HVCALL_REGISTER_INTERCEPT_RESULT,
134 HVCALL_ASSERT_VIRTUAL_INTERRUPT,
135 HVCALL_GET_GPA_PAGES_ACCESS_STATES,
136 HVCALL_SIGNAL_EVENT_DIRECT,
137 HVCALL_POST_MESSAGE_DIRECT,
138 HVCALL_GET_VP_CPUID_VALUES,
139 };
140
141 /*
142 * Only allow hypercalls that are safe to be called by the VMM with the host
143 * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a
144 * hypercall cannot be misused by the VMM before adding it to this list.
145 */
146 static u16 mshv_self_passthru_hvcalls[] = {
147 HVCALL_GET_PARTITION_PROPERTY,
148 HVCALL_GET_PARTITION_PROPERTY_EX,
149 };
150
151 static bool mshv_hvcall_is_async(u16 code)
152 {
153 switch (code) {
154 case HVCALL_SET_PARTITION_PROPERTY:
155 return true;
156 default:
157 break;
158 }
159 return false;
160 }
161
162 static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id)
163 {
164 int i;
165 int n = ARRAY_SIZE(mshv_passthru_hvcalls);
166 u16 *allowed_hvcalls = mshv_passthru_hvcalls;
167
168 if (pt_id == HV_PARTITION_ID_SELF) {
169 n = ARRAY_SIZE(mshv_self_passthru_hvcalls);
170 allowed_hvcalls = mshv_self_passthru_hvcalls;
171 }
172
173 for (i = 0; i < n; ++i)
174 if (allowed_hvcalls[i] == code)
175 return true;
176
177 return false;
178 }
179
180 static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
181 bool partition_locked,
182 void __user *user_args)
183 {
184 u64 status;
185 int ret = 0;
186 bool is_async;
187 struct mshv_root_hvcall args;
188 struct page *page;
189 unsigned int pages_order;
190 void *input_pg = NULL;
191 void *output_pg = NULL;
192 u16 reps_completed;
193 u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF;
194
195 if (copy_from_user(&args, user_args, sizeof(args)))
196 return -EFAULT;
197
198 if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) ||
199 mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE)
200 return -EINVAL;
201
202 if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
203 return -EINVAL;
204
205 if (!mshv_passthru_hvcall_allowed(args.code, pt_id))
206 return -EINVAL;
207
208 is_async = mshv_hvcall_is_async(args.code);
209 if (is_async) {
210 /* async hypercalls can only be called from partition fd */
211 if (!partition || !partition_locked)
212 return -EINVAL;
213 ret = mshv_init_async_handler(partition);
214 if (ret)
215 return ret;
216 }
217
218 pages_order = args.out_ptr ? 1 : 0;
219 page = alloc_pages(GFP_KERNEL, pages_order);
220 if (!page)
221 return -ENOMEM;
222 input_pg = page_address(page);
223
224 if (args.out_ptr)
225 output_pg = (char *)input_pg + PAGE_SIZE;
226 else
227 output_pg = NULL;
228
229 if (copy_from_user(input_pg, (void __user *)args.in_ptr,
230 args.in_sz)) {
231 ret = -EFAULT;
232 goto free_pages_out;
233 }
234
235 /*
236 * NOTE: This only works because all the allowed hypercalls' input
237 * structs begin with a u64 partition_id field.
238 */
239 *(u64 *)input_pg = pt_id;
240
241 reps_completed = 0;
242 do {
243 if (args.reps) {
244 status = hv_do_rep_hypercall_ex(args.code, args.reps,
245 0, reps_completed,
246 input_pg, output_pg);
247 reps_completed = hv_repcomp(status);
248 } else {
249 status = hv_do_hypercall(args.code, input_pg, output_pg);
250 }
251
252 if (hv_result(status) == HV_STATUS_CALL_PENDING) {
253 if (is_async) {
254 mshv_async_hvcall_handler(partition, &status);
255 } else { /* Paranoia check. This shouldn't happen! */
256 ret = -EBADFD;
257 goto free_pages_out;
258 }
259 }
260
261 if (hv_result_success(status))
262 break;
263
264 if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY)
265 ret = hv_result_to_errno(status);
266 else
267 ret = hv_call_deposit_pages(NUMA_NO_NODE,
268 pt_id, 1);
269 } while (!ret);
270
271 args.status = hv_result(status);
272 args.reps = reps_completed;
273 if (copy_to_user(user_args, &args, sizeof(args)))
274 ret = -EFAULT;
275
276 if (!ret && output_pg &&
277 copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
278 ret = -EFAULT;
279
280 free_pages_out:
281 free_pages((unsigned long)input_pg, pages_order);
282
283 return ret;
284 }
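
/*
 * Illustrative only: a minimal sketch of how a VMM might drive the
 * MSHV_ROOT_HVCALL passthrough path from user space. It assumes the
 * struct mshv_root_hvcall definition from the mshv uapi header
 * (linux/mshv.h); the field names below are the ones consumed by
 * mshv_ioctl_passthru_hvcall() above, and the input/output buffer
 * layouts are whatever the TLFS defines for the chosen call code.
 * The kernel overwrites the leading u64 of the input buffer with the
 * partition id, so the caller does not have to fill it in.
 *
 *	struct mshv_root_hvcall call = {
 *		.code	= HVCALL_GET_VP_REGISTERS,	// must be allow-listed above
 *		.reps	= 1,				// rep count; 0 for non-rep calls
 *		.in_sz	= sizeof(input),		// >= sizeof(u64), <= HV_HYP_PAGE_SIZE
 *		.in_ptr	= (__u64)&input,
 *		.out_sz	= sizeof(output),		// optional output buffer
 *		.out_ptr = (__u64)&output,
 *	};
 *
 *	if (ioctl(partition_fd, MSHV_ROOT_HVCALL, &call) < 0)
 *		err(1, "MSHV_ROOT_HVCALL");
 *	// On return, call.status holds the raw HV_STATUS_* value and
 *	// call.reps the number of completed repetitions.
 */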
285
286 static inline bool is_ghcb_mapping_available(void)
287 {
288 #if IS_ENABLED(CONFIG_X86_64)
289 return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE;
290 #else
291 return 0;
292 #endif
293 }
294
295 static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
296 struct hv_register_assoc *registers)
297 {
298 return hv_call_get_vp_registers(vp_index, partition_id,
299 count, input_vtl_zero, registers);
300 }
301
302 static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
303 struct hv_register_assoc *registers)
304 {
305 return hv_call_set_vp_registers(vp_index, partition_id,
306 count, input_vtl_zero, registers);
307 }
308
309 /*
310 * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by
311 * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend,
312 * done by the hypervisor.
313 * "Intercept" suspend leads to asynchronous message delivery to dom0 which
314 * should be awaited to keep the VP loop consistent (i.e. no message pending
315 * upon VP resume).
316 * VP intercept suspend can't be done when the VP is already explicitly
317 * suspended, and thus there can be only two possible race scenarios:
318 * 1. implicit suspend bit set -> explicit suspend bit set -> message sent
319 * 2. implicit suspend bit set -> message sent -> explicit suspend bit set
320 * Checking for the implicit suspend bit after the explicit suspend request
321 * has succeeded allows us, in either case, to reliably identify whether
322 * there is a message to receive and deliver to the VMM.
323 */
324 static int
325 mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
326 {
327 struct hv_register_assoc explicit_suspend = {
328 .name = HV_REGISTER_EXPLICIT_SUSPEND
329 };
330 struct hv_register_assoc intercept_suspend = {
331 .name = HV_REGISTER_INTERCEPT_SUSPEND
332 };
333 union hv_explicit_suspend_register *es =
334 &explicit_suspend.value.explicit_suspend;
335 union hv_intercept_suspend_register *is =
336 &intercept_suspend.value.intercept_suspend;
337 int ret;
338
339 es->suspended = 1;
340
341 ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
342 1, &explicit_suspend);
343 if (ret) {
344 vp_err(vp, "Failed to explicitly suspend vCPU\n");
345 return ret;
346 }
347
348 ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
349 1, &intercept_suspend);
350 if (ret) {
351 vp_err(vp, "Failed to get intercept suspend state\n");
352 return ret;
353 }
354
355 *message_in_flight = is->suspended;
356
357 return 0;
358 }
359
360 /*
361 * This function is used when VPs are scheduled by the hypervisor's
362 * scheduler.
363 *
364 * The caller has to make sure the registers array contains cleared
365 * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers
366 * in exactly this order (the hypervisor clears them sequentially). With the
367 * opposite order, a newly arrived HV_REGISTER_INTERCEPT_SUSPEND could be
368 * cleared incorrectly after the VP has already been released from
369 * HV_REGISTER_EXPLICIT_SUSPEND.
370 */
371 static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
372 {
373 long ret;
374 struct hv_register_assoc suspend_regs[2] = {
375 { .name = HV_REGISTER_INTERCEPT_SUSPEND },
376 { .name = HV_REGISTER_EXPLICIT_SUSPEND }
377 };
378 size_t count = ARRAY_SIZE(suspend_regs);
379
380 /* Resume VP execution */
381 ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
382 count, suspend_regs);
383 if (ret) {
384 vp_err(vp, "Failed to resume vp execution. %lx\n", ret);
385 return ret;
386 }
387
388 ret = wait_event_interruptible(vp->run.vp_suspend_queue,
389 vp->run.kicked_by_hv == 1);
390 if (ret) {
391 bool message_in_flight;
392
393 /*
394 * The wait was interrupted by a signal: suspend the vCPU
395 * explicitly and copy the message in flight (if any).
396 */
397 ret = mshv_suspend_vp(vp, &message_in_flight);
398 if (ret)
399 return ret;
400
401 /* Return if no message in flight */
402 if (!message_in_flight)
403 return -EINTR;
404
405 /* Wait for the message in flight. */
406 wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1);
407 }
408
409 /*
410 * Reset the flag to make the wait_event call above work
411 * next time.
412 */
413 vp->run.kicked_by_hv = 0;
414
415 return 0;
416 }
417
418 static int
419 mshv_vp_dispatch(struct mshv_vp *vp, u32 flags,
420 struct hv_output_dispatch_vp *res)
421 {
422 struct hv_input_dispatch_vp *input;
423 struct hv_output_dispatch_vp *output;
424 u64 status;
425
426 preempt_disable();
427 input = *this_cpu_ptr(root_scheduler_input);
428 output = *this_cpu_ptr(root_scheduler_output);
429
430 memset(input, 0, sizeof(*input));
431 memset(output, 0, sizeof(*output));
432
433 input->partition_id = vp->vp_partition->pt_id;
434 input->vp_index = vp->vp_index;
435 input->time_slice = 0; /* Run forever until something happens */
436 input->spec_ctrl = 0; /* TODO: set sensible flags */
437 input->flags = flags;
438
439 vp->run.flags.root_sched_dispatched = 1;
440 status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output);
441 vp->run.flags.root_sched_dispatched = 0;
442
443 *res = *output;
444 preempt_enable();
445
446 if (!hv_result_success(status))
447 vp_err(vp, "%s: status %s\n", __func__,
448 hv_result_to_string(status));
449
450 return hv_result_to_errno(status);
451 }
452
453 static int
454 mshv_vp_clear_explicit_suspend(struct mshv_vp *vp)
455 {
456 struct hv_register_assoc explicit_suspend = {
457 .name = HV_REGISTER_EXPLICIT_SUSPEND,
458 .value.explicit_suspend.suspended = 0,
459 };
460 int ret;
461
462 ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
463 1, &explicit_suspend);
464
465 if (ret)
466 vp_err(vp, "Failed to unsuspend\n");
467
468 return ret;
469 }
470
471 #if IS_ENABLED(CONFIG_X86_64)
472 static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
473 {
474 if (!vp->vp_register_page)
475 return 0;
476 return vp->vp_register_page->interrupt_vectors.as_uint64;
477 }
478 #else
479 static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
480 {
481 return 0;
482 }
483 #endif
484
485 static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp)
486 {
487 struct hv_stats_page **stats = vp->vp_stats_pages;
488 u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->vp_cntrs;
489 u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->vp_cntrs;
490
491 if (self_vp_cntrs[VpRootDispatchThreadBlocked])
492 return self_vp_cntrs[VpRootDispatchThreadBlocked];
493 return parent_vp_cntrs[VpRootDispatchThreadBlocked];
494 }
495
496 static int
497 mshv_vp_wait_for_hv_kick(struct mshv_vp *vp)
498 {
499 int ret;
500
501 ret = wait_event_interruptible(vp->run.vp_suspend_queue,
502 (vp->run.kicked_by_hv == 1 &&
503 !mshv_vp_dispatch_thread_blocked(vp)) ||
504 mshv_vp_interrupt_pending(vp));
505 if (ret)
506 return -EINTR;
507
508 vp->run.flags.root_sched_blocked = 0;
509 vp->run.kicked_by_hv = 0;
510
511 return 0;
512 }
513
514 /* Must be called with interrupts enabled */
515 static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
516 {
517 long ret;
518
519 if (vp->run.flags.root_sched_blocked) {
520 /*
521 * Dispatch state of this VP is blocked. Need to wait
522 * for the hypervisor to clear the blocked state before
523 * dispatching it.
524 */
525 ret = mshv_vp_wait_for_hv_kick(vp);
526 if (ret)
527 return ret;
528 }
529
530 do {
531 u32 flags = 0;
532 struct hv_output_dispatch_vp output;
533
534 if (__xfer_to_guest_mode_work_pending()) {
535 ret = xfer_to_guest_mode_handle_work();
536 if (ret)
537 break;
538 }
539
540 if (vp->run.flags.intercept_suspend)
541 flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND;
542
543 if (mshv_vp_interrupt_pending(vp))
544 flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION;
545
546 ret = mshv_vp_dispatch(vp, flags, &output);
547 if (ret)
548 break;
549
550 vp->run.flags.intercept_suspend = 0;
551
552 if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) {
553 if (output.dispatch_event ==
554 HV_VP_DISPATCH_EVENT_SUSPEND) {
555 /*
556 * TODO: remove the warning once VP canceling
557 * is supported
558 */
559 WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count),
560 "%s: vp#%d: unexpected explicit suspend\n",
561 __func__, vp->vp_index);
562 /*
563 * Need to clear explicit suspend before
564 * dispatching.
565 * Explicit suspend is either:
566 * - set right after the first VP dispatch or
567 * - set explicitly via hypercall
568 * Since the latter case is not yet supported,
569 * simply clear it here.
570 */
571 ret = mshv_vp_clear_explicit_suspend(vp);
572 if (ret)
573 break;
574
575 ret = mshv_vp_wait_for_hv_kick(vp);
576 if (ret)
577 break;
578 } else {
579 vp->run.flags.root_sched_blocked = 1;
580 ret = mshv_vp_wait_for_hv_kick(vp);
581 if (ret)
582 break;
583 }
584 } else {
585 /* HV_VP_DISPATCH_STATE_READY */
586 if (output.dispatch_event ==
587 HV_VP_DISPATCH_EVENT_INTERCEPT)
588 vp->run.flags.intercept_suspend = 1;
589 }
590 } while (!vp->run.flags.intercept_suspend);
591
592 rseq_virt_userspace_exit();
593
594 return ret;
595 }
596
597 static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
598 "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");
599
600 static struct mshv_mem_region *
601 mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
602 {
603 struct mshv_mem_region *region;
604
605 hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
606 if (gfn >= region->start_gfn &&
607 gfn < region->start_gfn + region->nr_pages)
608 return region;
609 }
610
611 return NULL;
612 }
613
614 static struct mshv_mem_region *
615 mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
616 {
617 struct mshv_mem_region *region;
618
619 spin_lock(&p->pt_mem_regions_lock);
620 region = mshv_partition_region_by_gfn(p, gfn);
621 if (!region || !mshv_region_get(region)) {
622 spin_unlock(&p->pt_mem_regions_lock);
623 return NULL;
624 }
625 spin_unlock(&p->pt_mem_regions_lock);
626
627 return region;
628 }
629
630 /**
631 * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts.
632 * @vp: Pointer to the virtual processor structure.
633 *
634 * This function processes GPA intercepts by identifying the memory region
635 * corresponding to the intercepted GPA, aligning the page offset, and
636 * mapping the required pages. It ensures that the region is valid and
637 * handles faults efficiently by mapping multiple pages at once.
638 *
639 * Return: true if the intercept was handled successfully, false otherwise.
640 */
641 static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
642 {
643 struct mshv_partition *p = vp->vp_partition;
644 struct mshv_mem_region *region;
645 bool ret;
646 u64 gfn;
647 #if defined(CONFIG_X86_64)
648 struct hv_x64_memory_intercept_message *msg =
649 (struct hv_x64_memory_intercept_message *)
650 vp->vp_intercept_msg_page->u.payload;
651 #elif defined(CONFIG_ARM64)
652 struct hv_arm64_memory_intercept_message *msg =
653 (struct hv_arm64_memory_intercept_message *)
654 vp->vp_intercept_msg_page->u.payload;
655 #endif
656
657 gfn = HVPFN_DOWN(msg->guest_physical_address);
658
659 region = mshv_partition_region_by_gfn_get(p, gfn);
660 if (!region)
661 return false;
662
663 /* Only movable memory ranges are supported for GPA intercepts */
664 if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE)
665 ret = mshv_region_handle_gfn_fault(region, gfn);
666 else
667 ret = false;
668
669 mshv_region_put(region);
670
671 return ret;
672 }
673
674 static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
675 {
676 switch (vp->vp_intercept_msg_page->header.message_type) {
677 case HVMSG_GPA_INTERCEPT:
678 return mshv_handle_gpa_intercept(vp);
679 }
680 return false;
681 }
682
683 static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
684 {
685 long rc;
686
687 do {
688 if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
689 rc = mshv_run_vp_with_root_scheduler(vp);
690 else
691 rc = mshv_run_vp_with_hyp_scheduler(vp);
692 } while (rc == 0 && mshv_vp_handle_intercept(vp));
693
694 if (rc)
695 return rc;
696
697 if (copy_to_user(ret_msg, vp->vp_intercept_msg_page,
698 sizeof(struct hv_message)))
699 rc = -EFAULT;
700
701 return rc;
702 }
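
/*
 * Illustrative only: the intended user-space run loop for the ioctl above,
 * assuming the MSHV_RUN_VP definition from the mshv uapi header and a
 * hypothetical, VMM-specific handle_intercept() helper. The ioctl blocks
 * until the VP hits an intercept (or the caller is signalled) and then
 * copies the raw struct hv_message describing the intercept back to the
 * caller, who decodes it and re-enters the loop.
 *
 *	struct hv_message msg;
 *
 *	for (;;) {
 *		if (ioctl(vp_fd, MSHV_RUN_VP, &msg) < 0) {
 *			if (errno == EINTR)
 *				continue;	// interrupted by a signal
 *			err(1, "MSHV_RUN_VP");
 *		}
 *		handle_intercept(&msg);
 *	}
 */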
703
704 static int
705 mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp,
706 struct hv_vp_state_data state_data,
707 unsigned long user_pfn, size_t page_count,
708 bool is_set)
709 {
710 int completed, ret = 0;
711 unsigned long check;
712 struct page **pages;
713
714 if (page_count > INT_MAX)
715 return -EINVAL;
716 /*
717 * Check the arithmetic for wraparound/overflow.
718 * The last page address in the buffer is:
719 * (user_pfn + (page_count - 1)) * PAGE_SIZE
720 */
721 if (check_add_overflow(user_pfn, (page_count - 1), &check))
722 return -EOVERFLOW;
723 if (check_mul_overflow(check, PAGE_SIZE, &check))
724 return -EOVERFLOW;
725
726 /* Pin user pages so hypervisor can copy directly to them */
727 pages = kcalloc(page_count, sizeof(struct page *), GFP_KERNEL);
728 if (!pages)
729 return -ENOMEM;
730
731 for (completed = 0; completed < page_count; completed += ret) {
732 unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE;
733 int remaining = page_count - completed;
734
735 ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE,
736 &pages[completed]);
737 if (ret < 0) {
738 vp_err(vp, "%s: Failed to pin user pages error %i\n",
739 __func__, ret);
740 goto unpin_pages;
741 }
742 }
743
744 if (is_set)
745 ret = hv_call_set_vp_state(vp->vp_index,
746 vp->vp_partition->pt_id,
747 state_data, page_count, pages,
748 0, NULL);
749 else
750 ret = hv_call_get_vp_state(vp->vp_index,
751 vp->vp_partition->pt_id,
752 state_data, page_count, pages,
753 NULL);
754
755 unpin_pages:
756 unpin_user_pages(pages, completed);
757 kfree(pages);
758 return ret;
759 }
760
761 static long
762 mshv_vp_ioctl_get_set_state(struct mshv_vp *vp,
763 struct mshv_get_set_vp_state __user *user_args,
764 bool is_set)
765 {
766 struct mshv_get_set_vp_state args;
767 long ret = 0;
768 union hv_output_get_vp_state vp_state;
769 u32 data_sz;
770 struct hv_vp_state_data state_data = {};
771
772 if (copy_from_user(&args, user_args, sizeof(args)))
773 return -EFAULT;
774
775 if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) ||
776 !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) ||
777 !PAGE_ALIGNED(args.buf_ptr))
778 return -EINVAL;
779
780 if (!access_ok((void __user *)args.buf_ptr, args.buf_sz))
781 return -EFAULT;
782
783 switch (args.type) {
784 case MSHV_VP_STATE_LAPIC:
785 state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE;
786 data_sz = HV_HYP_PAGE_SIZE;
787 break;
788 case MSHV_VP_STATE_XSAVE:
789 {
790 u64 data_sz_64;
791
792 ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
793 HV_PARTITION_PROPERTY_XSAVE_STATES,
794 &state_data.xsave.states.as_uint64);
795 if (ret)
796 return ret;
797
798 ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
799 HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE,
800 &data_sz_64);
801 if (ret)
802 return ret;
803
804 data_sz = (u32)data_sz_64;
805 state_data.xsave.flags = 0;
806 /* Always request legacy states */
807 state_data.xsave.states.legacy_x87 = 1;
808 state_data.xsave.states.legacy_sse = 1;
809 state_data.type = HV_GET_SET_VP_STATE_XSAVE;
810 break;
811 }
812 case MSHV_VP_STATE_SIMP:
813 state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE;
814 data_sz = HV_HYP_PAGE_SIZE;
815 break;
816 case MSHV_VP_STATE_SIEFP:
817 state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE;
818 data_sz = HV_HYP_PAGE_SIZE;
819 break;
820 case MSHV_VP_STATE_SYNTHETIC_TIMERS:
821 state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS;
822 data_sz = sizeof(vp_state.synthetic_timers_state);
823 break;
824 default:
825 return -EINVAL;
826 }
827
828 if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz)))
829 return -EFAULT;
830
831 if (data_sz > args.buf_sz)
832 return -EINVAL;
833
834 /* If the data is transmitted via pfns, delegate to helper */
835 if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) {
836 unsigned long user_pfn = PFN_DOWN(args.buf_ptr);
837 size_t page_count = PFN_DOWN(args.buf_sz);
838
839 return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn,
840 page_count, is_set);
841 }
842
843 /* Paranoia check - this shouldn't happen! */
844 if (data_sz > sizeof(vp_state)) {
845 vp_err(vp, "Invalid vp state data size!\n");
846 return -EINVAL;
847 }
848
849 if (is_set) {
850 if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz))
851 return -EFAULT;
852
853 return hv_call_set_vp_state(vp->vp_index,
854 vp->vp_partition->pt_id,
855 state_data, 0, NULL,
856 sizeof(vp_state), (u8 *)&vp_state);
857 }
858
859 ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id,
860 state_data, 0, NULL, &vp_state);
861 if (ret)
862 return ret;
863
864 if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz))
865 return -EFAULT;
866
867 return 0;
868 }
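
/*
 * Illustrative only: how a VMM might fetch VP state through the ioctl
 * above, assuming struct mshv_get_set_vp_state and the MSHV_VP_STATE_*
 * constants from the mshv uapi header; page_size and aligned_buf are
 * placeholders. buf_ptr and buf_sz must be page aligned, and on return
 * the kernel updates buf_sz with the actual data size for the requested
 * state type.
 *
 *	struct mshv_get_set_vp_state state = {
 *		.type	 = MSHV_VP_STATE_LAPIC,
 *		.buf_sz	 = page_size,
 *		.buf_ptr = (__u64)aligned_buf,
 *	};
 *
 *	if (ioctl(vp_fd, MSHV_GET_VP_STATE, &state) < 0)
 *		err(1, "MSHV_GET_VP_STATE");
 *
 * MSHV_SET_VP_STATE takes the same structure with the buffer pre-filled
 * by the caller.
 */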
869
870 static long
871 mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
872 {
873 struct mshv_vp *vp = filp->private_data;
874 long r = -ENOTTY;
875
876 if (mutex_lock_killable(&vp->vp_mutex))
877 return -EINTR;
878
879 switch (ioctl) {
880 case MSHV_RUN_VP:
881 r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg);
882 break;
883 case MSHV_GET_VP_STATE:
884 r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false);
885 break;
886 case MSHV_SET_VP_STATE:
887 r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true);
888 break;
889 case MSHV_ROOT_HVCALL:
890 r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false,
891 (void __user *)arg);
892 break;
893 default:
894 vp_warn(vp, "Invalid ioctl: %#x\n", ioctl);
895 break;
896 }
897 mutex_unlock(&vp->vp_mutex);
898
899 return r;
900 }
901
902 static vm_fault_t mshv_vp_fault(struct vm_fault *vmf)
903 {
904 struct mshv_vp *vp = vmf->vma->vm_file->private_data;
905
906 switch (vmf->vma->vm_pgoff) {
907 case MSHV_VP_MMAP_OFFSET_REGISTERS:
908 vmf->page = virt_to_page(vp->vp_register_page);
909 break;
910 case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
911 vmf->page = virt_to_page(vp->vp_intercept_msg_page);
912 break;
913 case MSHV_VP_MMAP_OFFSET_GHCB:
914 vmf->page = virt_to_page(vp->vp_ghcb_page);
915 break;
916 default:
917 return VM_FAULT_SIGBUS;
918 }
919
920 get_page(vmf->page);
921
922 return 0;
923 }
924
925 static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma)
926 {
927 struct mshv_vp *vp = file->private_data;
928
929 switch (vma->vm_pgoff) {
930 case MSHV_VP_MMAP_OFFSET_REGISTERS:
931 if (!vp->vp_register_page)
932 return -ENODEV;
933 break;
934 case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
935 if (!vp->vp_intercept_msg_page)
936 return -ENODEV;
937 break;
938 case MSHV_VP_MMAP_OFFSET_GHCB:
939 if (!vp->vp_ghcb_page)
940 return -ENODEV;
941 break;
942 default:
943 return -EINVAL;
944 }
945
946 vma->vm_ops = &mshv_vp_vm_ops;
947 return 0;
948 }
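
/*
 * Illustrative only: mapping the shared VP pages from user space, assuming
 * the MSHV_VP_MMAP_OFFSET_* constants from the mshv uapi header; page_size
 * and the protection flags are placeholders. Each page is selected by the
 * page offset passed to mmap() on the VP fd, and mshv_vp_fault() above
 * returns the backing page.
 *
 *	void *regs = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
 *			  vp_fd, MSHV_VP_MMAP_OFFSET_REGISTERS * page_size);
 *	void *msg = mmap(NULL, page_size, PROT_READ, MAP_SHARED,
 *			 vp_fd, MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE * page_size);
 */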
949
950 static int
951 mshv_vp_release(struct inode *inode, struct file *filp)
952 {
953 struct mshv_vp *vp = filp->private_data;
954
955 /* Rest of VP cleanup happens in destroy_partition() */
956 mshv_partition_put(vp->vp_partition);
957 return 0;
958 }
959
960 static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
961 void *stats_pages[])
962 {
963 union hv_stats_object_identity identity = {
964 .vp.partition_id = partition_id,
965 .vp.vp_index = vp_index,
966 };
967
968 identity.vp.stats_area_type = HV_STATS_AREA_SELF;
969 hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
970
971 identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
972 hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
973 }
974
975 static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
976 void *stats_pages[])
977 {
978 union hv_stats_object_identity identity = {
979 .vp.partition_id = partition_id,
980 .vp.vp_index = vp_index,
981 };
982 int err;
983
984 identity.vp.stats_area_type = HV_STATS_AREA_SELF;
985 err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
986 &stats_pages[HV_STATS_AREA_SELF]);
987 if (err)
988 return err;
989
990 identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
991 err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
992 &stats_pages[HV_STATS_AREA_PARENT]);
993 if (err)
994 goto unmap_self;
995
996 return 0;
997
998 unmap_self:
999 identity.vp.stats_area_type = HV_STATS_AREA_SELF;
1000 hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
1001 return err;
1002 }
1003
1004 static long
1005 mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
1006 void __user *arg)
1007 {
1008 struct mshv_create_vp args;
1009 struct mshv_vp *vp;
1010 struct page *intercept_msg_page, *register_page, *ghcb_page;
1011 void *stats_pages[2];
1012 long ret;
1013
1014 if (copy_from_user(&args, arg, sizeof(args)))
1015 return -EFAULT;
1016
1017 if (args.vp_index >= MSHV_MAX_VPS)
1018 return -EINVAL;
1019
1020 if (partition->pt_vp_array[args.vp_index])
1021 return -EEXIST;
1022
1023 ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index,
1024 0 /* Only valid for root partition VPs */);
1025 if (ret)
1026 return ret;
1027
1028 ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
1029 HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
1030 input_vtl_zero, &intercept_msg_page);
1031 if (ret)
1032 goto destroy_vp;
1033
1034 if (!mshv_partition_encrypted(partition)) {
1035 ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
1036 HV_VP_STATE_PAGE_REGISTERS,
1037 input_vtl_zero, &register_page);
1038 if (ret)
1039 goto unmap_intercept_message_page;
1040 }
1041
1042 if (mshv_partition_encrypted(partition) &&
1043 is_ghcb_mapping_available()) {
1044 ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
1045 HV_VP_STATE_PAGE_GHCB,
1046 input_vtl_normal, &ghcb_page);
1047 if (ret)
1048 goto unmap_register_page;
1049 }
1050
1051 /*
1052 * This mapping of the stats page is for detecting if dispatch thread
1053 * is blocked - only relevant for root scheduler
1054 */
1055 if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) {
1056 ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
1057 stats_pages);
1058 if (ret)
1059 goto unmap_ghcb_page;
1060 }
1061
1062 vp = kzalloc(sizeof(*vp), GFP_KERNEL);
1063 if (!vp)
1064 goto unmap_stats_pages;
1065
1066 vp->vp_partition = mshv_partition_get(partition);
1067 if (!vp->vp_partition) {
1068 ret = -EBADF;
1069 goto free_vp;
1070 }
1071
1072 mutex_init(&vp->vp_mutex);
1073 init_waitqueue_head(&vp->run.vp_suspend_queue);
1074 atomic64_set(&vp->run.vp_signaled_count, 0);
1075
1076 vp->vp_index = args.vp_index;
1077 vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page);
1078 if (!mshv_partition_encrypted(partition))
1079 vp->vp_register_page = page_to_virt(register_page);
1080
1081 if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
1082 vp->vp_ghcb_page = page_to_virt(ghcb_page);
1083
1084 if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
1085 memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));
1086
1087 /*
1088 * Keep anon_inode_getfd last: it installs fd in the file struct and
1089 * thus makes the state accessible in user space.
1090 */
1091 ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp,
1092 O_RDWR | O_CLOEXEC);
1093 if (ret < 0)
1094 goto put_partition;
1095
1096 /* already exclusive with the partition mutex for all ioctls */
1097 partition->pt_vp_count++;
1098 partition->pt_vp_array[args.vp_index] = vp;
1099
1100 return ret;
1101
1102 put_partition:
1103 mshv_partition_put(partition);
1104 free_vp:
1105 kfree(vp);
1106 unmap_stats_pages:
1107 if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
1108 mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages);
1109 unmap_ghcb_page:
1110 if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
1111 hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
1112 HV_VP_STATE_PAGE_GHCB, ghcb_page,
1113 input_vtl_normal);
1114 unmap_register_page:
1115 if (!mshv_partition_encrypted(partition))
1116 hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
1117 HV_VP_STATE_PAGE_REGISTERS,
1118 register_page, input_vtl_zero);
1119 unmap_intercept_message_page:
1120 hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
1121 HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
1122 intercept_msg_page, input_vtl_zero);
1123 destroy_vp:
1124 hv_call_delete_vp(partition->pt_id, args.vp_index);
1125 return ret;
1126 }
1127
1128 static int mshv_init_async_handler(struct mshv_partition *partition)
1129 {
1130 if (completion_done(&partition->async_hypercall)) {
1131 pt_err(partition,
1132 "Cannot issue async hypercall while another one in progress!\n");
1133 return -EPERM;
1134 }
1135
1136 reinit_completion(&partition->async_hypercall);
1137 return 0;
1138 }
1139
1140 static void mshv_async_hvcall_handler(void *data, u64 *status)
1141 {
1142 struct mshv_partition *partition = data;
1143
1144 wait_for_completion(&partition->async_hypercall);
1145 pt_dbg(partition, "Async hypercall completed!\n");
1146
1147 *status = partition->async_hypercall_status;
1148 }
1149
1150 /*
1151 * NB: caller checks and makes sure mem->size is page aligned
1152 * Returns: 0 with regionpp updated on success, or -errno
1153 */
1154 static int mshv_partition_create_region(struct mshv_partition *partition,
1155 struct mshv_user_mem_region *mem,
1156 struct mshv_mem_region **regionpp,
1157 bool is_mmio)
1158 {
1159 struct mshv_mem_region *rg;
1160 u64 nr_pages = HVPFN_DOWN(mem->size);
1161
1162 /* Reject overlapping regions */
1163 spin_lock(&partition->pt_mem_regions_lock);
1164 hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) {
1165 if (mem->guest_pfn + nr_pages <= rg->start_gfn ||
1166 rg->start_gfn + rg->nr_pages <= mem->guest_pfn)
1167 continue;
1168 spin_unlock(&partition->pt_mem_regions_lock);
1169 return -EEXIST;
1170 }
1171 spin_unlock(&partition->pt_mem_regions_lock);
1172
1173 rg = mshv_region_create(mem->guest_pfn, nr_pages,
1174 mem->userspace_addr, mem->flags);
1175 if (IS_ERR(rg))
1176 return PTR_ERR(rg);
1177
1178 if (is_mmio)
1179 rg->type = MSHV_REGION_TYPE_MMIO;
1180 else if (mshv_partition_encrypted(partition) ||
1181 !mshv_region_movable_init(rg))
1182 rg->type = MSHV_REGION_TYPE_MEM_PINNED;
1183 else
1184 rg->type = MSHV_REGION_TYPE_MEM_MOVABLE;
1185
1186 rg->partition = partition;
1187
1188 *regionpp = rg;
1189
1190 return 0;
1191 }
1192
1193 /**
1194 * mshv_prepare_pinned_region - Pin and map memory regions
1195 * @region: Pointer to the memory region structure
1196 *
1197 * This function processes memory regions that are explicitly marked as pinned.
1198 * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based
1199 * population. The function ensures the region is properly populated, handles
1200 * encryption requirements for SNP partitions if applicable, maps the region,
1201 * and performs necessary sharing or eviction operations based on the mapping
1202 * result.
1203 *
1204 * Return: 0 on success, negative error code on failure.
1205 */
1206 static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
1207 {
1208 struct mshv_partition *partition = region->partition;
1209 int ret;
1210
1211 ret = mshv_region_pin(region);
1212 if (ret) {
1213 pt_err(partition, "Failed to pin memory region: %d\n",
1214 ret);
1215 goto err_out;
1216 }
1217
1218 /*
1219 * For an SNP partition, host access to every memory region that is
1220 * going to be mapped into the partition must be released first. This
1221 * is ensured by doing an additional hypercall which updates the SLAT
1222 * to release host access to the guest memory regions.
1224 */
1225 if (mshv_partition_encrypted(partition)) {
1226 ret = mshv_region_unshare(region);
1227 if (ret) {
1228 pt_err(partition,
1229 "Failed to unshare memory region (guest_pfn: %llu): %d\n",
1230 region->start_gfn, ret);
1231 goto invalidate_region;
1232 }
1233 }
1234
1235 ret = mshv_region_map(region);
1236 if (ret && mshv_partition_encrypted(partition)) {
1237 int shrc;
1238
1239 shrc = mshv_region_share(region);
1240 if (!shrc)
1241 goto invalidate_region;
1242
1243 pt_err(partition,
1244 "Failed to share memory region (guest_pfn: %llu): %d\n",
1245 region->start_gfn, shrc);
1246 /*
1247 * Don't unpin if marking the region shared failed, because the
1248 * pages are no longer mapped in the host, i.e. the root partition.
1249 */
1250 goto err_out;
1251 }
1252
1253 return 0;
1254
1255 invalidate_region:
1256 mshv_region_invalidate(region);
1257 err_out:
1258 return ret;
1259 }
1260
1261 /*
1262 * This maps two things: guest RAM, and MMIO space for PCI passthru.
1263 *
1264 * mmio:
1265 * - vfio overloads vm_pgoff to store the mmio start pfn/spa.
1266 * - Two things need to happen for mapping mmio range:
1267 * 1. mapped at the uaddr so the VMM can access it.
1268 * 2. mapped in the hwpt (gfn <-> mmio phys addr) so the guest can access it.
1269 *
1270 * This function takes care of the second. The first one is managed by vfio,
1271 * and hence is taken care of via vfio_pci_mmap_fault().
1272 */
1273 static long
1274 mshv_map_user_memory(struct mshv_partition *partition,
1275 struct mshv_user_mem_region mem)
1276 {
1277 struct mshv_mem_region *region;
1278 struct vm_area_struct *vma;
1279 bool is_mmio;
1280 ulong mmio_pfn;
1281 long ret;
1282
1283 if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
1284 !access_ok((const void __user *)mem.userspace_addr, mem.size))
1285 return -EINVAL;
1286
1287 mmap_read_lock(current->mm);
1288 vma = vma_lookup(current->mm, mem.userspace_addr);
1289 is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
1290 mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
1291 mmap_read_unlock(current->mm);
1292
1293 if (!vma)
1294 return -EINVAL;
1295
1296 ret = mshv_partition_create_region(partition, &mem, &region,
1297 is_mmio);
1298 if (ret)
1299 return ret;
1300
1301 switch (region->type) {
1302 case MSHV_REGION_TYPE_MEM_PINNED:
1303 ret = mshv_prepare_pinned_region(region);
1304 break;
1305 case MSHV_REGION_TYPE_MEM_MOVABLE:
1306 /*
1307 * For movable memory regions, remap with no access to let
1308 * the hypervisor track dirty pages, enabling pre-copy live
1309 * migration.
1310 */
1311 ret = hv_call_map_gpa_pages(partition->pt_id,
1312 region->start_gfn,
1313 region->nr_pages,
1314 HV_MAP_GPA_NO_ACCESS, NULL);
1315 break;
1316 case MSHV_REGION_TYPE_MMIO:
1317 ret = hv_call_map_mmio_pages(partition->pt_id,
1318 region->start_gfn,
1319 mmio_pfn,
1320 region->nr_pages);
1321 break;
1322 }
1323
1324 if (ret)
1325 goto errout;
1326
1327 spin_lock(&partition->pt_mem_regions_lock);
1328 hlist_add_head(&region->hnode, &partition->pt_mem_regions);
1329 spin_unlock(&partition->pt_mem_regions_lock);
1330
1331 return 0;
1332
1333 errout:
1334 vfree(region);
1335 return ret;
1336 }
1337
1338 /* Called for unmapping both the guest ram and the mmio space */
1339 static long
1340 mshv_unmap_user_memory(struct mshv_partition *partition,
1341 struct mshv_user_mem_region mem)
1342 {
1343 struct mshv_mem_region *region;
1344
1345 if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
1346 return -EINVAL;
1347
1348 spin_lock(&partition->pt_mem_regions_lock);
1349
1350 region = mshv_partition_region_by_gfn(partition, mem.guest_pfn);
1351 if (!region) {
1352 spin_unlock(&partition->pt_mem_regions_lock);
1353 return -ENOENT;
1354 }
1355
1356 /* Paranoia check */
1357 if (region->start_uaddr != mem.userspace_addr ||
1358 region->start_gfn != mem.guest_pfn ||
1359 region->nr_pages != HVPFN_DOWN(mem.size)) {
1360 spin_unlock(&partition->pt_mem_regions_lock);
1361 return -EINVAL;
1362 }
1363
1364 hlist_del(&region->hnode);
1365
1366 spin_unlock(&partition->pt_mem_regions_lock);
1367
1368 mshv_region_put(region);
1369
1370 return 0;
1371 }
1372
1373 static long
1374 mshv_partition_ioctl_set_memory(struct mshv_partition *partition,
1375 struct mshv_user_mem_region __user *user_mem)
1376 {
1377 struct mshv_user_mem_region mem;
1378
1379 if (copy_from_user(&mem, user_mem, sizeof(mem)))
1380 return -EFAULT;
1381
1382 if (!mem.size ||
1383 !PAGE_ALIGNED(mem.size) ||
1384 !PAGE_ALIGNED(mem.userspace_addr) ||
1385 (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) ||
1386 mshv_field_nonzero(mem, rsvd))
1387 return -EINVAL;
1388
1389 if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))
1390 return mshv_unmap_user_memory(partition, mem);
1391
1392 return mshv_map_user_memory(partition, mem);
1393 }
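
/*
 * Illustrative only: registering guest RAM with MSHV_SET_GUEST_MEMORY,
 * assuming struct mshv_user_mem_region from the mshv uapi header; gpa,
 * ram_size and ram_va are placeholders, and 4K hypervisor pages are
 * assumed for the guest_pfn conversion. size and userspace_addr must be
 * page aligned, flags must stay within MSHV_SET_MEM_FLAGS_MASK, and
 * setting MSHV_SET_MEM_BIT_UNMAP instead removes a previously created
 * region with the same guest_pfn/userspace_addr/size triple.
 *
 *	struct mshv_user_mem_region mem = {
 *		.guest_pfn	= gpa >> 12,
 *		.size		= ram_size,
 *		.userspace_addr	= (__u64)ram_va,
 *		.flags		= 0,
 *	};
 *
 *	if (ioctl(partition_fd, MSHV_SET_GUEST_MEMORY, &mem) < 0)
 *		err(1, "MSHV_SET_GUEST_MEMORY");
 */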
1394
1395 static long
1396 mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition,
1397 void __user *user_args)
1398 {
1399 struct mshv_user_ioeventfd args;
1400
1401 if (copy_from_user(&args, user_args, sizeof(args)))
1402 return -EFAULT;
1403
1404 return mshv_set_unset_ioeventfd(partition, &args);
1405 }
1406
1407 static long
1408 mshv_partition_ioctl_irqfd(struct mshv_partition *partition,
1409 void __user *user_args)
1410 {
1411 struct mshv_user_irqfd args;
1412
1413 if (copy_from_user(&args, user_args, sizeof(args)))
1414 return -EFAULT;
1415
1416 return mshv_set_unset_irqfd(partition, &args);
1417 }
1418
1419 static long
1420 mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition,
1421 void __user *user_args)
1422 {
1423 struct mshv_gpap_access_bitmap args;
1424 union hv_gpa_page_access_state *states;
1425 long ret, i;
1426 union hv_gpa_page_access_state_flags hv_flags = {};
1427 u8 hv_type_mask;
1428 ulong bitmap_buf_sz, states_buf_sz;
1429 int written = 0;
1430
1431 if (copy_from_user(&args, user_args, sizeof(args)))
1432 return -EFAULT;
1433
1434 if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT ||
1435 args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT ||
1436 mshv_field_nonzero(args, rsvd) || !args.page_count ||
1437 !args.bitmap_ptr)
1438 return -EINVAL;
1439
1440 if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz))
1441 return -E2BIG;
1442
1443 /* Num bytes needed to store bitmap; one bit per page rounded up */
1444 bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8);
1445
1446 /* Sanity check */
1447 if (bitmap_buf_sz > states_buf_sz)
1448 return -EBADFD;
1449
1450 switch (args.access_type) {
1451 case MSHV_GPAP_ACCESS_TYPE_ACCESSED:
1452 hv_type_mask = 1;
1453 if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
1454 hv_flags.clear_accessed = 1;
1455 /* not accessed implies not dirty */
1456 hv_flags.clear_dirty = 1;
1457 } else { /* MSHV_GPAP_ACCESS_OP_SET */
1458 hv_flags.set_accessed = 1;
1459 }
1460 break;
1461 case MSHV_GPAP_ACCESS_TYPE_DIRTY:
1462 hv_type_mask = 2;
1463 if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
1464 hv_flags.clear_dirty = 1;
1465 } else { /* MSHV_GPAP_ACCESS_OP_SET */
1466 hv_flags.set_dirty = 1;
1467 /* dirty implies accessed */
1468 hv_flags.set_accessed = 1;
1469 }
1470 break;
1471 }
1472
1473 states = vzalloc(states_buf_sz);
1474 if (!states)
1475 return -ENOMEM;
1476
1477 ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count,
1478 args.gpap_base, hv_flags, &written,
1479 states);
1480 if (ret)
1481 goto free_return;
1482
1483 /*
1484 * Overwrite states buffer with bitmap - the bits in hv_type_mask
1485 * correspond to bitfields in hv_gpa_page_access_state
1486 */
1487 for (i = 0; i < written; ++i)
1488 __assign_bit(i, (ulong *)states,
1489 states[i].as_uint8 & hv_type_mask);
1490
1491 /* zero the unused bits in the last byte(s) of the returned bitmap */
1492 for (i = written; i < bitmap_buf_sz * 8; ++i)
1493 __clear_bit(i, (ulong *)states);
1494
1495 if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz))
1496 ret = -EFAULT;
1497
1498 free_return:
1499 vfree(states);
1500 return ret;
1501 }
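
/*
 * Illustrative only: a dirty-page query through the ioctl above, assuming
 * struct mshv_gpap_access_bitmap and the MSHV_GPAP_ACCESS_* constants from
 * the mshv uapi header; first_gfn, nr_gfns and bitmap are placeholders.
 * The caller supplies a buffer of at least DIV_ROUND_UP(nr_gfns, 8) bytes
 * and gets back one bit per guest page; clearing the dirty state while
 * reading it is the usual pattern for pre-copy live migration.
 *
 *	struct mshv_gpap_access_bitmap dirty = {
 *		.access_type	= MSHV_GPAP_ACCESS_TYPE_DIRTY,
 *		.access_op	= MSHV_GPAP_ACCESS_OP_CLEAR,
 *		.gpap_base	= first_gfn,
 *		.page_count	= nr_gfns,
 *		.bitmap_ptr	= (__u64)bitmap,
 *	};
 *
 *	if (ioctl(partition_fd, MSHV_GET_GPAP_ACCESS_BITMAP, &dirty) < 0)
 *		err(1, "MSHV_GET_GPAP_ACCESS_BITMAP");
 */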
1502
1503 static long
1504 mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition,
1505 void __user *user_args)
1506 {
1507 struct mshv_user_irq_entry *entries = NULL;
1508 struct mshv_user_irq_table args;
1509 long ret;
1510
1511 if (copy_from_user(&args, user_args, sizeof(args)))
1512 return -EFAULT;
1513
1514 if (args.nr > MSHV_MAX_GUEST_IRQS ||
1515 mshv_field_nonzero(args, rsvd))
1516 return -EINVAL;
1517
1518 if (args.nr) {
1519 struct mshv_user_irq_table __user *urouting = user_args;
1520
1521 entries = vmemdup_user(urouting->entries,
1522 array_size(sizeof(*entries),
1523 args.nr));
1524 if (IS_ERR(entries))
1525 return PTR_ERR(entries);
1526 }
1527 ret = mshv_update_routing_table(partition, entries, args.nr);
1528 kvfree(entries);
1529
1530 return ret;
1531 }
1532
1533 static long
1534 mshv_partition_ioctl_initialize(struct mshv_partition *partition)
1535 {
1536 long ret;
1537
1538 if (partition->pt_initialized)
1539 return 0;
1540
1541 ret = hv_call_initialize_partition(partition->pt_id);
1542 if (ret)
1543 goto withdraw_mem;
1544
1545 partition->pt_initialized = true;
1546
1547 return 0;
1548
1549 withdraw_mem:
1550 hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
1551
1552 return ret;
1553 }
1554
1555 static long
1556 mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
1557 {
1558 struct mshv_partition *partition = filp->private_data;
1559 long ret;
1560 void __user *uarg = (void __user *)arg;
1561
1562 if (mutex_lock_killable(&partition->pt_mutex))
1563 return -EINTR;
1564
1565 switch (ioctl) {
1566 case MSHV_INITIALIZE_PARTITION:
1567 ret = mshv_partition_ioctl_initialize(partition);
1568 break;
1569 case MSHV_SET_GUEST_MEMORY:
1570 ret = mshv_partition_ioctl_set_memory(partition, uarg);
1571 break;
1572 case MSHV_CREATE_VP:
1573 ret = mshv_partition_ioctl_create_vp(partition, uarg);
1574 break;
1575 case MSHV_IRQFD:
1576 ret = mshv_partition_ioctl_irqfd(partition, uarg);
1577 break;
1578 case MSHV_IOEVENTFD:
1579 ret = mshv_partition_ioctl_ioeventfd(partition, uarg);
1580 break;
1581 case MSHV_SET_MSI_ROUTING:
1582 ret = mshv_partition_ioctl_set_msi_routing(partition, uarg);
1583 break;
1584 case MSHV_GET_GPAP_ACCESS_BITMAP:
1585 ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition,
1586 uarg);
1587 break;
1588 case MSHV_ROOT_HVCALL:
1589 ret = mshv_ioctl_passthru_hvcall(partition, true, uarg);
1590 break;
1591 default:
1592 ret = -ENOTTY;
1593 }
1594
1595 mutex_unlock(&partition->pt_mutex);
1596 return ret;
1597 }
1598
1599 static int
1600 disable_vp_dispatch(struct mshv_vp *vp)
1601 {
1602 int ret;
1603 struct hv_register_assoc dispatch_suspend = {
1604 .name = HV_REGISTER_DISPATCH_SUSPEND,
1605 .value.dispatch_suspend.suspended = 1,
1606 };
1607
1608 ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
1609 1, &dispatch_suspend);
1610 if (ret)
1611 vp_err(vp, "failed to suspend\n");
1612
1613 return ret;
1614 }
1615
1616 static int
1617 get_vp_signaled_count(struct mshv_vp *vp, u64 *count)
1618 {
1619 int ret;
1620 struct hv_register_assoc root_signal_count = {
1621 .name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT,
1622 };
1623
1624 ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
1625 1, &root_signal_count);
1626
1627 if (ret) {
1628 vp_err(vp, "Failed to get root signal count");
1629 *count = 0;
1630 return ret;
1631 }
1632
1633 *count = root_signal_count.value.reg64;
1634
1635 return ret;
1636 }
1637
1638 static void
1639 drain_vp_signals(struct mshv_vp *vp)
1640 {
1641 u64 hv_signal_count;
1642 u64 vp_signal_count;
1643
1644 get_vp_signaled_count(vp, &hv_signal_count);
1645
1646 vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
1647
1648 /*
1649 * There should be at most 1 outstanding notification, but be extra
1650 * careful anyway.
1651 */
1652 while (hv_signal_count != vp_signal_count) {
1653 WARN_ON(hv_signal_count - vp_signal_count != 1);
1654
1655 if (wait_event_interruptible(vp->run.vp_suspend_queue,
1656 vp->run.kicked_by_hv == 1))
1657 break;
1658 vp->run.kicked_by_hv = 0;
1659 vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
1660 }
1661 }
1662
1663 static void drain_all_vps(const struct mshv_partition *partition)
1664 {
1665 int i;
1666 struct mshv_vp *vp;
1667
1668 /*
1669 * VPs are reachable from ISR. It is safe to not take the partition
1670 * lock because nobody else can enter this function and drop the
1671 * partition from the list.
1672 */
1673 for (i = 0; i < MSHV_MAX_VPS; i++) {
1674 vp = partition->pt_vp_array[i];
1675 if (!vp)
1676 continue;
1677 /*
1678 * Disable dispatching of the VP in the hypervisor. After this
1679 * the hypervisor guarantees it won't generate any signals for
1680 * the VP and the hypervisor's VP signal count won't change.
1681 */
1682 disable_vp_dispatch(vp);
1683 drain_vp_signals(vp);
1684 }
1685 }
1686
1687 static void
1688 remove_partition(struct mshv_partition *partition)
1689 {
1690 spin_lock(&mshv_root.pt_ht_lock);
1691 hlist_del_rcu(&partition->pt_hnode);
1692 spin_unlock(&mshv_root.pt_ht_lock);
1693
1694 synchronize_rcu();
1695 }
1696
1697 /*
1698 * Tear down a partition and remove it from the list.
1699 * Partition's refcount must be 0
1700 */
1701 static void destroy_partition(struct mshv_partition *partition)
1702 {
1703 struct mshv_vp *vp;
1704 struct mshv_mem_region *region;
1705 struct hlist_node *n;
1706 int i;
1707
1708 if (refcount_read(&partition->pt_ref_count)) {
1709 pt_err(partition,
1710 "Attempt to destroy partition but refcount > 0\n");
1711 return;
1712 }
1713
1714 if (partition->pt_initialized) {
1715 /*
1716 * We only need to drain signals for root scheduler. This should be
1717 * done before removing the partition from the partition list.
1718 */
1719 if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
1720 drain_all_vps(partition);
1721
1722 /* Remove vps */
1723 for (i = 0; i < MSHV_MAX_VPS; ++i) {
1724 vp = partition->pt_vp_array[i];
1725 if (!vp)
1726 continue;
1727
1728 if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
1729 mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
1730 (void **)vp->vp_stats_pages);
1731
1732 if (vp->vp_register_page) {
1733 (void)hv_unmap_vp_state_page(partition->pt_id,
1734 vp->vp_index,
1735 HV_VP_STATE_PAGE_REGISTERS,
1736 virt_to_page(vp->vp_register_page),
1737 input_vtl_zero);
1738 vp->vp_register_page = NULL;
1739 }
1740
1741 (void)hv_unmap_vp_state_page(partition->pt_id,
1742 vp->vp_index,
1743 HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
1744 virt_to_page(vp->vp_intercept_msg_page),
1745 input_vtl_zero);
1746 vp->vp_intercept_msg_page = NULL;
1747
1748 if (vp->vp_ghcb_page) {
1749 (void)hv_unmap_vp_state_page(partition->pt_id,
1750 vp->vp_index,
1751 HV_VP_STATE_PAGE_GHCB,
1752 virt_to_page(vp->vp_ghcb_page),
1753 input_vtl_normal);
1754 vp->vp_ghcb_page = NULL;
1755 }
1756
1757 kfree(vp);
1758
1759 partition->pt_vp_array[i] = NULL;
1760 }
1761
1762 /* Deallocates and unmaps everything including vcpus, GPA mappings etc */
1763 hv_call_finalize_partition(partition->pt_id);
1764
1765 partition->pt_initialized = false;
1766 }
1767
1768 remove_partition(partition);
1769
1770 hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
1771 hnode) {
1772 hlist_del(&region->hnode);
1773 mshv_region_put(region);
1774 }
1775
1776 /* Withdraw and free all pages we deposited */
1777 hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
1778 hv_call_delete_partition(partition->pt_id);
1779
1780 mshv_free_routing_table(partition);
1781 kfree(partition);
1782 }
1783
1784 struct
1785 mshv_partition *mshv_partition_get(struct mshv_partition *partition)
1786 {
1787 if (refcount_inc_not_zero(&partition->pt_ref_count))
1788 return partition;
1789 return NULL;
1790 }
1791
1792 struct
1793 mshv_partition *mshv_partition_find(u64 partition_id)
1794 __must_hold(RCU)
1795 {
1796 struct mshv_partition *p;
1797
1798 hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode,
1799 partition_id)
1800 if (p->pt_id == partition_id)
1801 return p;
1802
1803 return NULL;
1804 }
1805
1806 void
1807 mshv_partition_put(struct mshv_partition *partition)
1808 {
1809 if (refcount_dec_and_test(&partition->pt_ref_count))
1810 destroy_partition(partition);
1811 }
1812
1813 static int
1814 mshv_partition_release(struct inode *inode, struct file *filp)
1815 {
1816 struct mshv_partition *partition = filp->private_data;
1817
1818 mshv_eventfd_release(partition);
1819
1820 cleanup_srcu_struct(&partition->pt_irq_srcu);
1821
1822 mshv_partition_put(partition);
1823
1824 return 0;
1825 }
1826
1827 static int
1828 add_partition(struct mshv_partition *partition)
1829 {
1830 spin_lock(&mshv_root.pt_ht_lock);
1831
1832 hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode,
1833 partition->pt_id);
1834
1835 spin_unlock(&mshv_root.pt_ht_lock);
1836
1837 return 0;
1838 }
1839
1840 static_assert(MSHV_NUM_CPU_FEATURES_BANKS ==
1841 HV_PARTITION_PROCESSOR_FEATURES_BANKS);
1842
1843 static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags,
1844 struct hv_partition_creation_properties *cr_props,
1845 union hv_partition_isolation_properties *isol_props)
1846 {
1847 int i;
1848 struct mshv_create_partition_v2 args;
1849 union hv_partition_processor_features *disabled_procs;
1850 union hv_partition_processor_xsave_features *disabled_xsave;
1851
1852 /* First, copy v1 struct in case user is on previous versions */
1853 if (copy_from_user(&args, user_arg,
1854 sizeof(struct mshv_create_partition)))
1855 return -EFAULT;
1856
1857 if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
1858 args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
1859 return -EINVAL;
1860
1861 disabled_procs = &cr_props->disabled_processor_features;
1862 disabled_xsave = &cr_props->disabled_processor_xsave_features;
1863
1864 /* Check if user provided newer struct with feature fields */
1865 if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) {
1866 if (copy_from_user(&args, user_arg, sizeof(args)))
1867 return -EFAULT;
1868
1869 /* Re-validate v1 fields after second copy_from_user() */
1870 if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
1871 args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
1872 return -EINVAL;
1873
1874 if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS ||
1875 mshv_field_nonzero(args, pt_rsvd) ||
1876 mshv_field_nonzero(args, pt_rsvd1))
1877 return -EINVAL;
1878
1879 /*
1880 * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never
1881 * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS
1882 * (i.e. 2).
1883 *
1884 * Further banks (index >= 2) will be modifiable as 'early'
1885 * properties via the set partition property hypercall.
1886 */
1887 for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
1888 disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i];
1889
1890 #if IS_ENABLED(CONFIG_X86_64)
1891 disabled_xsave->as_uint64 = args.pt_disabled_xsave;
1892 #else
1893 /*
1894 * In practice this field is ignored on arm64, but safer to
1895 * zero it in case it is ever used.
1896 */
1897 disabled_xsave->as_uint64 = 0;
1898
1899 if (mshv_field_nonzero(args, pt_rsvd2))
1900 return -EINVAL;
1901 #endif
1902 } else {
1903 /*
1904 * v1 behavior: try to enable everything. The hypervisor will
1905 * disable features that are not supported. The banks can be
1906 * queried via the get partition property hypercall.
1907 */
1908 for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
1909 disabled_procs->as_uint64[i] = 0;
1910
1911 disabled_xsave->as_uint64 = 0;
1912 }
1913
1914 /* Only support EXO partitions */
1915 *pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
1916 HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;
1917
1918 if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC))
1919 *pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
1920 if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC))
1921 *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
1922 if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES))
1923 *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
1924
1925 isol_props->as_uint64 = 0;
1926
1927 switch (args.pt_isolation) {
1928 case MSHV_PT_ISOLATION_NONE:
1929 isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE;
1930 break;
1931 }
1932
1933 return 0;
1934 }
1935
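/*
 * Handle MSHV_CREATE_PARTITION: validate the user arguments, allocate and
 * initialize the partition object, create the partition in the hypervisor,
 * and return a new partition file descriptor to user space.
 */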
static long mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
{
	u64 creation_flags;
	struct hv_partition_creation_properties creation_properties;
	union hv_partition_isolation_properties isolation_properties;
	struct mshv_partition *partition;
	long ret;

	ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags,
					  &creation_properties,
					  &isolation_properties);
	if (ret)
		return ret;

	partition = kzalloc(sizeof(*partition), GFP_KERNEL);
	if (!partition)
		return -ENOMEM;

	partition->pt_module_dev = module_dev;
	partition->isolation_type = isolation_properties.isolation_type;

	refcount_set(&partition->pt_ref_count, 1);

	mutex_init(&partition->pt_mutex);

	mutex_init(&partition->pt_irq_lock);

	init_completion(&partition->async_hypercall);

	INIT_HLIST_HEAD(&partition->irq_ack_notifier_list);

	INIT_HLIST_HEAD(&partition->pt_devices);

	spin_lock_init(&partition->pt_mem_regions_lock);
	INIT_HLIST_HEAD(&partition->pt_mem_regions);

	mshv_eventfd_init(partition);

	ret = init_srcu_struct(&partition->pt_irq_srcu);
	if (ret)
		goto free_partition;

	ret = hv_call_create_partition(creation_flags,
				       creation_properties,
				       isolation_properties,
				       &partition->pt_id);
	if (ret)
		goto cleanup_irq_srcu;

	ret = add_partition(partition);
	if (ret)
		goto delete_partition;

	ret = mshv_init_async_handler(partition);
	if (!ret) {
		ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition",
							   &mshv_partition_fops,
							   partition, O_RDWR));
		if (ret >= 0)
			return ret;
	}
	remove_partition(partition);
delete_partition:
	hv_call_delete_partition(partition->pt_id);
cleanup_irq_srcu:
	cleanup_srcu_struct(&partition->pt_irq_srcu);
free_partition:
	kfree(partition);

	return ret;
}

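/* Top-level ioctl dispatcher for /dev/mshv. */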
static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
			   unsigned long arg)
{
	struct miscdevice *misc = filp->private_data;

	switch (ioctl) {
	case MSHV_CREATE_PARTITION:
		return mshv_ioctl_create_partition((void __user *)arg,
						   misc->this_device);
	case MSHV_ROOT_HVCALL:
		return mshv_ioctl_passthru_hvcall(NULL, false,
						  (void __user *)arg);
	}

	return -ENOTTY;
}

static int mshv_dev_open(struct inode *inode, struct file *filp)
{
	return 0;
}

static int mshv_dev_release(struct inode *inode, struct file *filp)
{
	return 0;
}

static int mshv_cpuhp_online;
static int mshv_root_sched_online;

static const char *scheduler_type_to_string(enum hv_scheduler_type type)
{
	switch (type) {
	case HV_SCHEDULER_TYPE_LP:
		return "classic scheduler without SMT";
	case HV_SCHEDULER_TYPE_LP_SMT:
		return "classic scheduler with SMT";
	case HV_SCHEDULER_TYPE_CORE_SMT:
		return "core scheduler";
	case HV_SCHEDULER_TYPE_ROOT:
		return "root scheduler";
	default:
		return "unknown scheduler";
	}
}

/* TODO move this to hv_common.c when needed outside */
static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out)
{
	struct hv_input_get_system_property *input;
	struct hv_output_get_system_property *output;
	unsigned long flags;
	u64 status;

	local_irq_save(flags);
	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	output = *this_cpu_ptr(hyperv_pcpu_output_arg);

	memset(input, 0, sizeof(*input));
	memset(output, 0, sizeof(*output));
	input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE;

	status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
	if (!hv_result_success(status)) {
		local_irq_restore(flags);
		pr_err("%s: %s\n", __func__, hv_result_to_string(status));
		return hv_result_to_errno(status);
	}

	*out = output->scheduler_type;
	local_irq_restore(flags);

	return 0;
}

/* Retrieve and stash the supported scheduler type */
static int __init mshv_retrieve_scheduler_type(struct device *dev)
{
	int ret = 0;

	if (hv_l1vh_partition())
		hv_scheduler_type = HV_SCHEDULER_TYPE_CORE_SMT;
	else
		ret = hv_retrieve_scheduler_type(&hv_scheduler_type);

	if (ret)
		return ret;

	dev_info(dev, "Hypervisor using %s\n",
		 scheduler_type_to_string(hv_scheduler_type));

	switch (hv_scheduler_type) {
	case HV_SCHEDULER_TYPE_CORE_SMT:
	case HV_SCHEDULER_TYPE_LP_SMT:
	case HV_SCHEDULER_TYPE_ROOT:
	case HV_SCHEDULER_TYPE_LP:
		/* Supported scheduler, nothing to do */
		break;
	default:
		dev_err(dev, "unsupported scheduler 0x%x, bailing.\n",
			hv_scheduler_type);
		return -EOPNOTSUPP;
	}

	return 0;
}

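/*
 * Per-CPU setup callback for the root scheduler: allocate the per-CPU input
 * and output buffers (one hypervisor page each) used on this CPU.
 */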
static int mshv_root_scheduler_init(unsigned int cpu)
{
	void **inputarg, **outputarg, *p;

	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
	outputarg = (void **)this_cpu_ptr(root_scheduler_output);

	/* Allocate two consecutive pages. One for input, one for output. */
	p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	*inputarg = p;
	*outputarg = (char *)p + HV_HYP_PAGE_SIZE;

	return 0;
}

static int mshv_root_scheduler_cleanup(unsigned int cpu)
{
	void *p, **inputarg, **outputarg;

	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
	outputarg = (void **)this_cpu_ptr(root_scheduler_output);

	p = *inputarg;

	*inputarg = NULL;
	*outputarg = NULL;

	kfree(p);

	return 0;
}

/* Must be called after retrieving the scheduler type */
static int root_scheduler_init(struct device *dev)
{
	int ret;

	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
		return 0;

	root_scheduler_input = alloc_percpu(void *);
	root_scheduler_output = alloc_percpu(void *);

	if (!root_scheduler_input || !root_scheduler_output) {
		dev_err(dev, "Failed to allocate root scheduler buffers\n");
		ret = -ENOMEM;
		goto out;
	}

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched",
				mshv_root_scheduler_init,
				mshv_root_scheduler_cleanup);

	if (ret < 0) {
		dev_err(dev, "Failed to setup root scheduler state: %i\n", ret);
		goto out;
	}

	mshv_root_sched_online = ret;

	return 0;

out:
	free_percpu(root_scheduler_input);
	free_percpu(root_scheduler_output);
	return ret;
}

static void root_scheduler_deinit(void)
{
	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
		return;

	cpuhp_remove_state(mshv_root_sched_online);
	free_percpu(root_scheduler_input);
	free_percpu(root_scheduler_output);
}

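/*
 * Reboot notifier: remove the MSHV CPU hotplug state so the per-CPU SynIC
 * state is cleaned up before the system reboots.
 */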
static int mshv_reboot_notify(struct notifier_block *nb,
			      unsigned long code, void *unused)
{
	cpuhp_remove_state(mshv_cpuhp_online);
	return 0;
}

struct notifier_block mshv_reboot_nb = {
	.notifier_call = mshv_reboot_notify,
};

static void mshv_root_partition_exit(void)
{
	unregister_reboot_notifier(&mshv_reboot_nb);
	root_scheduler_deinit();
}

static int __init mshv_root_partition_init(struct device *dev)
{
	int err;

	err = root_scheduler_init(dev);
	if (err)
		return err;

	err = register_reboot_notifier(&mshv_reboot_nb);
	if (err)
		goto root_sched_deinit;

	return 0;

root_sched_deinit:
	root_scheduler_deinit();
	return err;
}

static void mshv_init_vmm_caps(struct device *dev)
{
	/*
	 * This can only fail here if HVCALL_GET_PARTITION_PROPERTY_EX or
	 * HV_PARTITION_PROPERTY_VMM_CAPABILITIES are not supported. In that
	 * case it's valid to proceed as if all vmm_caps are disabled (zero).
	 */
	if (hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
					      HV_PARTITION_PROPERTY_VMM_CAPABILITIES,
					      0, &mshv_root.vmm_caps,
					      sizeof(mshv_root.vmm_caps)))
		dev_warn(dev, "Unable to get VMM capabilities\n");

	dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]);
}

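/*
 * Module init: register /dev/mshv, set up per-CPU SynIC state and CPU
 * hotplug callbacks, detect the hypervisor scheduler type, and perform the
 * root-partition-only initialization when running as the root partition.
 */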
static int __init mshv_parent_partition_init(void)
{
	int ret;
	struct device *dev;
	union hv_hypervisor_version_info version_info;

	if (!hv_parent_partition() || is_kdump_kernel())
		return -ENODEV;

	if (hv_get_hypervisor_version(&version_info))
		return -ENODEV;

	ret = misc_register(&mshv_dev);
	if (ret)
		return ret;

	dev = mshv_dev.this_device;

	if (version_info.build_number < MSHV_HV_MIN_VERSION ||
	    version_info.build_number > MSHV_HV_MAX_VERSION) {
		dev_err(dev, "Running on unvalidated Hyper-V version\n");
		dev_err(dev, "Versions: current: %u min: %u max: %u\n",
			version_info.build_number, MSHV_HV_MIN_VERSION,
			MSHV_HV_MAX_VERSION);
	}

	mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages);
	if (!mshv_root.synic_pages) {
		dev_err(dev, "Failed to allocate percpu synic page\n");
		ret = -ENOMEM;
		goto device_deregister;
	}

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
				mshv_synic_init,
				mshv_synic_cleanup);
	if (ret < 0) {
		dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret);
		goto free_synic_pages;
	}

	mshv_cpuhp_online = ret;

	ret = mshv_retrieve_scheduler_type(dev);
	if (ret)
		goto remove_cpu_state;

	if (hv_root_partition())
		ret = mshv_root_partition_init(dev);
	if (ret)
		goto remove_cpu_state;

	mshv_init_vmm_caps(dev);

	ret = mshv_irqfd_wq_init();
	if (ret)
		goto exit_partition;

	spin_lock_init(&mshv_root.pt_ht_lock);
	hash_init(mshv_root.pt_htable);

	hv_setup_mshv_handler(mshv_isr);

	return 0;

exit_partition:
	if (hv_root_partition())
		mshv_root_partition_exit();
remove_cpu_state:
	cpuhp_remove_state(mshv_cpuhp_online);
free_synic_pages:
	free_percpu(mshv_root.synic_pages);
device_deregister:
	misc_deregister(&mshv_dev);
	return ret;
}

static void __exit mshv_parent_partition_exit(void)
{
	hv_setup_mshv_handler(NULL);
	mshv_port_table_fini();
	misc_deregister(&mshv_dev);
	mshv_irqfd_wq_cleanup();
	if (hv_root_partition())
		mshv_root_partition_exit();
	cpuhp_remove_state(mshv_cpuhp_online);
	free_percpu(mshv_root.synic_pages);
}

module_init(mshv_parent_partition_init);
module_exit(mshv_parent_partition_exit);