// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024, Microsoft Corporation.
 *
 * The main part of the mshv_root module, providing APIs to create
 * and manage guest partitions.
 *
 * Authors: Microsoft Linux virtualization team
 */

#include <linux/entry-virt.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/cpuhotplug.h>
#include <linux/random.h>
#include <asm/mshyperv.h>
#include <linux/hyperv.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/kexec.h>
#include <linux/page-flags.h>
#include <linux/crash_dump.h>
#include <linux/panic_notifier.h>
#include <linux/vmalloc.h>
#include <linux/rseq.h>

#include "mshv_eventfd.h"
#include "mshv.h"
#include "mshv_root.h"

MODULE_AUTHOR("Microsoft");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");

/* HV_THREAD_COUNTER */
#if defined(CONFIG_X86_64)
#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 202
#elif defined(CONFIG_ARM64)
#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 95
#endif

struct mshv_root mshv_root;

enum hv_scheduler_type hv_scheduler_type;

/* Once we implement the fast extended hypercall ABI they can go away. */
static void * __percpu *root_scheduler_input;
static void * __percpu *root_scheduler_output;

static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_dev_open(struct inode *inode, struct file *filp);
static int mshv_dev_release(struct inode *inode, struct file *filp);
static int mshv_vp_release(struct inode *inode, struct file *filp);
static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_partition_release(struct inode *inode, struct file *filp);
static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma);
static vm_fault_t mshv_vp_fault(struct vm_fault *vmf);
static int mshv_init_async_handler(struct mshv_partition *partition);
static void mshv_async_hvcall_handler(void *data, u64 *status);

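/*
 * Pre-built hv_input_vtl values: the all-zero default, and one that
 * explicitly targets HV_NORMAL_VTL.
 */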
static const union hv_input_vtl input_vtl_zero;
static const union hv_input_vtl input_vtl_normal = {
	.target_vtl = HV_NORMAL_VTL,
	.use_target_vtl = 1,
};

static const struct vm_operations_struct mshv_vp_vm_ops = {
	.fault = mshv_vp_fault,
};

static const struct file_operations mshv_vp_fops = {
	.owner = THIS_MODULE,
	.release = mshv_vp_release,
	.unlocked_ioctl = mshv_vp_ioctl,
	.llseek = noop_llseek,
	.mmap = mshv_vp_mmap,
};

static const struct file_operations mshv_partition_fops = {
	.owner = THIS_MODULE,
	.release = mshv_partition_release,
	.unlocked_ioctl = mshv_partition_ioctl,
	.llseek = noop_llseek,
};

static const struct file_operations mshv_dev_fops = {
	.owner = THIS_MODULE,
	.open = mshv_dev_open,
	.release = mshv_dev_release,
	.unlocked_ioctl = mshv_dev_ioctl,
	.llseek = noop_llseek,
};

static struct miscdevice mshv_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "mshv",
	.fops = &mshv_dev_fops,
	.mode = 0600,
};

/*
 * Only allow hypercalls that have a u64 partition id as the first member of
 * the input structure.
 * These are sorted by value.
 */
static u16 mshv_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_GET_PARTITION_PROPERTY_EX,
	HVCALL_SET_PARTITION_PROPERTY,
	HVCALL_INSTALL_INTERCEPT,
	HVCALL_GET_VP_REGISTERS,
	HVCALL_SET_VP_REGISTERS,
	HVCALL_TRANSLATE_VIRTUAL_ADDRESS,
	HVCALL_CLEAR_VIRTUAL_INTERRUPT,
	HVCALL_REGISTER_INTERCEPT_RESULT,
	HVCALL_ASSERT_VIRTUAL_INTERRUPT,
	HVCALL_GET_GPA_PAGES_ACCESS_STATES,
	HVCALL_SIGNAL_EVENT_DIRECT,
	HVCALL_POST_MESSAGE_DIRECT,
	HVCALL_GET_VP_CPUID_VALUES,
};

/*
 * Only allow hypercalls that are safe to be called by the VMM with the host
 * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a
 * hypercall cannot be misused by the VMM before adding it to this list.
 */
static u16 mshv_self_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_GET_PARTITION_PROPERTY_EX,
};

static bool mshv_hvcall_is_async(u16 code)
{
	switch (code) {
	case HVCALL_SET_PARTITION_PROPERTY:
		return true;
	default:
		break;
	}
	return false;
}

static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id)
{
	int i;
	int n = ARRAY_SIZE(mshv_passthru_hvcalls);
	u16 *allowed_hvcalls = mshv_passthru_hvcalls;

	if (pt_id == HV_PARTITION_ID_SELF) {
		n = ARRAY_SIZE(mshv_self_passthru_hvcalls);
		allowed_hvcalls = mshv_self_passthru_hvcalls;
	}

	for (i = 0; i < n; ++i)
		if (allowed_hvcalls[i] == code)
			return true;

	return false;
}

static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
				      bool partition_locked,
				      void __user *user_args)
{
	u64 status;
	int ret = 0;
	bool is_async;
	struct mshv_root_hvcall args;
	struct page *page;
	unsigned int pages_order;
	void *input_pg = NULL;
	void *output_pg = NULL;
	u16 reps_completed;
	u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) ||
	    mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE)
		return -EINVAL;

	if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
		return -EINVAL;

	if (!mshv_passthru_hvcall_allowed(args.code, pt_id))
		return -EINVAL;

	is_async = mshv_hvcall_is_async(args.code);
	if (is_async) {
		/* async hypercalls can only be called from partition fd */
		if (!partition || !partition_locked)
			return -EINVAL;
		ret = mshv_init_async_handler(partition);
		if (ret)
			return ret;
	}

	pages_order = args.out_ptr ? 1 : 0;
	page = alloc_pages(GFP_KERNEL, pages_order);
	if (!page)
		return -ENOMEM;
	input_pg = page_address(page);

	if (args.out_ptr)
		output_pg = (char *)input_pg + PAGE_SIZE;
	else
		output_pg = NULL;

	if (copy_from_user(input_pg, (void __user *)args.in_ptr,
			   args.in_sz)) {
		ret = -EFAULT;
		goto free_pages_out;
	}

	/*
	 * NOTE: This only works because all the allowed hypercalls' input
	 * structs begin with a u64 partition_id field.
	 */
	*(u64 *)input_pg = pt_id;

	reps_completed = 0;
	do {
		if (args.reps) {
			status = hv_do_rep_hypercall_ex(args.code, args.reps,
							0, reps_completed,
							input_pg, output_pg);
			reps_completed = hv_repcomp(status);
		} else {
			status = hv_do_hypercall(args.code, input_pg, output_pg);
		}

		if (hv_result(status) == HV_STATUS_CALL_PENDING) {
			if (is_async) {
				mshv_async_hvcall_handler(partition, &status);
			} else { /* Paranoia check. This shouldn't happen! */
				ret = -EBADFD;
				goto free_pages_out;
			}
		}

		if (hv_result_success(status))
			break;

		if (!hv_result_needs_memory(status))
			ret = hv_result_to_errno(status);
		else
			ret = hv_deposit_memory(pt_id, status);
	} while (!ret);

	args.status = hv_result(status);
	args.reps = reps_completed;
	if (copy_to_user(user_args, &args, sizeof(args)))
		ret = -EFAULT;

	if (!ret && output_pg &&
	    copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
		ret = -EFAULT;

free_pages_out:
	free_pages((unsigned long)input_pg, pages_order);

	return ret;
}
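
/*
 * Illustrative only: from user space, a pass-through hypercall issued on a
 * partition fd could look roughly like the sketch below. The property code
 * and the input/output structs are placeholders, not a canonical sequence:
 *
 *	struct mshv_root_hvcall call = {
 *		.code	 = HVCALL_GET_PARTITION_PROPERTY,
 *		.in_sz	 = sizeof(input),
 *		.in_ptr	 = (__u64)&input,    (first field: u64 partition id)
 *		.out_sz	 = sizeof(output),
 *		.out_ptr = (__u64)&output,
 *	};
 *	ret = ioctl(partition_fd, MSHV_ROOT_HVCALL, &call);
 *
 * Note that the kernel overwrites the partition id in the input page, so a
 * VMM cannot redirect an allowed hypercall at an arbitrary partition.
 */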

static inline bool is_ghcb_mapping_available(void)
{
#if IS_ENABLED(CONFIG_X86_64)
	return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE;
#else
	return false;
#endif
}

static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_get_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}

static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_set_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}

/*
 * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by
 * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend,
 * done by the hypervisor.
 * "Intercept" suspend leads to asynchronous message delivery to dom0 which
 * should be awaited to keep the VP loop consistent (i.e. no message pending
 * upon VP resume).
 * VP intercept suspend can't be done when the VP is explicitly suspended
 * already, so there are only two possible race scenarios:
 *   1. implicit suspend bit set -> explicit suspend bit set -> message sent
 *   2. implicit suspend bit set -> message sent -> explicit suspend bit set
 * Checking for the implicit suspend bit after the explicit suspend request
 * has succeeded covers either case, and thus reliably tells us whether there
 * is a message to receive and deliver to the VMM.
 */
static int
mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
{
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND
	};
	struct hv_register_assoc intercept_suspend = {
		.name = HV_REGISTER_INTERCEPT_SUSPEND
	};
	union hv_explicit_suspend_register *es =
		&explicit_suspend.value.explicit_suspend;
	union hv_intercept_suspend_register *is =
		&intercept_suspend.value.intercept_suspend;
	int ret;

	es->suspended = 1;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &explicit_suspend);
	if (ret) {
		vp_err(vp, "Failed to explicitly suspend vCPU\n");
		return ret;
	}

	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &intercept_suspend);
	if (ret) {
		vp_err(vp, "Failed to get intercept suspend state\n");
		return ret;
	}

	*message_in_flight = is->suspended;

	return 0;
}

/*
 * This function is used when VPs are scheduled by the hypervisor's
 * scheduler.
 *
 * Caller has to make sure the registers contain cleared
 * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers
 * exactly in this order (the hypervisor clears them sequentially), to avoid
 * mistakenly clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND after
 * the VP has been released from HV_REGISTER_EXPLICIT_SUSPEND, which could
 * happen with the opposite order.
 */
static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
{
	long ret;
	struct hv_register_assoc suspend_regs[2] = {
		{ .name = HV_REGISTER_INTERCEPT_SUSPEND },
		{ .name = HV_REGISTER_EXPLICIT_SUSPEND }
	};
	size_t count = ARRAY_SIZE(suspend_regs);

	/* Resume VP execution */
	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    count, suspend_regs);
	if (ret) {
		vp_err(vp, "Failed to resume vp execution. %lx\n", ret);
		return ret;
	}

	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       vp->run.kicked_by_hv == 1);
	if (ret) {
		bool message_in_flight;

		/*
		 * The wait was interrupted by a signal: suspend the vCPU
		 * explicitly and copy the message in flight (if any).
		 */
		ret = mshv_suspend_vp(vp, &message_in_flight);
		if (ret)
			return ret;

		/* Return if no message in flight */
		if (!message_in_flight)
			return -EINTR;

		/* Wait for the message in flight. */
		wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1);
	}

	/*
	 * Reset the flag to make the wait_event call above work
	 * next time.
	 */
	vp->run.kicked_by_hv = 0;

	return 0;
}

static int
mshv_vp_dispatch(struct mshv_vp *vp, u32 flags,
		 struct hv_output_dispatch_vp *res)
{
	struct hv_input_dispatch_vp *input;
	struct hv_output_dispatch_vp *output;
	u64 status;

	preempt_disable();
	input = *this_cpu_ptr(root_scheduler_input);
	output = *this_cpu_ptr(root_scheduler_output);

	memset(input, 0, sizeof(*input));
	memset(output, 0, sizeof(*output));

	input->partition_id = vp->vp_partition->pt_id;
	input->vp_index = vp->vp_index;
	input->time_slice = 0; /* Run forever until something happens */
	input->spec_ctrl = 0; /* TODO: set sensible flags */
	input->flags = flags;

	vp->run.flags.root_sched_dispatched = 1;
	status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output);
	vp->run.flags.root_sched_dispatched = 0;

	trace_mshv_hvcall_dispatch_vp(vp->vp_partition->pt_id,
				      vp->vp_index, flags,
				      output->dispatch_state,
				      output->dispatch_event,
#if defined(CONFIG_X86_64)
				      vp->vp_register_page->interrupt_vectors.as_uint64,
#else
				      0,
#endif
				      status);

	*res = *output;
	preempt_enable();

	if (!hv_result_success(status))
		vp_err(vp, "%s: status %s\n", __func__,
		       hv_result_to_string(status));

	return hv_result_to_errno(status);
}

static int
mshv_vp_clear_explicit_suspend(struct mshv_vp *vp)
{
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND,
		.value.explicit_suspend.suspended = 0,
	};
	int ret;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &explicit_suspend);

	trace_mshv_vp_clear_explicit_suspend(vp->vp_partition->pt_id,
					     vp->vp_index, ret);

	if (ret)
		vp_err(vp, "Failed to unsuspend\n");

	return ret;
}

#if IS_ENABLED(CONFIG_X86_64)
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	if (!vp->vp_register_page)
		return 0;
	return vp->vp_register_page->interrupt_vectors.as_uint64;
}
#else
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	return 0;
}
#endif

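/*
 * The hypervisor publishes per-VP counters in the mapped stats pages; a
 * non-zero "dispatch thread blocked" counter in either the self or the
 * parent area means this VP cannot be dispatched right now.
 */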
static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp)
{
	struct hv_stats_page **stats = vp->vp_stats_pages;
	u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->data;
	u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->data;

	return parent_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED] ||
	       self_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED];
}

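/*
 * Sleep until the hypervisor kicks this VP and its dispatch thread is no
 * longer blocked, or a guest interrupt becomes pending. Returns -EINTR if
 * the wait is interrupted by a signal.
 */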
static int
mshv_vp_wait_for_hv_kick(struct mshv_vp *vp)
{
	int ret;

	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       (vp->run.kicked_by_hv == 1 &&
					!mshv_vp_dispatch_thread_blocked(vp)) ||
				       mshv_vp_interrupt_pending(vp));
	if (ret)
		return -EINTR;

	trace_mshv_vp_wait_for_hv_kick(vp->vp_partition->pt_id,
				       vp->vp_index,
				       vp->run.kicked_by_hv,
				       mshv_vp_dispatch_thread_blocked(vp),
				       mshv_vp_interrupt_pending(vp));

	vp->run.flags.root_sched_blocked = 0;
	vp->run.kicked_by_hv = 0;

	return 0;
}

/* Must be called with interrupts enabled */
static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
{
	long ret;

	if (vp->run.flags.root_sched_blocked) {
		/*
		 * Dispatch state of this VP is blocked. Need to wait
		 * for the hypervisor to clear the blocked state before
		 * dispatching it.
		 */
		ret = mshv_vp_wait_for_hv_kick(vp);
		if (ret)
			return ret;
	}

	do {
		u32 flags = 0;
		struct hv_output_dispatch_vp output;

		if (__xfer_to_guest_mode_work_pending()) {
			ret = xfer_to_guest_mode_handle_work();

			trace_mshv_xfer_to_guest_mode_work(vp->vp_partition->pt_id,
							   vp->vp_index,
							   read_thread_flags(),
							   ret);

			if (ret)
				break;
		}

		if (vp->run.flags.intercept_suspend)
			flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND;

		if (mshv_vp_interrupt_pending(vp))
			flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION;

		ret = mshv_vp_dispatch(vp, flags, &output);
		if (ret)
			break;

		vp->run.flags.intercept_suspend = 0;

		if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) {
			if (output.dispatch_event ==
			    HV_VP_DISPATCH_EVENT_SUSPEND) {
				/*
				 * TODO: remove the warning once VP canceling
				 * is supported
				 */
				WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count),
					  "%s: vp#%d: unexpected explicit suspend\n",
					  __func__, vp->vp_index);
				/*
				 * Need to clear explicit suspend before
				 * dispatching.
				 * Explicit suspend is either:
				 * - set right after the first VP dispatch or
				 * - set explicitly via hypercall
				 * Since the latter case is not yet supported,
				 * simply clear it here.
				 */
				ret = mshv_vp_clear_explicit_suspend(vp);
				if (ret)
					break;

				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			} else {
				vp->run.flags.root_sched_blocked = 1;
				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			}
		} else {
			/* HV_VP_DISPATCH_STATE_READY */
			if (output.dispatch_event ==
			    HV_VP_DISPATCH_EVENT_INTERCEPT)
				vp->run.flags.intercept_suspend = 1;
		}
	} while (!vp->run.flags.intercept_suspend);

	rseq_virt_userspace_exit();

	return ret;
}

static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
	      "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");

static struct mshv_mem_region *
mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
{
	struct mshv_mem_region *region;

	hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
		if (gfn >= region->start_gfn &&
		    gfn < region->start_gfn + region->nr_pages)
			return region;
	}

	return NULL;
}

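/*
 * Like mshv_partition_region_by_gfn(), but takes a reference on the region
 * under pt_mem_regions_lock; the caller must release it with
 * mshv_region_put().
 */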
static struct mshv_mem_region *
mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
{
	struct mshv_mem_region *region;

	spin_lock(&p->pt_mem_regions_lock);
	region = mshv_partition_region_by_gfn(p, gfn);
	if (!region || !mshv_region_get(region)) {
		spin_unlock(&p->pt_mem_regions_lock);
		return NULL;
	}
	spin_unlock(&p->pt_mem_regions_lock);

	return region;
}

/**
 * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts.
 * @vp: Pointer to the virtual processor structure.
 *
 * This function processes GPA intercepts by identifying the memory region
 * corresponding to the intercepted GPA, aligning the page offset, and
 * mapping the required pages. It ensures that the region is valid and
 * handles faults efficiently by mapping multiple pages at once.
 *
 * Return: true if the intercept was handled successfully, false otherwise.
 */
static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
{
	struct mshv_partition *p = vp->vp_partition;
	struct mshv_mem_region *region;
	bool ret = false;
	u64 gfn;
#if defined(CONFIG_X86_64)
	struct hv_x64_memory_intercept_message *msg =
		(struct hv_x64_memory_intercept_message *)
			vp->vp_intercept_msg_page->u.payload;
#elif defined(CONFIG_ARM64)
	struct hv_arm64_memory_intercept_message *msg =
		(struct hv_arm64_memory_intercept_message *)
			vp->vp_intercept_msg_page->u.payload;
#endif
	enum hv_intercept_access_type access_type =
		msg->header.intercept_access_type;

	gfn = HVPFN_DOWN(msg->guest_physical_address);

	region = mshv_partition_region_by_gfn_get(p, gfn);
	if (!region)
		goto out;

	if (access_type == HV_INTERCEPT_ACCESS_WRITE &&
	    !(region->hv_map_flags & HV_MAP_GPA_WRITABLE))
		goto put_region;

	if (access_type == HV_INTERCEPT_ACCESS_EXECUTE &&
	    !(region->hv_map_flags & HV_MAP_GPA_EXECUTABLE))
		goto put_region;

	/* Only movable memory ranges are supported for GPA intercepts */
	if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE)
		ret = mshv_region_handle_gfn_fault(region, gfn);

put_region:
	mshv_region_put(region);
out:
	trace_mshv_handle_gpa_intercept(p->pt_id, vp->vp_index, gfn,
					access_type, ret);
	return ret;
}

static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
{
	switch (vp->vp_intercept_msg_page->header.message_type) {
	case HVMSG_GPA_INTERCEPT:
		return mshv_handle_gpa_intercept(vp);
	}
	return false;
}

static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
{
	long rc;

	trace_mshv_run_vp_entry(vp->vp_partition->pt_id, vp->vp_index);

	do {
		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
			rc = mshv_run_vp_with_root_scheduler(vp);
		else
			rc = mshv_run_vp_with_hyp_scheduler(vp);
	} while (rc == 0 && mshv_vp_handle_intercept(vp));

	trace_mshv_run_vp_exit(vp->vp_partition->pt_id, vp->vp_index,
			       vp->vp_intercept_msg_page->header.message_type,
			       rc);

	if (rc)
		return rc;

	if (copy_to_user(ret_msg, vp->vp_intercept_msg_page,
			 sizeof(struct hv_message)))
		rc = -EFAULT;

	return rc;
}
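
/*
 * Illustrative only: the VMM-side run loop pairing with MSHV_RUN_VP might
 * look roughly like this (vmm_handle_intercept() is a hypothetical helper,
 * not part of this API):
 *
 *	struct hv_message msg;
 *
 *	for (;;) {
 *		if (ioctl(vp_fd, MSHV_RUN_VP, &msg) < 0)
 *			break;               (e.g. -EINTR from a signal)
 *		vmm_handle_intercept(&msg);
 *	}
 *
 * Each successful return hands user space exactly one intercept message,
 * copied from the VP's intercept message page.
 */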

static int
mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp,
				struct hv_vp_state_data state_data,
				unsigned long user_pfn, size_t page_count,
				bool is_set)
{
	int completed, ret = 0;
	unsigned long check;
	struct page **pages;

	if (page_count > INT_MAX)
		return -EINVAL;
	/*
	 * Check the arithmetic for wraparound/overflow.
	 * The last page address in the buffer is:
	 * (user_pfn + (page_count - 1)) * PAGE_SIZE
	 */
	if (check_add_overflow(user_pfn, (page_count - 1), &check))
		return -EOVERFLOW;
	if (check_mul_overflow(check, PAGE_SIZE, &check))
		return -EOVERFLOW;

	/* Pin user pages so hypervisor can copy directly to them */
	pages = kzalloc_objs(struct page *, page_count);
	if (!pages)
		return -ENOMEM;

	for (completed = 0; completed < page_count; completed += ret) {
		unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE;
		int remaining = page_count - completed;

		ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE,
					  &pages[completed]);
		if (ret < 0) {
			vp_err(vp, "%s: Failed to pin user pages error %i\n",
			       __func__, ret);
			goto unpin_pages;
		}
	}

	if (is_set)
		ret = hv_call_set_vp_state(vp->vp_index,
					   vp->vp_partition->pt_id,
					   state_data, page_count, pages,
					   0, NULL);
	else
		ret = hv_call_get_vp_state(vp->vp_index,
					   vp->vp_partition->pt_id,
					   state_data, page_count, pages,
					   NULL);

unpin_pages:
	unpin_user_pages(pages, completed);
	kfree(pages);
	return ret;
}

static long
mshv_vp_ioctl_get_set_state(struct mshv_vp *vp,
			    struct mshv_get_set_vp_state __user *user_args,
			    bool is_set)
{
	struct mshv_get_set_vp_state args;
	long ret = 0;
	union hv_output_get_vp_state vp_state;
	u32 data_sz;
	struct hv_vp_state_data state_data = {};

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) ||
	    !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) ||
	    !PAGE_ALIGNED(args.buf_ptr))
		return -EINVAL;

	if (!access_ok((void __user *)args.buf_ptr, args.buf_sz))
		return -EFAULT;

	switch (args.type) {
	case MSHV_VP_STATE_LAPIC:
		state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_XSAVE:
	{
		u64 data_sz_64;

		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_XSAVE_STATES,
						     &state_data.xsave.states.as_uint64);
		if (ret)
			return ret;

		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE,
						     &data_sz_64);
		if (ret)
			return ret;

		data_sz = (u32)data_sz_64;
		state_data.xsave.flags = 0;
		/* Always request legacy states */
		state_data.xsave.states.legacy_x87 = 1;
		state_data.xsave.states.legacy_sse = 1;
		state_data.type = HV_GET_SET_VP_STATE_XSAVE;
		break;
	}
	case MSHV_VP_STATE_SIMP:
		state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SIEFP:
		state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SYNTHETIC_TIMERS:
		state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS;
		data_sz = sizeof(vp_state.synthetic_timers_state);
		break;
	default:
		return -EINVAL;
	}

	if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz)))
		return -EFAULT;

	if (data_sz > args.buf_sz)
		return -EINVAL;

	/* If the data is transmitted via pfns, delegate to helper */
	if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) {
		unsigned long user_pfn = PFN_DOWN(args.buf_ptr);
		size_t page_count = PFN_DOWN(args.buf_sz);

		return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn,
						       page_count, is_set);
	}

	/* Paranoia check - this shouldn't happen! */
	if (data_sz > sizeof(vp_state)) {
		vp_err(vp, "Invalid vp state data size!\n");
		return -EINVAL;
	}

	if (is_set) {
		if (copy_from_user(&vp_state, (void __user *)args.buf_ptr, data_sz))
			return -EFAULT;

		return hv_call_set_vp_state(vp->vp_index,
					    vp->vp_partition->pt_id,
					    state_data, 0, NULL,
					    sizeof(vp_state), (u8 *)&vp_state);
	}

	ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id,
				   state_data, 0, NULL, &vp_state);
	if (ret)
		return ret;

	if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz))
		return -EFAULT;

	return 0;
}

static long
mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct mshv_vp *vp = filp->private_data;
	long r = -ENOTTY;

	if (mutex_lock_killable(&vp->vp_mutex))
		return -EINTR;

	switch (ioctl) {
	case MSHV_RUN_VP:
		r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg);
		break;
	case MSHV_GET_VP_STATE:
		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false);
		break;
	case MSHV_SET_VP_STATE:
		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true);
		break;
	case MSHV_ROOT_HVCALL:
		r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false,
					       (void __user *)arg);
		break;
	default:
		vp_warn(vp, "Invalid ioctl: %#x\n", ioctl);
		break;
	}
	mutex_unlock(&vp->vp_mutex);

	return r;
}

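/*
 * Back the VP's mmap()able state pages (register page, intercept message
 * page, GHCB page) with the kernel pages that were mapped from the
 * hypervisor when the VP was created.
 */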
static vm_fault_t mshv_vp_fault(struct vm_fault *vmf)
{
	struct mshv_vp *vp = vmf->vma->vm_file->private_data;

	switch (vmf->vma->vm_pgoff) {
	case MSHV_VP_MMAP_OFFSET_REGISTERS:
		vmf->page = virt_to_page(vp->vp_register_page);
		break;
	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
		vmf->page = virt_to_page(vp->vp_intercept_msg_page);
		break;
	case MSHV_VP_MMAP_OFFSET_GHCB:
		vmf->page = virt_to_page(vp->vp_ghcb_page);
		break;
	default:
		return VM_FAULT_SIGBUS;
	}

	get_page(vmf->page);

	return 0;
}

static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct mshv_vp *vp = file->private_data;

	switch (vma->vm_pgoff) {
	case MSHV_VP_MMAP_OFFSET_REGISTERS:
		if (!vp->vp_register_page)
			return -ENODEV;
		break;
	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
		if (!vp->vp_intercept_msg_page)
			return -ENODEV;
		break;
	case MSHV_VP_MMAP_OFFSET_GHCB:
		if (!vp->vp_ghcb_page)
			return -ENODEV;
		break;
	default:
		return -EINVAL;
	}

	vma->vm_ops = &mshv_vp_vm_ops;
	return 0;
}

static int
mshv_vp_release(struct inode *inode, struct file *filp)
{
	struct mshv_vp *vp = filp->private_data;

	trace_mshv_vp_release(vp->vp_partition->pt_id, vp->vp_index);

	/* Rest of VP cleanup happens in destroy_partition() */
	mshv_partition_put(vp->vp_partition);
	return 0;
}

void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
			 struct hv_stats_page *stats_pages[])
{
	union hv_stats_object_identity identity = {
		.vp.partition_id = partition_id,
		.vp.vp_index = vp_index,
	};
	int err;

	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
				  stats_pages[HV_STATS_AREA_SELF],
				  &identity);
	if (err)
		pr_err("%s: failed to unmap partition %llu vp %u self stats, err: %d\n",
		       __func__, partition_id, vp_index, err);

	if (stats_pages[HV_STATS_AREA_PARENT] != stats_pages[HV_STATS_AREA_SELF]) {
		identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
		err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
					  stats_pages[HV_STATS_AREA_PARENT],
					  &identity);
		if (err)
			pr_err("%s: failed to unmap partition %llu vp %u parent stats, err: %d\n",
			       __func__, partition_id, vp_index, err);
	}
}

int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
		      struct hv_stats_page *stats_pages[])
{
	union hv_stats_object_identity identity = {
		.vp.partition_id = partition_id,
		.vp.vp_index = vp_index,
	};
	int err;

	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
				&stats_pages[HV_STATS_AREA_SELF]);
	if (err) {
		pr_err("%s: failed to map partition %llu vp %u self stats, err: %d\n",
		       __func__, partition_id, vp_index, err);
		return err;
	}

	/*
	 * An L1VH partition cannot access its VP stats in the parent area,
	 * so fall back to the self area mapping in that case.
	 */
	if (is_l1vh_parent(partition_id)) {
		stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
	} else {
		identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
		err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
					&stats_pages[HV_STATS_AREA_PARENT]);
		if (err) {
			pr_err("%s: failed to map partition %llu vp %u parent stats, err: %d\n",
			       __func__, partition_id, vp_index, err);
			goto unmap_self;
		}
		if (!stats_pages[HV_STATS_AREA_PARENT])
			stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
	}

	return 0;

unmap_self:
	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	hv_unmap_stats_page(HV_STATS_OBJECT_VP,
			    stats_pages[HV_STATS_AREA_SELF],
			    &identity);
	return err;
}

static long
mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
			       void __user *arg)
{
	struct mshv_create_vp args;
	struct mshv_vp *vp;
	struct page *intercept_msg_page, *register_page, *ghcb_page;
	struct hv_stats_page *stats_pages[2];
	long ret;

	if (copy_from_user(&args, arg, sizeof(args)))
		return -EFAULT;

	if (args.vp_index >= MSHV_MAX_VPS)
		return -EINVAL;

	if (partition->pt_vp_array[args.vp_index])
		return -EEXIST;

	ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index,
				0 /* Only valid for root partition VPs */);
	if (ret)
		return ret;

	ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
				   HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
				   input_vtl_zero, &intercept_msg_page);
	if (ret)
		goto destroy_vp;

	if (!mshv_partition_encrypted(partition)) {
		ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
					   HV_VP_STATE_PAGE_REGISTERS,
					   input_vtl_zero, &register_page);
		if (ret)
			goto unmap_intercept_message_page;
	}

	if (mshv_partition_encrypted(partition) &&
	    is_ghcb_mapping_available()) {
		ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
					   HV_VP_STATE_PAGE_GHCB,
					   input_vtl_normal, &ghcb_page);
		if (ret)
			goto unmap_register_page;
	}

	ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
				stats_pages);
	if (ret)
		goto unmap_ghcb_page;

	vp = kzalloc_obj(*vp);
	if (!vp) {
		ret = -ENOMEM;
		goto unmap_stats_pages;
	}

	vp->vp_partition = mshv_partition_get(partition);
	if (!vp->vp_partition) {
		ret = -EBADF;
		goto free_vp;
	}

	mutex_init(&vp->vp_mutex);
	init_waitqueue_head(&vp->run.vp_suspend_queue);
	atomic64_set(&vp->run.vp_signaled_count, 0);

	vp->vp_index = args.vp_index;
	vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page);
	if (!mshv_partition_encrypted(partition))
		vp->vp_register_page = page_to_virt(register_page);

	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
		vp->vp_ghcb_page = page_to_virt(ghcb_page);

	memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));

	ret = mshv_debugfs_vp_create(vp);
	if (ret)
		goto put_partition;

	/*
	 * Keep anon_inode_getfd last: it installs fd in the file struct and
	 * thus makes the state accessible in user space.
	 */
	ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp,
			       O_RDWR | O_CLOEXEC);
	if (ret < 0)
		goto remove_debugfs_vp;

	/* already exclusive with the partition mutex for all ioctls */
	partition->pt_vp_count++;
	partition->pt_vp_array[args.vp_index] = vp;

	goto out;

remove_debugfs_vp:
	mshv_debugfs_vp_remove(vp);
put_partition:
	mshv_partition_put(partition);
free_vp:
	kfree(vp);
unmap_stats_pages:
	mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages);
unmap_ghcb_page:
	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
		hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
				       HV_VP_STATE_PAGE_GHCB, ghcb_page,
				       input_vtl_normal);
unmap_register_page:
	if (!mshv_partition_encrypted(partition))
		hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
				       HV_VP_STATE_PAGE_REGISTERS,
				       register_page, input_vtl_zero);
unmap_intercept_message_page:
	hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
			       HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
			       intercept_msg_page, input_vtl_zero);
destroy_vp:
	hv_call_delete_vp(partition->pt_id, args.vp_index);
out:
	trace_mshv_create_vp(partition->pt_id, args.vp_index, ret);
	return ret;
}

static int mshv_init_async_handler(struct mshv_partition *partition)
{
	if (completion_done(&partition->async_hypercall)) {
		pt_err(partition,
		       "Cannot issue async hypercall while another one is in progress!\n");
		return -EPERM;
	}

	reinit_completion(&partition->async_hypercall);
	return 0;
}

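/*
 * Block until the pending async hypercall is reported complete (the
 * completion is signaled elsewhere once the hypervisor delivers its
 * async-completion message), then hand the final status to the caller.
 */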
static void mshv_async_hvcall_handler(void *data, u64 *status)
{
	struct mshv_partition *partition = data;

	wait_for_completion(&partition->async_hypercall);
	pt_dbg(partition, "Async hypercall completed!\n");

	*status = partition->async_hypercall_status;
}

/*
 * NB: caller checks and makes sure mem->size is page aligned
 * Returns: 0 with regionpp updated on success, or -errno
 */
static int mshv_partition_create_region(struct mshv_partition *partition,
					struct mshv_user_mem_region *mem,
					struct mshv_mem_region **regionpp,
					bool is_mmio)
{
	struct mshv_mem_region *rg;
	u64 nr_pages = HVPFN_DOWN(mem->size);

	/* Reject overlapping regions */
	spin_lock(&partition->pt_mem_regions_lock);
	hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) {
		if (mem->guest_pfn + nr_pages <= rg->start_gfn ||
		    rg->start_gfn + rg->nr_pages <= mem->guest_pfn)
			continue;
		spin_unlock(&partition->pt_mem_regions_lock);
		return -EEXIST;
	}
	spin_unlock(&partition->pt_mem_regions_lock);

	rg = mshv_region_create(mem->guest_pfn, nr_pages,
				mem->userspace_addr, mem->flags);
	if (IS_ERR(rg))
		return PTR_ERR(rg);

	if (is_mmio)
		rg->mreg_type = MSHV_REGION_TYPE_MMIO;
	else if (mshv_partition_encrypted(partition) ||
		 !mshv_region_movable_init(rg))
		rg->mreg_type = MSHV_REGION_TYPE_MEM_PINNED;
	else
		rg->mreg_type = MSHV_REGION_TYPE_MEM_MOVABLE;

	rg->partition = partition;

	*regionpp = rg;

	return 0;
}

/**
 * mshv_prepare_pinned_region - Pin and map memory regions
 * @region: Pointer to the memory region structure
 *
 * This function processes memory regions that are explicitly marked as pinned.
 * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based
 * population. The function ensures the region is properly populated, handles
 * encryption requirements for SNP partitions if applicable, maps the region,
 * and performs necessary sharing or eviction operations based on the mapping
 * result.
 *
 * Return: 0 on success, negative error code on failure.
 */
static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
{
	struct mshv_partition *partition = region->partition;
	int ret;

	ret = mshv_region_pin(region);
	if (ret) {
		pt_err(partition, "Failed to pin memory region: %d\n",
		       ret);
		goto err_out;
	}

	/*
	 * For an SNP partition it is a requirement that for every memory region
	 * that we are going to map for this partition we should make sure that
	 * host access to that region is released. This is ensured by doing an
	 * additional hypercall which will update the SLAT to release host
	 * access to guest memory regions.
	 */
	if (mshv_partition_encrypted(partition)) {
		ret = mshv_region_unshare(region);
		if (ret) {
			pt_err(partition,
			       "Failed to unshare memory region (guest_pfn: %llu): %d\n",
			       region->start_gfn, ret);
			goto invalidate_region;
		}
	}

	ret = mshv_region_map(region);
	if (ret && mshv_partition_encrypted(partition)) {
		int shrc;

		shrc = mshv_region_share(region);
		if (!shrc)
			goto invalidate_region;

		pt_err(partition,
		       "Failed to share memory region (guest_pfn: %llu): %d\n",
		       region->start_gfn, shrc);
		/*
		 * Don't unpin if marking shared failed because pages are no
		 * longer mapped in the host, ie root, anymore.
		 */
		goto err_out;
	}

	return 0;

invalidate_region:
	mshv_region_invalidate(region);
err_out:
	return ret;
}

/*
 * This maps two things: guest RAM and, for PCI passthrough, MMIO space.
 *
 * mmio:
 * - vfio overloads vm_pgoff to store the mmio start pfn/spa.
 * - Two things need to happen for mapping an mmio range:
 *   1. mapped in the uaddr so the VMM can access it.
 *   2. mapped in the hwpt (gfn <-> mmio phys addr) so the guest can access it.
 *
 * This function takes care of the second. The first one is managed by vfio,
 * and hence is taken care of via vfio_pci_mmap_fault().
 */
static long
mshv_map_user_memory(struct mshv_partition *partition,
		     struct mshv_user_mem_region *mem)
{
	struct mshv_mem_region *region;
	struct vm_area_struct *vma;
	bool is_mmio;
	ulong mmio_pfn;
	long ret;

	if (mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
	    !access_ok((const void __user *)mem->userspace_addr, mem->size))
		return -EINVAL;

	mmap_read_lock(current->mm);
	vma = vma_lookup(current->mm, mem->userspace_addr);
	is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
	mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
	mmap_read_unlock(current->mm);

	if (!vma)
		return -EINVAL;

	ret = mshv_partition_create_region(partition, mem, &region,
					   is_mmio);
	if (ret)
		return ret;

	switch (region->mreg_type) {
	case MSHV_REGION_TYPE_MEM_PINNED:
		ret = mshv_prepare_pinned_region(region);
		break;
	case MSHV_REGION_TYPE_MEM_MOVABLE:
		/*
		 * For movable memory regions, remap with no access to let
		 * the hypervisor track dirty pages, enabling pre-copy live
		 * migration.
		 */
		ret = hv_call_map_gpa_pages(partition->pt_id,
					    region->start_gfn,
					    region->nr_pages,
					    HV_MAP_GPA_NO_ACCESS, NULL);
		break;
	case MSHV_REGION_TYPE_MMIO:
		ret = hv_call_map_mmio_pages(partition->pt_id,
					     region->start_gfn,
					     mmio_pfn,
					     region->nr_pages);
		break;
	}

	trace_mshv_map_user_memory(partition->pt_id, region->start_uaddr,
				   region->start_gfn, region->nr_pages,
				   region->hv_map_flags, ret);

	if (ret)
		goto errout;

	spin_lock(&partition->pt_mem_regions_lock);
	hlist_add_head(&region->hnode, &partition->pt_mem_regions);
	spin_unlock(&partition->pt_mem_regions_lock);

	return 0;

errout:
	mshv_region_put(region);
	return ret;
}

/* Called for unmapping both the guest ram and the mmio space */
static long
mshv_unmap_user_memory(struct mshv_partition *partition,
		       struct mshv_user_mem_region *mem)
{
	struct mshv_mem_region *region;

	if (!(mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
		return -EINVAL;

	spin_lock(&partition->pt_mem_regions_lock);

	region = mshv_partition_region_by_gfn(partition, mem->guest_pfn);
	if (!region) {
		spin_unlock(&partition->pt_mem_regions_lock);
		return -ENOENT;
	}

	/* Paranoia check */
	if (region->start_uaddr != mem->userspace_addr ||
	    region->start_gfn != mem->guest_pfn ||
	    region->nr_pages != HVPFN_DOWN(mem->size)) {
		spin_unlock(&partition->pt_mem_regions_lock);
		return -EINVAL;
	}

	hlist_del(&region->hnode);

	spin_unlock(&partition->pt_mem_regions_lock);

	mshv_region_put(region);

	return 0;
}

static long
mshv_partition_ioctl_set_memory(struct mshv_partition *partition,
				struct mshv_user_mem_region __user *user_mem)
{
	struct mshv_user_mem_region mem;

	if (copy_from_user(&mem, user_mem, sizeof(mem)))
		return -EFAULT;

	if (!mem.size ||
	    !PAGE_ALIGNED(mem.size) ||
	    !PAGE_ALIGNED(mem.userspace_addr) ||
	    (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) ||
	    mshv_field_nonzero(mem, rsvd))
		return -EINVAL;

	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))
		return mshv_unmap_user_memory(partition, &mem);

	return mshv_map_user_memory(partition, &mem);
}
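
/*
 * Illustrative only: mapping guest RAM from user space might look roughly
 * like the sketch below; the size, gfn and flag choice are assumptions for
 * the example, and "backing" is a page-aligned mmap()ed buffer:
 *
 *	struct mshv_user_mem_region mem = {
 *		.size		= 512 << 20,
 *		.guest_pfn	= 0,
 *		.userspace_addr	= (__u64)backing,
 *		.flags		= BIT(MSHV_SET_MEM_BIT_WRITABLE) |
 *				  BIT(MSHV_SET_MEM_BIT_EXECUTABLE),
 *	};
 *	ret = ioctl(partition_fd, MSHV_SET_GUEST_MEMORY, &mem);
 *
 * Passing BIT(MSHV_SET_MEM_BIT_UNMAP) in .flags instead removes a region
 * whose gfn, size and userspace address all match an existing mapping.
 */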

static long
mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition,
			       void __user *user_args)
{
	struct mshv_user_ioeventfd args;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	return mshv_set_unset_ioeventfd(partition, &args);
}

static long
mshv_partition_ioctl_irqfd(struct mshv_partition *partition,
			   void __user *user_args)
{
	struct mshv_user_irqfd args;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	return mshv_set_unset_irqfd(partition, &args);
}

static long
mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition,
					    void __user *user_args)
{
	struct mshv_gpap_access_bitmap args;
	union hv_gpa_page_access_state *states;
	long ret, i;
	union hv_gpa_page_access_state_flags hv_flags = {};
	u8 hv_type_mask;
	ulong bitmap_buf_sz, states_buf_sz;
	int written = 0;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT ||
	    args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT ||
	    mshv_field_nonzero(args, rsvd) || !args.page_count ||
	    !args.bitmap_ptr)
		return -EINVAL;

	if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz))
		return -E2BIG;

	/* Num bytes needed to store bitmap; one bit per page rounded up */
	bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8);

	/* Sanity check */
	if (bitmap_buf_sz > states_buf_sz)
		return -EBADFD;

	switch (args.access_type) {
	case MSHV_GPAP_ACCESS_TYPE_ACCESSED:
		hv_type_mask = 1;
		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
			hv_flags.clear_accessed = 1;
			/* not accessed implies not dirty */
			hv_flags.clear_dirty = 1;
		} else { /* MSHV_GPAP_ACCESS_OP_SET */
			hv_flags.set_accessed = 1;
		}
		break;
	case MSHV_GPAP_ACCESS_TYPE_DIRTY:
		hv_type_mask = 2;
		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
			hv_flags.clear_dirty = 1;
		} else { /* MSHV_GPAP_ACCESS_OP_SET */
			hv_flags.set_dirty = 1;
			/* dirty implies accessed */
			hv_flags.set_accessed = 1;
		}
		break;
	}

	states = vzalloc(states_buf_sz);
	if (!states)
		return -ENOMEM;

	ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count,
					    args.gpap_base, hv_flags, &written,
					    states);
	if (ret)
		goto free_return;

	/*
	 * Overwrite states buffer with bitmap - the bits in hv_type_mask
	 * correspond to bitfields in hv_gpa_page_access_state
	 */
	for (i = 0; i < written; ++i)
		__assign_bit(i, (ulong *)states,
			     states[i].as_uint8 & hv_type_mask);

	/* zero the unused bits in the last byte(s) of the returned bitmap */
	for (i = written; i < bitmap_buf_sz * 8; ++i)
		__clear_bit(i, (ulong *)states);

	if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz))
		ret = -EFAULT;

free_return:
	vfree(states);
	return ret;
}
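
/*
 * Illustrative only: a pre-copy live migration pass could harvest the dirty
 * bitmap for a region roughly as follows (bitmap sizing and error handling
 * elided; the variable names are placeholders):
 *
 *	struct mshv_gpap_access_bitmap db = {
 *		.access_type	= MSHV_GPAP_ACCESS_TYPE_DIRTY,
 *		.access_op	= MSHV_GPAP_ACCESS_OP_CLEAR,
 *		.gpap_base	= region_base_gfn,
 *		.page_count	= region_nr_pages,
 *		.bitmap_ptr	= (__u64)bitmap,
 *	};
 *	ret = ioctl(partition_fd, MSHV_GET_GPAP_ACCESS_BITMAP, &db);
 *
 * Each set bit then marks a guest page written since the previous clearing
 * pass.
 */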

static long
mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition,
				     void __user *user_args)
{
	struct mshv_user_irq_entry *entries = NULL;
	struct mshv_user_irq_table args;
	long ret;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.nr > MSHV_MAX_GUEST_IRQS ||
	    mshv_field_nonzero(args, rsvd))
		return -EINVAL;

	if (args.nr) {
		struct mshv_user_irq_table __user *urouting = user_args;

		entries = vmemdup_user(urouting->entries,
				       array_size(sizeof(*entries),
						  args.nr));
		if (IS_ERR(entries))
			return PTR_ERR(entries);
	}
	ret = mshv_update_routing_table(partition, entries, args.nr);
	kvfree(entries);

	return ret;
}

static long
mshv_partition_ioctl_initialize(struct mshv_partition *partition)
{
	long ret;

	if (partition->pt_initialized)
		return 0;

	ret = hv_call_initialize_partition(partition->pt_id);
	if (ret)
		goto withdraw_mem;

	ret = mshv_debugfs_partition_create(partition);
	if (ret)
		goto finalize_partition;

	partition->pt_initialized = true;

	return 0;

finalize_partition:
	hv_call_finalize_partition(partition->pt_id);
withdraw_mem:
	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);

	return ret;
}

static long
mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct mshv_partition *partition = filp->private_data;
	long ret;
	void __user *uarg = (void __user *)arg;

	if (mutex_lock_killable(&partition->pt_mutex))
		return -EINTR;

	switch (ioctl) {
	case MSHV_INITIALIZE_PARTITION:
		ret = mshv_partition_ioctl_initialize(partition);
		break;
	case MSHV_SET_GUEST_MEMORY:
		ret = mshv_partition_ioctl_set_memory(partition, uarg);
		break;
	case MSHV_CREATE_VP:
		ret = mshv_partition_ioctl_create_vp(partition, uarg);
		break;
	case MSHV_IRQFD:
		ret = mshv_partition_ioctl_irqfd(partition, uarg);
		break;
	case MSHV_IOEVENTFD:
		ret = mshv_partition_ioctl_ioeventfd(partition, uarg);
		break;
	case MSHV_SET_MSI_ROUTING:
		ret = mshv_partition_ioctl_set_msi_routing(partition, uarg);
		break;
	case MSHV_GET_GPAP_ACCESS_BITMAP:
		ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition,
								  uarg);
		break;
	case MSHV_ROOT_HVCALL:
		ret = mshv_ioctl_passthru_hvcall(partition, true, uarg);
		break;
	default:
		ret = -ENOTTY;
	}

	mutex_unlock(&partition->pt_mutex);
	return ret;
}

static int
disable_vp_dispatch(struct mshv_vp *vp)
{
	int ret;
	struct hv_register_assoc dispatch_suspend = {
		.name = HV_REGISTER_DISPATCH_SUSPEND,
		.value.dispatch_suspend.suspended = 1,
	};

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &dispatch_suspend);
	if (ret)
		vp_err(vp, "failed to suspend\n");

	trace_mshv_disable_vp_dispatch(vp->vp_partition->pt_id,
				       vp->vp_index, ret);

	return ret;
}

static int
get_vp_signaled_count(struct mshv_vp *vp, u64 *count)
{
	int ret;
	struct hv_register_assoc root_signal_count = {
		.name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT,
	};

	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &root_signal_count);

	if (ret) {
		vp_err(vp, "Failed to get root signal count\n");
		*count = 0;
		return ret;
	}

	*count = root_signal_count.value.reg64;

	return ret;
}

static void
drain_vp_signals(struct mshv_vp *vp)
{
	u64 hv_signal_count;
	u64 vp_signal_count;

	get_vp_signaled_count(vp, &hv_signal_count);

	vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);

	/*
	 * There should be at most 1 outstanding notification, but be extra
	 * careful anyway.
	 */
	while (hv_signal_count != vp_signal_count) {
		WARN_ON(hv_signal_count - vp_signal_count != 1);

		if (wait_event_interruptible(vp->run.vp_suspend_queue,
					     vp->run.kicked_by_hv == 1))
			break;
		vp->run.kicked_by_hv = 0;
		vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
	}

	trace_mshv_drain_vp_signals(vp->vp_partition->pt_id, vp->vp_index);
}

static void drain_all_vps(const struct mshv_partition *partition)
{
	int i;
	struct mshv_vp *vp;

	/*
	 * VPs are reachable from the ISR. It is safe not to take the
	 * partition lock because nobody else can enter this function and
	 * drop the partition from the list.
	 */
	for (i = 0; i < MSHV_MAX_VPS; i++) {
		vp = partition->pt_vp_array[i];
		if (!vp)
			continue;
		/*
		 * Disable dispatching of the VP in the hypervisor. After this
		 * the hypervisor guarantees it won't generate any signals for
		 * the VP and the hypervisor's VP signal count won't change.
		 */
		disable_vp_dispatch(vp);
		drain_vp_signals(vp);
	}
}

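/*
 * Unlink the partition from the hash table and wait for all RCU readers
 * (e.g. mshv_partition_find()) to finish before the caller tears it down.
 */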
1765 static void
remove_partition(struct mshv_partition * partition)1766 remove_partition(struct mshv_partition *partition)
1767 {
1768 spin_lock(&mshv_root.pt_ht_lock);
1769 hlist_del_rcu(&partition->pt_hnode);
1770 spin_unlock(&mshv_root.pt_ht_lock);
1771
1772 synchronize_rcu();
1773 }

/*
 * Tear down a partition and remove it from the list.
 * Partition's refcount must be 0
 */
static void destroy_partition(struct mshv_partition *partition)
{
	struct mshv_vp *vp;
	struct mshv_mem_region *region;
	struct hlist_node *n;
	int i;

	if (refcount_read(&partition->pt_ref_count)) {
		pt_err(partition,
		       "Attempt to destroy partition but refcount > 0\n");
		return;
	}

	trace_mshv_destroy_partition(partition->pt_id);

	if (partition->pt_initialized) {
		/*
		 * We only need to drain signals for the root scheduler. This
		 * should be done before removing the partition from the
		 * partition list.
		 */
		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
			drain_all_vps(partition);

		/* Remove VPs */
		for (i = 0; i < MSHV_MAX_VPS; ++i) {
			vp = partition->pt_vp_array[i];
			if (!vp)
				continue;

			mshv_debugfs_vp_remove(vp);
			mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
					    vp->vp_stats_pages);

			if (vp->vp_register_page) {
				(void)hv_unmap_vp_state_page(partition->pt_id,
							     vp->vp_index,
							     HV_VP_STATE_PAGE_REGISTERS,
							     virt_to_page(vp->vp_register_page),
							     input_vtl_zero);
				vp->vp_register_page = NULL;
			}

			(void)hv_unmap_vp_state_page(partition->pt_id,
						     vp->vp_index,
						     HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
						     virt_to_page(vp->vp_intercept_msg_page),
						     input_vtl_zero);
			vp->vp_intercept_msg_page = NULL;

			if (vp->vp_ghcb_page) {
				(void)hv_unmap_vp_state_page(partition->pt_id,
							     vp->vp_index,
							     HV_VP_STATE_PAGE_GHCB,
							     virt_to_page(vp->vp_ghcb_page),
							     input_vtl_normal);
				vp->vp_ghcb_page = NULL;
			}

			kfree(vp);

			partition->pt_vp_array[i] = NULL;
		}

		mshv_debugfs_partition_remove(partition);

		/* Deallocates and unmaps everything including vcpus, GPA mappings etc */
		hv_call_finalize_partition(partition->pt_id);

		partition->pt_initialized = false;
	}

	remove_partition(partition);

	hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
				  hnode) {
		hlist_del(&region->hnode);
		mshv_region_put(region);
	}

	/* Withdraw and free all pages we deposited */
	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
	hv_call_delete_partition(partition->pt_id);

	mshv_free_routing_table(partition);
	kfree(partition);
}

struct
mshv_partition *mshv_partition_get(struct mshv_partition *partition)
{
	if (refcount_inc_not_zero(&partition->pt_ref_count))
		return partition;
	return NULL;
}

struct
mshv_partition *mshv_partition_find(u64 partition_id)
	__must_hold(RCU)
{
	struct mshv_partition *p;

	hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode,
				   partition_id)
		if (p->pt_id == partition_id)
			return p;

	return NULL;
}

void
mshv_partition_put(struct mshv_partition *partition)
{
	if (refcount_dec_and_test(&partition->pt_ref_count))
		destroy_partition(partition);
}
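
/*
 * Typical lookup pattern, shown here as an illustrative sketch only (the
 * real call sites live elsewhere in the module): mshv_partition_find()
 * must run under RCU, and the result is only safe to use outside the RCU
 * read-side section once mshv_partition_get() has taken a reference:
 *
 *	struct mshv_partition *p;
 *
 *	rcu_read_lock();
 *	p = mshv_partition_find(id);
 *	if (p)
 *		p = mshv_partition_get(p);	// NULL if refcount hit 0
 *	rcu_read_unlock();
 *
 *	if (p) {
 *		// ... use p ...
 *		mshv_partition_put(p);
 *	}
 *
 * This pairs with remove_partition(): after hlist_del_rcu() plus
 * synchronize_rcu(), no reader can still find the partition, so the final
 * mshv_partition_put() may safely free it via destroy_partition().
 */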

static int
mshv_partition_release(struct inode *inode, struct file *filp)
{
	struct mshv_partition *partition = filp->private_data;

	trace_mshv_partition_release(partition->pt_id);

	mshv_eventfd_release(partition);

	cleanup_srcu_struct(&partition->pt_irq_srcu);

	mshv_partition_put(partition);

	return 0;
}

static int
add_partition(struct mshv_partition *partition)
{
	spin_lock(&mshv_root.pt_ht_lock);

	hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode,
		     partition->pt_id);

	spin_unlock(&mshv_root.pt_ht_lock);

	return 0;
}

static_assert(MSHV_NUM_CPU_FEATURES_BANKS ==
	      HV_PARTITION_PROCESSOR_FEATURES_BANKS);

static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags,
					struct hv_partition_creation_properties *cr_props,
					union hv_partition_isolation_properties *isol_props)
{
	int i;
	struct mshv_create_partition_v2 args;
	union hv_partition_processor_features *disabled_procs;
	union hv_partition_processor_xsave_features *disabled_xsave;

	/* First, copy the v1 struct in case the user is on a previous version */
	if (copy_from_user(&args, user_arg,
			   sizeof(struct mshv_create_partition)))
		return -EFAULT;

	if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
	    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
		return -EINVAL;

	disabled_procs = &cr_props->disabled_processor_features;
	disabled_xsave = &cr_props->disabled_processor_xsave_features;

	/* Check if the user provided the newer struct with feature fields */
	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) {
		if (copy_from_user(&args, user_arg, sizeof(args)))
			return -EFAULT;

		/* Re-validate v1 fields after the second copy_from_user() */
		if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
		    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
			return -EINVAL;

		if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS ||
		    mshv_field_nonzero(args, pt_rsvd) ||
		    mshv_field_nonzero(args, pt_rsvd1))
			return -EINVAL;

		/*
		 * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never
		 * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS
		 * (i.e. 2).
		 *
		 * Further banks (index >= 2) will be modifiable as 'early'
		 * properties via the set partition property hypercall.
		 */
		for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
			disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i];

#if IS_ENABLED(CONFIG_X86_64)
		disabled_xsave->as_uint64 = args.pt_disabled_xsave;
#else
		/*
		 * In practice this field is ignored on arm64, but it is
		 * safer to zero it in case it is ever used.
		 */
		disabled_xsave->as_uint64 = 0;

		if (mshv_field_nonzero(args, pt_rsvd2))
			return -EINVAL;
#endif
	} else {
		/*
		 * v1 behavior: try to enable everything. The hypervisor will
		 * disable features that are not supported. The banks can be
		 * queried via the get partition property hypercall.
		 */
		for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
			disabled_procs->as_uint64[i] = 0;

		disabled_xsave->as_uint64 = 0;
	}

	/* Only support EXO partitions */
	*pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
		    HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;

	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC))
		*pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC))
		*pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES))
		*pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_NESTED_VIRTUALIZATION))
		*pt_flags |= HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE;
	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_SMT_ENABLED_GUEST))
		*pt_flags |= HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST;

	isol_props->as_uint64 = 0;

	switch (args.pt_isolation) {
	case MSHV_PT_ISOLATION_NONE:
		isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE;
		break;
	}

	return 0;
}
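
/*
 * Userspace view of the flags processing above, as an illustrative sketch
 * only (struct layout and ioctl number come from the mshv uapi header; a
 * real VMM would wrap this):
 *
 *	struct mshv_create_partition_v2 args = { 0 };
 *	int mshv_fd, pt_fd;
 *
 *	args.pt_flags = (1ULL << MSHV_PT_BIT_LAPIC) |
 *			(1ULL << MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES);
 *	args.pt_isolation = MSHV_PT_ISOLATION_NONE;
 *	args.pt_num_cpu_fbanks = MSHV_NUM_CPU_FEATURES_BANKS;
 *	// pt_cpu_fbanks[] left zeroed: no processor features disabled
 *
 *	mshv_fd = open("/dev/mshv", O_RDWR | O_CLOEXEC);
 *	pt_fd = ioctl(mshv_fd, MSHV_CREATE_PARTITION, &args);
 *
 * On success the ioctl returns a new file descriptor for the partition.
 * A v1 caller simply clears MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES and passes
 * the smaller struct mshv_create_partition instead.
 */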

static long
mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
{
	u64 creation_flags;
	struct hv_partition_creation_properties creation_properties;
	union hv_partition_isolation_properties isolation_properties;
	struct mshv_partition *partition;
	u64 pt_id = -1;
	long ret;

	ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags,
					  &creation_properties,
					  &isolation_properties);
	if (ret)
		return ret;

	partition = kzalloc_obj(*partition);
	if (!partition)
		return -ENOMEM;

	partition->pt_module_dev = module_dev;
	partition->isolation_type = isolation_properties.isolation_type;

	refcount_set(&partition->pt_ref_count, 1);

	mutex_init(&partition->pt_mutex);

	mutex_init(&partition->pt_irq_lock);

	init_completion(&partition->async_hypercall);

	INIT_HLIST_HEAD(&partition->irq_ack_notifier_list);

	INIT_HLIST_HEAD(&partition->pt_devices);

	spin_lock_init(&partition->pt_mem_regions_lock);
	INIT_HLIST_HEAD(&partition->pt_mem_regions);

	mshv_eventfd_init(partition);

	ret = init_srcu_struct(&partition->pt_irq_srcu);
	if (ret)
		goto free_partition;

	ret = hv_call_create_partition(creation_flags,
				       creation_properties,
				       isolation_properties,
				       &pt_id);
	if (ret)
		goto cleanup_irq_srcu;

	partition->pt_id = pt_id;

	ret = add_partition(partition);
	if (ret)
		goto delete_partition;

	ret = mshv_init_async_handler(partition);
	if (ret)
		goto remove_partition;

	ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition",
						   &mshv_partition_fops,
						   partition, O_RDWR));
	if (ret < 0)
		goto remove_partition;

	goto out;

remove_partition:
	remove_partition(partition);
delete_partition:
	hv_call_delete_partition(partition->pt_id);
cleanup_irq_srcu:
	cleanup_srcu_struct(&partition->pt_irq_srcu);
free_partition:
	kfree(partition);
out:
	trace_mshv_create_partition(pt_id, ret);
	return ret;
}

static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
			   unsigned long arg)
{
	struct miscdevice *misc = filp->private_data;

	switch (ioctl) {
	case MSHV_CREATE_PARTITION:
		return mshv_ioctl_create_partition((void __user *)arg,
						   misc->this_device);
	case MSHV_ROOT_HVCALL:
		return mshv_ioctl_passthru_hvcall(NULL, false,
						  (void __user *)arg);
	}

	return -ENOTTY;
}

static int
mshv_dev_open(struct inode *inode, struct file *filp)
{
	return 0;
}

static int
mshv_dev_release(struct inode *inode, struct file *filp)
{
	return 0;
}

static int mshv_root_sched_online;

static const char *scheduler_type_to_string(enum hv_scheduler_type type)
{
	switch (type) {
	case HV_SCHEDULER_TYPE_LP:
		return "classic scheduler without SMT";
	case HV_SCHEDULER_TYPE_LP_SMT:
		return "classic scheduler with SMT";
	case HV_SCHEDULER_TYPE_CORE_SMT:
		return "core scheduler";
	case HV_SCHEDULER_TYPE_ROOT:
		return "root scheduler";
	default:
		return "unknown scheduler";
	}
}

static int __init l1vh_retrieve_scheduler_type(enum hv_scheduler_type *out)
{
	u64 integrated_sched_enabled;
	int ret;

	*out = HV_SCHEDULER_TYPE_CORE_SMT;

	if (!mshv_root.vmm_caps.vmm_enable_integrated_scheduler)
		return 0;

	ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
						HV_PARTITION_PROPERTY_INTEGRATED_SCHEDULER_ENABLED,
						0, &integrated_sched_enabled,
						sizeof(integrated_sched_enabled));
	if (ret)
		return ret;

	if (integrated_sched_enabled)
		*out = HV_SCHEDULER_TYPE_ROOT;

	return 0;
}

/* TODO move this to hv_common.c when needed outside */
static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out)
{
	struct hv_input_get_system_property *input;
	struct hv_output_get_system_property *output;
	unsigned long flags;
	u64 status;

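	/*
	 * The per-cpu hypercall pages behind hyperv_pcpu_input_arg and
	 * hyperv_pcpu_output_arg are shared by every hypercall site on this
	 * CPU, so interrupts must stay disabled from the moment the pointers
	 * are fetched until the output has been consumed.
	 */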
	local_irq_save(flags);
	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	output = *this_cpu_ptr(hyperv_pcpu_output_arg);

	memset(input, 0, sizeof(*input));
	memset(output, 0, sizeof(*output));
	input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE;

	status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
	if (!hv_result_success(status)) {
		local_irq_restore(flags);
		pr_err("%s: %s\n", __func__, hv_result_to_string(status));
		return hv_result_to_errno(status);
	}

	*out = output->scheduler_type;
	local_irq_restore(flags);

	return 0;
}

/* Retrieve and stash the supported scheduler type */
static int __init mshv_retrieve_scheduler_type(struct device *dev)
{
	int ret;

	if (hv_l1vh_partition())
		ret = l1vh_retrieve_scheduler_type(&hv_scheduler_type);
	else
		ret = hv_retrieve_scheduler_type(&hv_scheduler_type);
	if (ret)
		return ret;

	dev_info(dev, "Hypervisor using %s\n",
		 scheduler_type_to_string(hv_scheduler_type));

	switch (hv_scheduler_type) {
	case HV_SCHEDULER_TYPE_CORE_SMT:
	case HV_SCHEDULER_TYPE_LP_SMT:
	case HV_SCHEDULER_TYPE_ROOT:
	case HV_SCHEDULER_TYPE_LP:
		/* Supported scheduler, nothing to do */
		break;
	default:
		dev_err(dev, "unsupported scheduler 0x%x, bailing.\n",
			hv_scheduler_type);
		return -EOPNOTSUPP;
	}

	return 0;
}

static int mshv_root_scheduler_init(unsigned int cpu)
{
	void **inputarg, **outputarg, *p;

	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
	outputarg = (void **)this_cpu_ptr(root_scheduler_output);

	/* Allocate two consecutive pages. One for input, one for output. */
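	/*
	 * A power-of-two kmalloc() allocation is naturally aligned to its
	 * size, so both halves below are assumed to be page-aligned, as
	 * hypercall arguments must be.
	 */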
	p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	*inputarg = p;
	*outputarg = (char *)p + HV_HYP_PAGE_SIZE;

	return 0;
}

static int mshv_root_scheduler_cleanup(unsigned int cpu)
{
	void *p, **inputarg, **outputarg;

	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
	outputarg = (void **)this_cpu_ptr(root_scheduler_output);

	p = *inputarg;

	*inputarg = NULL;
	*outputarg = NULL;

	kfree(p);

	return 0;
}

/* Must be called after retrieving the scheduler type */
static int
root_scheduler_init(struct device *dev)
{
	int ret;

	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
		return 0;

	root_scheduler_input = alloc_percpu(void *);
	root_scheduler_output = alloc_percpu(void *);

	if (!root_scheduler_input || !root_scheduler_output) {
		dev_err(dev, "Failed to allocate root scheduler buffers\n");
		ret = -ENOMEM;
		goto out;
	}

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched",
				mshv_root_scheduler_init,
				mshv_root_scheduler_cleanup);
	if (ret < 0) {
		dev_err(dev, "Failed to setup root scheduler state: %i\n", ret);
		goto out;
	}

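	/*
	 * For CPUHP_AP_ONLINE_DYN a non-negative return value is the
	 * dynamically allocated hotplug state number; stash it so that
	 * root_scheduler_deinit() can hand it back to cpuhp_remove_state().
	 */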
	mshv_root_sched_online = ret;

	return 0;

out:
	free_percpu(root_scheduler_input);
	free_percpu(root_scheduler_output);
	return ret;
}

static void
root_scheduler_deinit(void)
{
	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
		return;

	cpuhp_remove_state(mshv_root_sched_online);
	free_percpu(root_scheduler_input);
	free_percpu(root_scheduler_output);
}

static int __init mshv_init_vmm_caps(struct device *dev)
{
	int ret;

	ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
						HV_PARTITION_PROPERTY_VMM_CAPABILITIES,
						0, &mshv_root.vmm_caps,
						sizeof(mshv_root.vmm_caps));
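	/*
	 * VMM capabilities are treated as a hard requirement only on L1VH
	 * partitions; on the root partition a failure here is tolerated and
	 * vmm_caps is left as initialized (zero).
	 */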
	if (ret && hv_l1vh_partition()) {
		dev_err(dev, "Failed to get VMM capabilities: %d\n", ret);
		return ret;
	}

	dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]);

	return 0;
}

static int __init mshv_parent_partition_init(void)
{
	int ret;
	struct device *dev;
	union hv_hypervisor_version_info version_info;

	if (!hv_parent_partition() || is_kdump_kernel())
		return -ENODEV;

	if (hv_get_hypervisor_version(&version_info))
		return -ENODEV;

	ret = misc_register(&mshv_dev);
	if (ret)
		return ret;

	dev = mshv_dev.this_device;

	if (version_info.build_number < MSHV_HV_MIN_VERSION ||
	    version_info.build_number > MSHV_HV_MAX_VERSION) {
		dev_err(dev, "Running on unvalidated Hyper-V version\n");
		dev_err(dev, "Versions: current: %u min: %u max: %u\n",
			version_info.build_number, MSHV_HV_MIN_VERSION,
			MSHV_HV_MAX_VERSION);
	}

	ret = mshv_synic_init(dev);
	if (ret)
		goto device_deregister;

	ret = mshv_init_vmm_caps(dev);
	if (ret)
		goto synic_cleanup;

	ret = mshv_retrieve_scheduler_type(dev);
	if (ret)
		goto synic_cleanup;

	ret = root_scheduler_init(dev);
	if (ret)
		goto synic_cleanup;

	ret = mshv_debugfs_init();
	if (ret)
		goto deinit_root_scheduler;

	ret = mshv_irqfd_wq_init();
	if (ret)
		goto exit_debugfs;

	spin_lock_init(&mshv_root.pt_ht_lock);
	hash_init(mshv_root.pt_htable);

	hv_setup_mshv_handler(mshv_isr);

	return 0;

exit_debugfs:
	mshv_debugfs_exit();
deinit_root_scheduler:
	root_scheduler_deinit();
synic_cleanup:
	mshv_synic_exit();
device_deregister:
	misc_deregister(&mshv_dev);
	return ret;
}

static void __exit mshv_parent_partition_exit(void)
{
	hv_setup_mshv_handler(NULL);
	mshv_port_table_fini();
	mshv_debugfs_exit();
	misc_deregister(&mshv_dev);
	mshv_irqfd_wq_cleanup();
	root_scheduler_deinit();
	mshv_synic_exit();
}

module_init(mshv_parent_partition_init);
module_exit(mshv_parent_partition_exit);