// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024, Microsoft Corporation.
 *
 * The main part of the mshv_root module, providing APIs to create
 * and manage guest partitions.
 *
 * Authors: Microsoft Linux virtualization team
 */

#include <linux/entry-virt.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/cpuhotplug.h>
#include <linux/random.h>
#include <asm/mshyperv.h>
#include <linux/hyperv.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/kexec.h>
#include <linux/page-flags.h>
#include <linux/crash_dump.h>
#include <linux/panic_notifier.h>
#include <linux/vmalloc.h>
#include <linux/rseq.h>

#include "mshv_eventfd.h"
#include "mshv.h"
#include "mshv_root.h"

MODULE_AUTHOR("Microsoft");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");

/* HV_THREAD_COUNTER */
#if defined(CONFIG_X86_64)
#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 202
#elif defined(CONFIG_ARM64)
#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 95
#endif

struct mshv_root mshv_root;

enum hv_scheduler_type hv_scheduler_type;

/* These can go away once the fast extended hypercall ABI is implemented. */
static void * __percpu *root_scheduler_input;
static void * __percpu *root_scheduler_output;

static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_dev_open(struct inode *inode, struct file *filp);
static int mshv_dev_release(struct inode *inode, struct file *filp);
static int mshv_vp_release(struct inode *inode, struct file *filp);
static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_partition_release(struct inode *inode, struct file *filp);
static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma);
static vm_fault_t mshv_vp_fault(struct vm_fault *vmf);
static int mshv_init_async_handler(struct mshv_partition *partition);
static void mshv_async_hvcall_handler(void *data, u64 *status);

static const union hv_input_vtl input_vtl_zero;
static const union hv_input_vtl input_vtl_normal = {
	.target_vtl = HV_NORMAL_VTL,
	.use_target_vtl = 1,
};

static const struct vm_operations_struct mshv_vp_vm_ops = {
	.fault = mshv_vp_fault,
};

static const struct file_operations mshv_vp_fops = {
	.owner = THIS_MODULE,
	.release = mshv_vp_release,
	.unlocked_ioctl = mshv_vp_ioctl,
	.llseek = noop_llseek,
	.mmap = mshv_vp_mmap,
};

static const struct file_operations mshv_partition_fops = {
	.owner = THIS_MODULE,
	.release = mshv_partition_release,
	.unlocked_ioctl = mshv_partition_ioctl,
	.llseek = noop_llseek,
};

static const struct file_operations mshv_dev_fops = {
	.owner = THIS_MODULE,
	.open = mshv_dev_open,
	.release = mshv_dev_release,
	.unlocked_ioctl = mshv_dev_ioctl,
	.llseek = noop_llseek,
};

static struct miscdevice mshv_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "mshv",
	.fops = &mshv_dev_fops,
	.mode = 0600,
};

/*
 * Only allow hypercalls that have a u64 partition id as the first member of
 * the input structure.
 * These are sorted by value.
 */
static u16 mshv_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_GET_PARTITION_PROPERTY_EX,
	HVCALL_SET_PARTITION_PROPERTY,
	HVCALL_INSTALL_INTERCEPT,
	HVCALL_GET_VP_REGISTERS,
	HVCALL_SET_VP_REGISTERS,
	HVCALL_TRANSLATE_VIRTUAL_ADDRESS,
	HVCALL_CLEAR_VIRTUAL_INTERRUPT,
	HVCALL_REGISTER_INTERCEPT_RESULT,
	HVCALL_ASSERT_VIRTUAL_INTERRUPT,
	HVCALL_GET_GPA_PAGES_ACCESS_STATES,
	HVCALL_SIGNAL_EVENT_DIRECT,
	HVCALL_POST_MESSAGE_DIRECT,
	HVCALL_GET_VP_CPUID_VALUES,
};

/*
 * Only allow hypercalls that are safe to be called by the VMM with the host
 * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a
 * hypercall cannot be misused by the VMM before adding it to this list.
 */
static u16 mshv_self_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_GET_PARTITION_PROPERTY_EX,
};

static bool mshv_hvcall_is_async(u16 code)
{
	switch (code) {
	case HVCALL_SET_PARTITION_PROPERTY:
		return true;
	default:
		break;
	}
	return false;
}

static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id)
{
	int i;
	int n = ARRAY_SIZE(mshv_passthru_hvcalls);
	u16 *allowed_hvcalls = mshv_passthru_hvcalls;

	if (pt_id == HV_PARTITION_ID_SELF) {
		n = ARRAY_SIZE(mshv_self_passthru_hvcalls);
		allowed_hvcalls = mshv_self_passthru_hvcalls;
	}

	for (i = 0; i < n; ++i)
		if (allowed_hvcalls[i] == code)
			return true;

	return false;
}

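/*
 * Forward a whitelisted hypercall from the VMM to the hypervisor on behalf
 * of @partition (or the host partition when invoked without one). The
 * partition id is stamped over the first u64 of the input page, so the VMM
 * cannot target an arbitrary partition.
 *
 * Illustrative sketch (not part of this file) of how a VMM might drive the
 * MSHV_ROOT_HVCALL ioctl from user space; field names follow the
 * struct mshv_root_hvcall argument handled below, and partition_fd is
 * assumed to be an open partition fd:
 *
 *	struct mshv_root_hvcall args = {
 *		.code = HVCALL_GET_VP_REGISTERS,
 *		.in_sz = sizeof(input), .in_ptr = (__u64)&input,
 *		.out_sz = sizeof(output), .out_ptr = (__u64)&output,
 *	};
 *	ret = ioctl(partition_fd, MSHV_ROOT_HVCALL, &args);
 */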
static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
				      bool partition_locked,
				      void __user *user_args)
{
	u64 status;
	int ret = 0;
	bool is_async;
	struct mshv_root_hvcall args;
	struct page *page;
	unsigned int pages_order;
	void *input_pg = NULL;
	void *output_pg = NULL;
	u16 reps_completed;
	u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) ||
	    mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE)
		return -EINVAL;

	if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
		return -EINVAL;

	if (!mshv_passthru_hvcall_allowed(args.code, pt_id))
		return -EINVAL;

	is_async = mshv_hvcall_is_async(args.code);
	if (is_async) {
		/* async hypercalls can only be called from partition fd */
		if (!partition || !partition_locked)
			return -EINVAL;
		ret = mshv_init_async_handler(partition);
		if (ret)
			return ret;
	}

	pages_order = args.out_ptr ? 1 : 0;
	page = alloc_pages(GFP_KERNEL, pages_order);
	if (!page)
		return -ENOMEM;
	input_pg = page_address(page);

	if (args.out_ptr)
		output_pg = (char *)input_pg + PAGE_SIZE;
	else
		output_pg = NULL;

	if (copy_from_user(input_pg, (void __user *)args.in_ptr,
			   args.in_sz)) {
		ret = -EFAULT;
		goto free_pages_out;
	}

	/*
	 * NOTE: This only works because all the allowed hypercalls' input
	 * structs begin with a u64 partition_id field.
	 */
	*(u64 *)input_pg = pt_id;

	reps_completed = 0;
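	/*
	 * Issue the hypercall, retrying as needed: rep hypercalls resume at
	 * reps_completed, and if the hypervisor reports that it is out of
	 * memory, deposit pages into the target partition and try again.
	 */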
	do {
		if (args.reps) {
			status = hv_do_rep_hypercall_ex(args.code, args.reps,
							0, reps_completed,
							input_pg, output_pg);
			reps_completed = hv_repcomp(status);
		} else {
			status = hv_do_hypercall(args.code, input_pg, output_pg);
		}

		if (hv_result(status) == HV_STATUS_CALL_PENDING) {
			if (is_async) {
				mshv_async_hvcall_handler(partition, &status);
			} else { /* Paranoia check. This shouldn't happen! */
				ret = -EBADFD;
				goto free_pages_out;
			}
		}

		if (hv_result_success(status))
			break;

		if (!hv_result_needs_memory(status))
			ret = hv_result_to_errno(status);
		else
			ret = hv_deposit_memory(pt_id, status);
	} while (!ret);

	args.status = hv_result(status);
	args.reps = reps_completed;
	if (copy_to_user(user_args, &args, sizeof(args)))
		ret = -EFAULT;

	if (!ret && output_pg &&
	    copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
		ret = -EFAULT;

free_pages_out:
	free_pages((unsigned long)input_pg, pages_order);

	return ret;
}

static inline bool is_ghcb_mapping_available(void)
{
#if IS_ENABLED(CONFIG_X86_64)
	return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE;
#else
	return false;
#endif
}

static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_get_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}

static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_set_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}

/*
 * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by
 * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend,
 * done by the hypervisor.
 * "Intercept" suspend leads to asynchronous message delivery to dom0 which
 * should be awaited to keep the VP loop consistent (i.e. no message pending
 * upon VP resume).
 * VP intercept suspend can't be done when the VP is explicitly suspended
 * already, and thus there can be only two possible race scenarios:
 *   1. implicit suspend bit set -> explicit suspend bit set -> message sent
 *   2. implicit suspend bit set -> message sent -> explicit suspend bit set
 * Checking for the implicit suspend bit after the explicit suspend request
 * has succeeded in either case allows us to reliably identify whether there
 * is a message to receive and deliver to the VMM.
 */
static int
mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
{
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND
	};
	struct hv_register_assoc intercept_suspend = {
		.name = HV_REGISTER_INTERCEPT_SUSPEND
	};
	union hv_explicit_suspend_register *es =
		&explicit_suspend.value.explicit_suspend;
	union hv_intercept_suspend_register *is =
		&intercept_suspend.value.intercept_suspend;
	int ret;

	es->suspended = 1;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &explicit_suspend);
	if (ret) {
		vp_err(vp, "Failed to explicitly suspend vCPU\n");
		return ret;
	}

	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &intercept_suspend);
	if (ret) {
		vp_err(vp, "Failed to get intercept suspend state\n");
		return ret;
	}

	*message_in_flight = is->suspended;

	return 0;
}

/*
 * This function is used when VPs are scheduled by the hypervisor's
 * scheduler.
 *
 * Caller has to make sure the registers contain cleared
 * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers
 * exactly in this order (the hypervisor clears them sequentially) to avoid
 * potentially clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND after
 * the VP is released from HV_REGISTER_EXPLICIT_SUSPEND, which could happen
 * if the order were reversed.
 */
static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
{
	long ret;
	struct hv_register_assoc suspend_regs[2] = {
		{ .name = HV_REGISTER_INTERCEPT_SUSPEND },
		{ .name = HV_REGISTER_EXPLICIT_SUSPEND }
	};
	size_t count = ARRAY_SIZE(suspend_regs);

	/* Resume VP execution */
	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    count, suspend_regs);
	if (ret) {
		vp_err(vp, "Failed to resume vp execution. %lx\n", ret);
		return ret;
	}

	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       vp->run.kicked_by_hv == 1);
	if (ret) {
		bool message_in_flight;

		/*
		 * The wait was interrupted by a signal: suspend the vCPU
		 * explicitly and copy the message in flight (if any).
		 */
		ret = mshv_suspend_vp(vp, &message_in_flight);
		if (ret)
			return ret;

		/* Return if no message in flight */
		if (!message_in_flight)
			return -EINTR;

		/* Wait for the message in flight. */
		wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1);
	}

	/*
	 * Reset the flag to make the wait_event call above work
	 * next time.
	 */
	vp->run.kicked_by_hv = 0;

	return 0;
}

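/*
 * Dispatch (i.e. run) @vp once via the HVCALL_DISPATCH_VP hypercall, using
 * the per-cpu root scheduler input/output pages (hence preemption is
 * disabled around the call). Returns when the VP blocks or is intercepted;
 * the hypervisor's view of why is returned in @res.
 */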
static int
mshv_vp_dispatch(struct mshv_vp *vp, u32 flags,
		 struct hv_output_dispatch_vp *res)
{
	struct hv_input_dispatch_vp *input;
	struct hv_output_dispatch_vp *output;
	u64 status;

	preempt_disable();
	input = *this_cpu_ptr(root_scheduler_input);
	output = *this_cpu_ptr(root_scheduler_output);

	memset(input, 0, sizeof(*input));
	memset(output, 0, sizeof(*output));

	input->partition_id = vp->vp_partition->pt_id;
	input->vp_index = vp->vp_index;
	input->time_slice = 0; /* Run forever until something happens */
	input->spec_ctrl = 0; /* TODO: set sensible flags */
	input->flags = flags;

	vp->run.flags.root_sched_dispatched = 1;
	status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output);
	vp->run.flags.root_sched_dispatched = 0;

	*res = *output;
	preempt_enable();

	if (!hv_result_success(status))
		vp_err(vp, "%s: status %s\n", __func__,
		       hv_result_to_string(status));

	return hv_result_to_errno(status);
}

static int
mshv_vp_clear_explicit_suspend(struct mshv_vp *vp)
{
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND,
		.value.explicit_suspend.suspended = 0,
	};
	int ret;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &explicit_suspend);

	if (ret)
		vp_err(vp, "Failed to unsuspend\n");

	return ret;
}

#if IS_ENABLED(CONFIG_X86_64)
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	if (!vp->vp_register_page)
		return 0;
	return vp->vp_register_page->interrupt_vectors.as_uint64;
}
#else
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	return 0;
}
#endif

static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp)
{
	struct hv_stats_page **stats = vp->vp_stats_pages;
	u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->data;
	u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->data;

	return parent_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED] ||
	       self_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED];
}

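/*
 * Sleep until the hypervisor kicks this VP (and its dispatch thread is no
 * longer reported as blocked in the stats pages) or an interrupt is pending
 * for it. Returns -EINTR if the sleep was interrupted by a signal.
 */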
static int
mshv_vp_wait_for_hv_kick(struct mshv_vp *vp)
{
	int ret;

	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       (vp->run.kicked_by_hv == 1 &&
					!mshv_vp_dispatch_thread_blocked(vp)) ||
				       mshv_vp_interrupt_pending(vp));
	if (ret)
		return -EINTR;

	vp->run.flags.root_sched_blocked = 0;
	vp->run.kicked_by_hv = 0;

	return 0;
}

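/*
 * Run @vp when the root scheduler is in use: dispatch the VP in a loop,
 * handling any pending xfer-to-guest-mode work, until it is suspended by an
 * intercept that must be delivered to user space. A VP that ends up blocked
 * or explicitly suspended is parked in mshv_vp_wait_for_hv_kick() until the
 * hypervisor signals it again.
 */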
/* Must be called with interrupts enabled */
static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
{
	long ret;

	if (vp->run.flags.root_sched_blocked) {
		/*
		 * Dispatch state of this VP is blocked. Need to wait
		 * for the hypervisor to clear the blocked state before
		 * dispatching it.
		 */
		ret = mshv_vp_wait_for_hv_kick(vp);
		if (ret)
			return ret;
	}

	do {
		u32 flags = 0;
		struct hv_output_dispatch_vp output;

		if (__xfer_to_guest_mode_work_pending()) {
			ret = xfer_to_guest_mode_handle_work();
			if (ret)
				break;
		}

		if (vp->run.flags.intercept_suspend)
			flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND;

		if (mshv_vp_interrupt_pending(vp))
			flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION;

		ret = mshv_vp_dispatch(vp, flags, &output);
		if (ret)
			break;

		vp->run.flags.intercept_suspend = 0;

		if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) {
			if (output.dispatch_event ==
			    HV_VP_DISPATCH_EVENT_SUSPEND) {
				/*
				 * TODO: remove the warning once VP canceling
				 * is supported
				 */
				WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count),
					  "%s: vp#%d: unexpected explicit suspend\n",
					  __func__, vp->vp_index);
				/*
				 * Need to clear explicit suspend before
				 * dispatching.
				 * Explicit suspend is either:
				 * - set right after the first VP dispatch or
				 * - set explicitly via hypercall
				 * Since the latter case is not yet supported,
				 * simply clear it here.
				 */
				ret = mshv_vp_clear_explicit_suspend(vp);
				if (ret)
					break;

				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			} else {
				vp->run.flags.root_sched_blocked = 1;
				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			}
		} else {
			/* HV_VP_DISPATCH_STATE_READY */
			if (output.dispatch_event ==
			    HV_VP_DISPATCH_EVENT_INTERCEPT)
				vp->run.flags.intercept_suspend = 1;
		}
	} while (!vp->run.flags.intercept_suspend);

	rseq_virt_userspace_exit();

	return ret;
}

static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
	      "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");

static struct mshv_mem_region *
mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
{
	struct mshv_mem_region *region;

	hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
		if (gfn >= region->start_gfn &&
		    gfn < region->start_gfn + region->nr_pages)
			return region;
	}

	return NULL;
}

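/*
 * Like mshv_partition_region_by_gfn(), but takes a reference on the region
 * under pt_mem_regions_lock. The caller must drop it with mshv_region_put()
 * when done.
 */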
static struct mshv_mem_region *
mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
{
	struct mshv_mem_region *region;

	spin_lock(&p->pt_mem_regions_lock);
	region = mshv_partition_region_by_gfn(p, gfn);
	if (!region || !mshv_region_get(region)) {
		spin_unlock(&p->pt_mem_regions_lock);
		return NULL;
	}
	spin_unlock(&p->pt_mem_regions_lock);

	return region;
}

/**
 * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts.
 * @vp: Pointer to the virtual processor structure.
 *
 * This function processes GPA intercepts by identifying the memory region
 * corresponding to the intercepted GPA, aligning the page offset, and
 * mapping the required pages. It ensures that the region is valid and
 * handles faults efficiently by mapping multiple pages at once.
 *
 * Return: true if the intercept was handled successfully, false otherwise.
 */
static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
{
	struct mshv_partition *p = vp->vp_partition;
	struct mshv_mem_region *region;
	bool ret = false;
	u64 gfn;
#if defined(CONFIG_X86_64)
	struct hv_x64_memory_intercept_message *msg =
		(struct hv_x64_memory_intercept_message *)
		vp->vp_intercept_msg_page->u.payload;
#elif defined(CONFIG_ARM64)
	struct hv_arm64_memory_intercept_message *msg =
		(struct hv_arm64_memory_intercept_message *)
		vp->vp_intercept_msg_page->u.payload;
#endif
	enum hv_intercept_access_type access_type =
		msg->header.intercept_access_type;

	gfn = HVPFN_DOWN(msg->guest_physical_address);

	region = mshv_partition_region_by_gfn_get(p, gfn);
	if (!region)
		return false;

	if (access_type == HV_INTERCEPT_ACCESS_WRITE &&
	    !(region->hv_map_flags & HV_MAP_GPA_WRITABLE))
		goto put_region;

	if (access_type == HV_INTERCEPT_ACCESS_EXECUTE &&
	    !(region->hv_map_flags & HV_MAP_GPA_EXECUTABLE))
		goto put_region;

	/* Only movable memory ranges are supported for GPA intercepts */
	if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE)
		ret = mshv_region_handle_gfn_fault(region, gfn);

put_region:
	mshv_region_put(region);

	return ret;
}

static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
{
	switch (vp->vp_intercept_msg_page->header.message_type) {
	case HVMSG_GPA_INTERCEPT:
		return mshv_handle_gpa_intercept(vp);
	}
	return false;
}

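/*
 * Run the VP until an intercept must be handled by the VMM, then copy the
 * intercept message to @ret_msg. Intercepts the kernel can handle on its
 * own (currently GPA faults on movable regions) are consumed here and the
 * VP is re-run without returning to user space.
 */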
static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
{
	long rc;

	do {
		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
			rc = mshv_run_vp_with_root_scheduler(vp);
		else
			rc = mshv_run_vp_with_hyp_scheduler(vp);
	} while (rc == 0 && mshv_vp_handle_intercept(vp));

	if (rc)
		return rc;

	if (copy_to_user(ret_msg, vp->vp_intercept_msg_page,
			 sizeof(struct hv_message)))
		rc = -EFAULT;

	return rc;
}

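/*
 * Transfer VP state for state types that the hypervisor exchanges via guest
 * page frames rather than via the hypercall output buffer: the user buffer
 * is pinned and its pages handed to the hypervisor directly.
 */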
static int
mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp,
				struct hv_vp_state_data state_data,
				unsigned long user_pfn, size_t page_count,
				bool is_set)
{
	int completed, ret = 0;
	unsigned long check;
	struct page **pages;

	if (page_count > INT_MAX)
		return -EINVAL;
	/*
	 * Check the arithmetic for wraparound/overflow.
	 * The last page address in the buffer is:
	 * (user_pfn + (page_count - 1)) * PAGE_SIZE
	 */
	if (check_add_overflow(user_pfn, (page_count - 1), &check))
		return -EOVERFLOW;
	if (check_mul_overflow(check, PAGE_SIZE, &check))
		return -EOVERFLOW;

	/* Pin user pages so hypervisor can copy directly to them */
	pages = kzalloc_objs(struct page *, page_count);
	if (!pages)
		return -ENOMEM;

	for (completed = 0; completed < page_count; completed += ret) {
		unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE;
		int remaining = page_count - completed;

		ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE,
					  &pages[completed]);
		if (ret < 0) {
			vp_err(vp, "%s: Failed to pin user pages error %i\n",
			       __func__, ret);
			goto unpin_pages;
		}
	}

	if (is_set)
		ret = hv_call_set_vp_state(vp->vp_index,
					   vp->vp_partition->pt_id,
					   state_data, page_count, pages,
					   0, NULL);
	else
		ret = hv_call_get_vp_state(vp->vp_index,
					   vp->vp_partition->pt_id,
					   state_data, page_count, pages,
					   NULL);

unpin_pages:
	unpin_user_pages(pages, completed);
	kfree(pages);
	return ret;
}

static long
mshv_vp_ioctl_get_set_state(struct mshv_vp *vp,
			    struct mshv_get_set_vp_state __user *user_args,
			    bool is_set)
{
	struct mshv_get_set_vp_state args;
	long ret = 0;
	union hv_output_get_vp_state vp_state;
	u32 data_sz;
	struct hv_vp_state_data state_data = {};

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) ||
	    !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) ||
	    !PAGE_ALIGNED(args.buf_ptr))
		return -EINVAL;

	if (!access_ok((void __user *)args.buf_ptr, args.buf_sz))
		return -EFAULT;

	switch (args.type) {
	case MSHV_VP_STATE_LAPIC:
		state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_XSAVE:
	{
		u64 data_sz_64;

		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_XSAVE_STATES,
						     &state_data.xsave.states.as_uint64);
		if (ret)
			return ret;

		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE,
						     &data_sz_64);
		if (ret)
			return ret;

		data_sz = (u32)data_sz_64;
		state_data.xsave.flags = 0;
		/* Always request legacy states */
		state_data.xsave.states.legacy_x87 = 1;
		state_data.xsave.states.legacy_sse = 1;
		state_data.type = HV_GET_SET_VP_STATE_XSAVE;
		break;
	}
	case MSHV_VP_STATE_SIMP:
		state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SIEFP:
		state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SYNTHETIC_TIMERS:
		state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS;
		data_sz = sizeof(vp_state.synthetic_timers_state);
		break;
	default:
		return -EINVAL;
	}

	if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz)))
		return -EFAULT;

	if (data_sz > args.buf_sz)
		return -EINVAL;

	/* If the data is transmitted via pfns, delegate to helper */
	if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) {
		unsigned long user_pfn = PFN_DOWN(args.buf_ptr);
		size_t page_count = PFN_DOWN(args.buf_sz);

		return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn,
						       page_count, is_set);
	}

	/* Paranoia check - this shouldn't happen! */
	if (data_sz > sizeof(vp_state)) {
		vp_err(vp, "Invalid vp state data size!\n");
		return -EINVAL;
	}

	if (is_set) {
		if (copy_from_user(&vp_state, (void __user *)args.buf_ptr, data_sz))
			return -EFAULT;

		return hv_call_set_vp_state(vp->vp_index,
					    vp->vp_partition->pt_id,
					    state_data, 0, NULL,
					    sizeof(vp_state), (u8 *)&vp_state);
	}

	ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id,
				   state_data, 0, NULL, &vp_state);
	if (ret)
		return ret;

	if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz))
		return -EFAULT;

	return 0;
}

static long
mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct mshv_vp *vp = filp->private_data;
	long r = -ENOTTY;

	if (mutex_lock_killable(&vp->vp_mutex))
		return -EINTR;

	switch (ioctl) {
	case MSHV_RUN_VP:
		r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg);
		break;
	case MSHV_GET_VP_STATE:
		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false);
		break;
	case MSHV_SET_VP_STATE:
		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true);
		break;
	case MSHV_ROOT_HVCALL:
		r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false,
					       (void __user *)arg);
		break;
	default:
		vp_warn(vp, "Invalid ioctl: %#x\n", ioctl);
		break;
	}
	mutex_unlock(&vp->vp_mutex);

	return r;
}

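/*
 * Shared VP pages (registers, intercept message, GHCB) are exposed to user
 * space by mmap()ing the VP fd at fixed page offsets. Illustrative sketch
 * (not part of this file) of how a VMM might map the intercept message
 * page, assuming an open vp_fd and the system page size in page_size:
 *
 *	msg = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		   vp_fd, MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE * page_size);
 */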
static vm_fault_t mshv_vp_fault(struct vm_fault *vmf)
{
	struct mshv_vp *vp = vmf->vma->vm_file->private_data;

	switch (vmf->vma->vm_pgoff) {
	case MSHV_VP_MMAP_OFFSET_REGISTERS:
		vmf->page = virt_to_page(vp->vp_register_page);
		break;
	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
		vmf->page = virt_to_page(vp->vp_intercept_msg_page);
		break;
	case MSHV_VP_MMAP_OFFSET_GHCB:
		vmf->page = virt_to_page(vp->vp_ghcb_page);
		break;
	default:
		return VM_FAULT_SIGBUS;
	}

	get_page(vmf->page);

	return 0;
}

static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct mshv_vp *vp = file->private_data;

	switch (vma->vm_pgoff) {
	case MSHV_VP_MMAP_OFFSET_REGISTERS:
		if (!vp->vp_register_page)
			return -ENODEV;
		break;
	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
		if (!vp->vp_intercept_msg_page)
			return -ENODEV;
		break;
	case MSHV_VP_MMAP_OFFSET_GHCB:
		if (!vp->vp_ghcb_page)
			return -ENODEV;
		break;
	default:
		return -EINVAL;
	}

	vma->vm_ops = &mshv_vp_vm_ops;
	return 0;
}

static int
mshv_vp_release(struct inode *inode, struct file *filp)
{
	struct mshv_vp *vp = filp->private_data;

	/* Rest of VP cleanup happens in destroy_partition() */
	mshv_partition_put(vp->vp_partition);
	return 0;
}

void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
			 struct hv_stats_page *stats_pages[])
{
	union hv_stats_object_identity identity = {
		.vp.partition_id = partition_id,
		.vp.vp_index = vp_index,
	};
	int err;

	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
				  stats_pages[HV_STATS_AREA_SELF],
				  &identity);
	if (err)
		pr_err("%s: failed to unmap partition %llu vp %u self stats, err: %d\n",
		       __func__, partition_id, vp_index, err);

	if (stats_pages[HV_STATS_AREA_PARENT] != stats_pages[HV_STATS_AREA_SELF]) {
		identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
		err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
					  stats_pages[HV_STATS_AREA_PARENT],
					  &identity);
		if (err)
			pr_err("%s: failed to unmap partition %llu vp %u parent stats, err: %d\n",
			       __func__, partition_id, vp_index, err);
	}
}

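/*
 * Map the hypervisor statistics pages for a VP. Both the SELF and PARENT
 * stats areas are mapped; when the parent area is unavailable (e.g. for an
 * L1VH partition) the SELF page is used for both slots so callers can index
 * the array unconditionally.
 */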
int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
		      struct hv_stats_page *stats_pages[])
{
	union hv_stats_object_identity identity = {
		.vp.partition_id = partition_id,
		.vp.vp_index = vp_index,
	};
	int err;

	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
				&stats_pages[HV_STATS_AREA_SELF]);
	if (err) {
		pr_err("%s: failed to map partition %llu vp %u self stats, err: %d\n",
		       __func__, partition_id, vp_index, err);
		return err;
	}

	/*
	 * An L1VH partition cannot access its VP stats in the parent area.
	 */
	if (is_l1vh_parent(partition_id)) {
		stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
	} else {
		identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
		err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
					&stats_pages[HV_STATS_AREA_PARENT]);
		if (err) {
			pr_err("%s: failed to map partition %llu vp %u parent stats, err: %d\n",
			       __func__, partition_id, vp_index, err);
			goto unmap_self;
		}
		if (!stats_pages[HV_STATS_AREA_PARENT])
			stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
	}

	return 0;

unmap_self:
	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	hv_unmap_stats_page(HV_STATS_OBJECT_VP,
			    stats_pages[HV_STATS_AREA_SELF],
			    &identity);
	return err;
}

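/*
 * Create a VP in the hypervisor and map its shared state pages (intercept
 * message page always; register page for unencrypted partitions; GHCB page
 * for encrypted partitions when available), then expose it to user space as
 * an anonymous "mshv_vp" fd. The error paths unwind in the exact reverse
 * order of setup.
 */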
static long
mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
			       void __user *arg)
{
	struct mshv_create_vp args;
	struct mshv_vp *vp;
	struct page *intercept_msg_page, *register_page, *ghcb_page;
	struct hv_stats_page *stats_pages[2];
	long ret;

	if (copy_from_user(&args, arg, sizeof(args)))
		return -EFAULT;

	if (args.vp_index >= MSHV_MAX_VPS)
		return -EINVAL;

	if (partition->pt_vp_array[args.vp_index])
		return -EEXIST;

	ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index,
				0 /* Only valid for root partition VPs */);
	if (ret)
		return ret;

	ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
				   HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
				   input_vtl_zero, &intercept_msg_page);
	if (ret)
		goto destroy_vp;

	if (!mshv_partition_encrypted(partition)) {
		ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
					   HV_VP_STATE_PAGE_REGISTERS,
					   input_vtl_zero, &register_page);
		if (ret)
			goto unmap_intercept_message_page;
	}

	if (mshv_partition_encrypted(partition) &&
	    is_ghcb_mapping_available()) {
		ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
					   HV_VP_STATE_PAGE_GHCB,
					   input_vtl_normal, &ghcb_page);
		if (ret)
			goto unmap_register_page;
	}

	ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
				stats_pages);
	if (ret)
		goto unmap_ghcb_page;

	vp = kzalloc_obj(*vp);
	if (!vp) {
		ret = -ENOMEM;
		goto unmap_stats_pages;
	}

	vp->vp_partition = mshv_partition_get(partition);
	if (!vp->vp_partition) {
		ret = -EBADF;
		goto free_vp;
	}

	mutex_init(&vp->vp_mutex);
	init_waitqueue_head(&vp->run.vp_suspend_queue);
	atomic64_set(&vp->run.vp_signaled_count, 0);

	vp->vp_index = args.vp_index;
	vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page);
	if (!mshv_partition_encrypted(partition))
		vp->vp_register_page = page_to_virt(register_page);

	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
		vp->vp_ghcb_page = page_to_virt(ghcb_page);

	memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));

	ret = mshv_debugfs_vp_create(vp);
	if (ret)
		goto put_partition;

	/*
	 * Keep anon_inode_getfd last: it installs fd in the file struct and
	 * thus makes the state accessible in user space.
	 */
	ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp,
			       O_RDWR | O_CLOEXEC);
	if (ret < 0)
		goto remove_debugfs_vp;

	/* already exclusive with the partition mutex for all ioctls */
	partition->pt_vp_count++;
	partition->pt_vp_array[args.vp_index] = vp;

	return ret;

remove_debugfs_vp:
	mshv_debugfs_vp_remove(vp);
put_partition:
	mshv_partition_put(partition);
free_vp:
	kfree(vp);
unmap_stats_pages:
	mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages);
unmap_ghcb_page:
	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
		hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
				       HV_VP_STATE_PAGE_GHCB, ghcb_page,
				       input_vtl_normal);
unmap_register_page:
	if (!mshv_partition_encrypted(partition))
		hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
				       HV_VP_STATE_PAGE_REGISTERS,
				       register_page, input_vtl_zero);
unmap_intercept_message_page:
	hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
			       HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
			       intercept_msg_page, input_vtl_zero);
destroy_vp:
	hv_call_delete_vp(partition->pt_id, args.vp_index);
	return ret;
}

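/*
 * Only one async hypercall may be outstanding per partition at a time. The
 * async_hypercall completion is signaled elsewhere in the driver once the
 * hypervisor delivers the corresponding completion message;
 * mshv_async_hvcall_handler() blocks on it and returns the saved status.
 */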
static int mshv_init_async_handler(struct mshv_partition *partition)
{
	if (completion_done(&partition->async_hypercall)) {
		pt_err(partition,
		       "Cannot issue async hypercall while another one in progress!\n");
		return -EPERM;
	}

	reinit_completion(&partition->async_hypercall);
	return 0;
}

static void mshv_async_hvcall_handler(void *data, u64 *status)
{
	struct mshv_partition *partition = data;

	wait_for_completion(&partition->async_hypercall);
	pt_dbg(partition, "Async hypercall completed!\n");

	*status = partition->async_hypercall_status;
}

/*
 * NB: caller checks and makes sure mem->size is page aligned
 * Returns: 0 with regionpp updated on success, or -errno
 */
static int mshv_partition_create_region(struct mshv_partition *partition,
					struct mshv_user_mem_region *mem,
					struct mshv_mem_region **regionpp,
					bool is_mmio)
{
	struct mshv_mem_region *rg;
	u64 nr_pages = HVPFN_DOWN(mem->size);

	/* Reject overlapping regions */
	spin_lock(&partition->pt_mem_regions_lock);
	hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) {
		if (mem->guest_pfn + nr_pages <= rg->start_gfn ||
		    rg->start_gfn + rg->nr_pages <= mem->guest_pfn)
			continue;
		spin_unlock(&partition->pt_mem_regions_lock);
		return -EEXIST;
	}
	spin_unlock(&partition->pt_mem_regions_lock);

	rg = mshv_region_create(mem->guest_pfn, nr_pages,
				mem->userspace_addr, mem->flags);
	if (IS_ERR(rg))
		return PTR_ERR(rg);

	if (is_mmio)
		rg->mreg_type = MSHV_REGION_TYPE_MMIO;
	else if (mshv_partition_encrypted(partition) ||
		 !mshv_region_movable_init(rg))
		rg->mreg_type = MSHV_REGION_TYPE_MEM_PINNED;
	else
		rg->mreg_type = MSHV_REGION_TYPE_MEM_MOVABLE;

	rg->partition = partition;

	*regionpp = rg;

	return 0;
}

/**
 * mshv_prepare_pinned_region - Pin and map memory regions
 * @region: Pointer to the memory region structure
 *
 * This function processes memory regions that are explicitly marked as pinned.
 * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based
 * population. The function ensures the region is properly populated, handles
 * encryption requirements for SNP partitions if applicable, maps the region,
 * and performs necessary sharing or eviction operations based on the mapping
 * result.
 *
 * Return: 0 on success, negative error code on failure.
 */
static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
{
	struct mshv_partition *partition = region->partition;
	int ret;

	ret = mshv_region_pin(region);
	if (ret) {
		pt_err(partition, "Failed to pin memory region: %d\n",
		       ret);
		goto err_out;
	}

	/*
	 * For an SNP partition it is a requirement that for every memory region
	 * that we are going to map for this partition we should make sure that
	 * host access to that region is released. This is ensured by doing an
	 * additional hypercall which will update the SLAT to release host
	 * access to guest memory regions.
	 */
	if (mshv_partition_encrypted(partition)) {
		ret = mshv_region_unshare(region);
		if (ret) {
			pt_err(partition,
			       "Failed to unshare memory region (guest_pfn: %llu): %d\n",
			       region->start_gfn, ret);
			goto invalidate_region;
		}
	}

	ret = mshv_region_map(region);
	if (ret && mshv_partition_encrypted(partition)) {
		int shrc;

		shrc = mshv_region_share(region);
		if (!shrc)
			goto invalidate_region;

		pt_err(partition,
		       "Failed to share memory region (guest_pfn: %llu): %d\n",
		       region->start_gfn, shrc);
		/*
		 * Don't unpin if marking shared failed, because the pages are
		 * no longer mapped in the host (i.e. root).
		 */
		goto err_out;
	}

	return 0;

invalidate_region:
	mshv_region_invalidate(region);
err_out:
	return ret;
}

/*
 * This maps two things: guest RAM and, for PCI passthru, MMIO space.
 *
 * mmio:
 * - vfio overloads vm_pgoff to store the mmio start pfn/spa.
 * - Two things need to happen for mapping mmio range:
 *   1. mapped in the uaddr so VMM can access it.
 *   2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it.
 *
 * This function takes care of the second. The first one is managed by vfio,
 * and hence is taken care of via vfio_pci_mmap_fault().
 */
static long
mshv_map_user_memory(struct mshv_partition *partition,
		     struct mshv_user_mem_region *mem)
{
	struct mshv_mem_region *region;
	struct vm_area_struct *vma;
	bool is_mmio;
	ulong mmio_pfn;
	long ret;

	if (mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
	    !access_ok((const void __user *)mem->userspace_addr, mem->size))
		return -EINVAL;

	mmap_read_lock(current->mm);
	vma = vma_lookup(current->mm, mem->userspace_addr);
	is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
	mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
	mmap_read_unlock(current->mm);

	if (!vma)
		return -EINVAL;

	ret = mshv_partition_create_region(partition, mem, &region,
					   is_mmio);
	if (ret)
		return ret;

	switch (region->mreg_type) {
	case MSHV_REGION_TYPE_MEM_PINNED:
		ret = mshv_prepare_pinned_region(region);
		break;
	case MSHV_REGION_TYPE_MEM_MOVABLE:
		/*
		 * For movable memory regions, remap with no access to let
		 * the hypervisor track dirty pages, enabling pre-copy live
		 * migration.
		 */
		ret = hv_call_map_gpa_pages(partition->pt_id,
					    region->start_gfn,
					    region->nr_pages,
					    HV_MAP_GPA_NO_ACCESS, NULL);
		break;
	case MSHV_REGION_TYPE_MMIO:
		ret = hv_call_map_mmio_pages(partition->pt_id,
					     region->start_gfn,
					     mmio_pfn,
					     region->nr_pages);
		break;
	}

	if (ret)
		goto errout;

	spin_lock(&partition->pt_mem_regions_lock);
	hlist_add_head(&region->hnode, &partition->pt_mem_regions);
	spin_unlock(&partition->pt_mem_regions_lock);

	return 0;

errout:
	mshv_region_put(region);
	return ret;
}

/* Called for unmapping both the guest ram and the mmio space */
static long
mshv_unmap_user_memory(struct mshv_partition *partition,
		       struct mshv_user_mem_region *mem)
{
	struct mshv_mem_region *region;

	if (!(mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
		return -EINVAL;

	spin_lock(&partition->pt_mem_regions_lock);

	region = mshv_partition_region_by_gfn(partition, mem->guest_pfn);
	if (!region) {
		spin_unlock(&partition->pt_mem_regions_lock);
		return -ENOENT;
	}

	/* Paranoia check */
	if (region->start_uaddr != mem->userspace_addr ||
	    region->start_gfn != mem->guest_pfn ||
	    region->nr_pages != HVPFN_DOWN(mem->size)) {
		spin_unlock(&partition->pt_mem_regions_lock);
		return -EINVAL;
	}

	hlist_del(&region->hnode);

	spin_unlock(&partition->pt_mem_regions_lock);

	mshv_region_put(region);

	return 0;
}

static long
mshv_partition_ioctl_set_memory(struct mshv_partition *partition,
				struct mshv_user_mem_region __user *user_mem)
{
	struct mshv_user_mem_region mem;

	if (copy_from_user(&mem, user_mem, sizeof(mem)))
		return -EFAULT;

	if (!mem.size ||
	    !PAGE_ALIGNED(mem.size) ||
	    !PAGE_ALIGNED(mem.userspace_addr) ||
	    (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) ||
	    mshv_field_nonzero(mem, rsvd))
		return -EINVAL;

	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))
		return mshv_unmap_user_memory(partition, &mem);

	return mshv_map_user_memory(partition, &mem);
}

static long
mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition,
			       void __user *user_args)
{
	struct mshv_user_ioeventfd args;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	return mshv_set_unset_ioeventfd(partition, &args);
}

static long
mshv_partition_ioctl_irqfd(struct mshv_partition *partition,
			   void __user *user_args)
{
	struct mshv_user_irqfd args;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	return mshv_set_unset_irqfd(partition, &args);
}

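/*
 * Query the accessed/dirty state of a range of guest physical pages and
 * return it to user space as a bitmap, one bit per page. The access op
 * selects whether the hypervisor should also clear or set the corresponding
 * bits as part of the query (clearing "accessed" also clears "dirty", and
 * setting "dirty" also sets "accessed").
 */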
1444 static long
mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition * partition,void __user * user_args)1445 mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition,
1446 void __user *user_args)
1447 {
1448 struct mshv_gpap_access_bitmap args;
1449 union hv_gpa_page_access_state *states;
1450 long ret, i;
1451 union hv_gpa_page_access_state_flags hv_flags = {};
1452 u8 hv_type_mask;
1453 ulong bitmap_buf_sz, states_buf_sz;
1454 int written = 0;
1455
1456 if (copy_from_user(&args, user_args, sizeof(args)))
1457 return -EFAULT;
1458
1459 if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT ||
1460 args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT ||
1461 mshv_field_nonzero(args, rsvd) || !args.page_count ||
1462 !args.bitmap_ptr)
1463 return -EINVAL;
1464
1465 if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz))
1466 return -E2BIG;
1467
1468 /* Num bytes needed to store bitmap; one bit per page rounded up */
1469 bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8);
1470
1471 /* Sanity check */
1472 if (bitmap_buf_sz > states_buf_sz)
1473 return -EBADFD;
1474
1475 switch (args.access_type) {
1476 case MSHV_GPAP_ACCESS_TYPE_ACCESSED:
1477 hv_type_mask = 1;
1478 if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
1479 hv_flags.clear_accessed = 1;
1480 /* not accessed implies not dirty */
1481 hv_flags.clear_dirty = 1;
1482 } else { /* MSHV_GPAP_ACCESS_OP_SET */
1483 hv_flags.set_accessed = 1;
1484 }
1485 break;
1486 case MSHV_GPAP_ACCESS_TYPE_DIRTY:
1487 hv_type_mask = 2;
1488 if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
1489 hv_flags.clear_dirty = 1;
1490 } else { /* MSHV_GPAP_ACCESS_OP_SET */
1491 hv_flags.set_dirty = 1;
1492 /* dirty implies accessed */
1493 hv_flags.set_accessed = 1;
1494 }
1495 break;
1496 }
1497
1498 states = vzalloc(states_buf_sz);
1499 if (!states)
1500 return -ENOMEM;
1501
1502 ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count,
1503 args.gpap_base, hv_flags, &written,
1504 states);
1505 if (ret)
1506 goto free_return;
1507
1508 /*
1509 * Overwrite states buffer with bitmap - the bits in hv_type_mask
1510 * correspond to bitfields in hv_gpa_page_access_state
1511 */
1512 for (i = 0; i < written; ++i)
1513 __assign_bit(i, (ulong *)states,
1514 states[i].as_uint8 & hv_type_mask);
1515
1516 /* zero the unused bits in the last byte(s) of the returned bitmap */
1517 for (i = written; i < bitmap_buf_sz * 8; ++i)
1518 __clear_bit(i, (ulong *)states);
1519
1520 if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz))
1521 ret = -EFAULT;
1522
1523 free_return:
1524 vfree(states);
1525 return ret;
1526 }
1527
1528 static long
mshv_partition_ioctl_set_msi_routing(struct mshv_partition * partition,void __user * user_args)1529 mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition,
1530 void __user *user_args)
1531 {
1532 struct mshv_user_irq_entry *entries = NULL;
1533 struct mshv_user_irq_table args;
1534 long ret;
1535
1536 if (copy_from_user(&args, user_args, sizeof(args)))
1537 return -EFAULT;
1538
1539 if (args.nr > MSHV_MAX_GUEST_IRQS ||
1540 mshv_field_nonzero(args, rsvd))
1541 return -EINVAL;
1542
1543 if (args.nr) {
1544 struct mshv_user_irq_table __user *urouting = user_args;
1545
1546 entries = vmemdup_user(urouting->entries,
1547 array_size(sizeof(*entries),
1548 args.nr));
1549 if (IS_ERR(entries))
1550 return PTR_ERR(entries);
1551 }
1552 ret = mshv_update_routing_table(partition, entries, args.nr);
1553 kvfree(entries);
1554
1555 return ret;
1556 }
1557
1558 static long
mshv_partition_ioctl_initialize(struct mshv_partition * partition)1559 mshv_partition_ioctl_initialize(struct mshv_partition *partition)
1560 {
1561 long ret;
1562
1563 if (partition->pt_initialized)
1564 return 0;
1565
1566 ret = hv_call_initialize_partition(partition->pt_id);
1567 if (ret)
1568 goto withdraw_mem;
1569
1570 ret = mshv_debugfs_partition_create(partition);
1571 if (ret)
1572 goto finalize_partition;
1573
1574 partition->pt_initialized = true;
1575
1576 return 0;
1577
1578 finalize_partition:
1579 hv_call_finalize_partition(partition->pt_id);
1580 withdraw_mem:
1581 hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
1582
1583 return ret;
1584 }
1585
1586 static long
mshv_partition_ioctl(struct file * filp,unsigned int ioctl,unsigned long arg)1587 mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
1588 {
1589 struct mshv_partition *partition = filp->private_data;
1590 long ret;
1591 void __user *uarg = (void __user *)arg;
1592
1593 if (mutex_lock_killable(&partition->pt_mutex))
1594 return -EINTR;
1595
1596 switch (ioctl) {
1597 case MSHV_INITIALIZE_PARTITION:
1598 ret = mshv_partition_ioctl_initialize(partition);
1599 break;
1600 case MSHV_SET_GUEST_MEMORY:
1601 ret = mshv_partition_ioctl_set_memory(partition, uarg);
1602 break;
1603 case MSHV_CREATE_VP:
1604 ret = mshv_partition_ioctl_create_vp(partition, uarg);
1605 break;
1606 case MSHV_IRQFD:
1607 ret = mshv_partition_ioctl_irqfd(partition, uarg);
1608 break;
1609 case MSHV_IOEVENTFD:
1610 ret = mshv_partition_ioctl_ioeventfd(partition, uarg);
1611 break;
1612 case MSHV_SET_MSI_ROUTING:
1613 ret = mshv_partition_ioctl_set_msi_routing(partition, uarg);
1614 break;
1615 case MSHV_GET_GPAP_ACCESS_BITMAP:
1616 ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition,
1617 uarg);
1618 break;
1619 case MSHV_ROOT_HVCALL:
1620 ret = mshv_ioctl_passthru_hvcall(partition, true, uarg);
1621 break;
1622 default:
1623 ret = -ENOTTY;
1624 }
1625
1626 mutex_unlock(&partition->pt_mutex);
1627 return ret;
1628 }
1629
1630 static int
disable_vp_dispatch(struct mshv_vp * vp)1631 disable_vp_dispatch(struct mshv_vp *vp)
1632 {
1633 int ret;
1634 struct hv_register_assoc dispatch_suspend = {
1635 .name = HV_REGISTER_DISPATCH_SUSPEND,
1636 .value.dispatch_suspend.suspended = 1,
1637 };
1638
1639 ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
1640 1, &dispatch_suspend);
1641 if (ret)
1642 vp_err(vp, "failed to suspend\n");
1643
1644 return ret;
1645 }
1646
1647 static int
get_vp_signaled_count(struct mshv_vp * vp,u64 * count)1648 get_vp_signaled_count(struct mshv_vp *vp, u64 *count)
1649 {
1650 int ret;
1651 struct hv_register_assoc root_signal_count = {
1652 .name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT,
1653 };
1654
1655 ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
1656 1, &root_signal_count);
1657
1658 if (ret) {
1659 vp_err(vp, "Failed to get root signal count");
1660 *count = 0;
1661 return ret;
1662 }
1663
1664 *count = root_signal_count.value.reg64;
1665
1666 return ret;
1667 }
1668
1669 static void
drain_vp_signals(struct mshv_vp * vp)1670 drain_vp_signals(struct mshv_vp *vp)
1671 {
1672 u64 hv_signal_count;
1673 u64 vp_signal_count;
1674
1675 get_vp_signaled_count(vp, &hv_signal_count);
1676
1677 vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
1678
1679 /*
1680 * There should be at most 1 outstanding notification, but be extra
1681 * careful anyway.
1682 */
1683 while (hv_signal_count != vp_signal_count) {
1684 WARN_ON(hv_signal_count - vp_signal_count != 1);
1685
1686 if (wait_event_interruptible(vp->run.vp_suspend_queue,
1687 vp->run.kicked_by_hv == 1))
1688 break;
1689 vp->run.kicked_by_hv = 0;
1690 vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
1691 }
1692 }
1693
drain_all_vps(const struct mshv_partition * partition)1694 static void drain_all_vps(const struct mshv_partition *partition)
1695 {
1696 int i;
1697 struct mshv_vp *vp;
1698
1699 /*
1700 * VPs are reachable from ISR. It is safe to not take the partition
1701 * lock because nobody else can enter this function and drop the
1702 * partition from the list.
1703 */
1704 for (i = 0; i < MSHV_MAX_VPS; i++) {
1705 vp = partition->pt_vp_array[i];
1706 if (!vp)
1707 continue;
1708 /*
1709 * Disable dispatching of the VP in the hypervisor. After this
1710 * the hypervisor guarantees it won't generate any signals for
1711 * the VP and the hypervisor's VP signal count won't change.
1712 */
1713 disable_vp_dispatch(vp);
1714 drain_vp_signals(vp);
1715 }
1716 }
1717
1718 static void
remove_partition(struct mshv_partition * partition)1719 remove_partition(struct mshv_partition *partition)
1720 {
1721 spin_lock(&mshv_root.pt_ht_lock);
1722 hlist_del_rcu(&partition->pt_hnode);
1723 spin_unlock(&mshv_root.pt_ht_lock);
1724
1725 synchronize_rcu();
1726 }
1727
1728 /*
1729 * Tear down a partition and remove it from the list.
1730 * Partition's refcount must be 0
1731 */
destroy_partition(struct mshv_partition * partition)1732 static void destroy_partition(struct mshv_partition *partition)
1733 {
1734 struct mshv_vp *vp;
1735 struct mshv_mem_region *region;
1736 struct hlist_node *n;
1737 int i;
1738
1739 if (refcount_read(&partition->pt_ref_count)) {
1740 pt_err(partition,
1741 "Attempt to destroy partition but refcount > 0\n");
1742 return;
1743 }
1744
1745 if (partition->pt_initialized) {
1746 /*
1747 * We only need to drain signals for root scheduler. This should be
1748 * done before removing the partition from the partition list.
1749 */
1750 if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
1751 drain_all_vps(partition);
1752
1753 /* Remove vps */
1754 for (i = 0; i < MSHV_MAX_VPS; ++i) {
1755 vp = partition->pt_vp_array[i];
1756 if (!vp)
1757 continue;
1758
1759 mshv_debugfs_vp_remove(vp);
1760 mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
1761 vp->vp_stats_pages);
1762
1763 if (vp->vp_register_page) {
1764 (void)hv_unmap_vp_state_page(partition->pt_id,
1765 vp->vp_index,
1766 HV_VP_STATE_PAGE_REGISTERS,
1767 virt_to_page(vp->vp_register_page),
1768 input_vtl_zero);
1769 vp->vp_register_page = NULL;
1770 }
1771
1772 (void)hv_unmap_vp_state_page(partition->pt_id,
1773 vp->vp_index,
1774 HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
1775 virt_to_page(vp->vp_intercept_msg_page),
1776 input_vtl_zero);
1777 vp->vp_intercept_msg_page = NULL;
1778
1779 if (vp->vp_ghcb_page) {
1780 (void)hv_unmap_vp_state_page(partition->pt_id,
1781 vp->vp_index,
1782 HV_VP_STATE_PAGE_GHCB,
1783 virt_to_page(vp->vp_ghcb_page),
1784 input_vtl_normal);
1785 vp->vp_ghcb_page = NULL;
1786 }
1787
1788 kfree(vp);
1789
1790 partition->pt_vp_array[i] = NULL;
1791 }
1792
1793 mshv_debugfs_partition_remove(partition);
1794
1795 /* Deallocates and unmaps everything including vcpus, GPA mappings etc */
1796 hv_call_finalize_partition(partition->pt_id);
1797
1798 partition->pt_initialized = false;
1799 }
1800
1801 remove_partition(partition);
1802
1803 hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
1804 hnode) {
1805 hlist_del(®ion->hnode);
1806 mshv_region_put(region);
1807 }
1808
1809 /* Withdraw and free all pages we deposited */
1810 hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
1811 hv_call_delete_partition(partition->pt_id);
1812
1813 mshv_free_routing_table(partition);
1814 kfree(partition);
1815 }
1816
1817 struct
mshv_partition_get(struct mshv_partition * partition)1818 mshv_partition *mshv_partition_get(struct mshv_partition *partition)
1819 {
1820 if (refcount_inc_not_zero(&partition->pt_ref_count))
1821 return partition;
1822 return NULL;
1823 }
1824
1825 struct
mshv_partition_find(u64 partition_id)1826 mshv_partition *mshv_partition_find(u64 partition_id)
1827 __must_hold(RCU)
1828 {
1829 struct mshv_partition *p;
1830
1831 hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode,
1832 partition_id)
1833 if (p->pt_id == partition_id)
1834 return p;
1835
1836 return NULL;
1837 }
1838
1839 void
mshv_partition_put(struct mshv_partition * partition)1840 mshv_partition_put(struct mshv_partition *partition)
1841 {
1842 if (refcount_dec_and_test(&partition->pt_ref_count))
1843 destroy_partition(partition);
1844 }
1845
static int
mshv_partition_release(struct inode *inode, struct file *filp)
{
	struct mshv_partition *partition = filp->private_data;

	mshv_eventfd_release(partition);

	cleanup_srcu_struct(&partition->pt_irq_srcu);

	mshv_partition_put(partition);

	return 0;
}

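/* Publish the partition in the hash table so mshv_partition_find() sees it */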
static int
add_partition(struct mshv_partition *partition)
{
	spin_lock(&mshv_root.pt_ht_lock);

	hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode,
		     partition->pt_id);

	spin_unlock(&mshv_root.pt_ht_lock);

	return 0;
}

static_assert(MSHV_NUM_CPU_FEATURES_BANKS ==
	      HV_PARTITION_PROCESSOR_FEATURES_BANKS);

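/*
 * Parse and validate the MSHV_CREATE_PARTITION ioctl argument (v1 or v2
 * layout) and translate it into hypervisor creation flags, creation
 * properties and isolation properties.
 */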
static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags,
					struct hv_partition_creation_properties *cr_props,
					union hv_partition_isolation_properties *isol_props)
{
	int i;
	struct mshv_create_partition_v2 args;
	union hv_partition_processor_features *disabled_procs;
	union hv_partition_processor_xsave_features *disabled_xsave;

	/* First, copy the v1 struct in case the user is on a previous version */
	if (copy_from_user(&args, user_arg,
			   sizeof(struct mshv_create_partition)))
		return -EFAULT;

	if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
	    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
		return -EINVAL;

	disabled_procs = &cr_props->disabled_processor_features;
	disabled_xsave = &cr_props->disabled_processor_xsave_features;

	/* Check if the user provided the newer struct with feature fields */
	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) {
		if (copy_from_user(&args, user_arg, sizeof(args)))
			return -EFAULT;

		/* Re-validate the v1 fields after the second copy_from_user() */
		if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
		    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
			return -EINVAL;

		if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS ||
		    mshv_field_nonzero(args, pt_rsvd) ||
		    mshv_field_nonzero(args, pt_rsvd1))
			return -EINVAL;

		/*
		 * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never
		 * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS
		 * (i.e. 2).
		 *
		 * Further banks (index >= 2) will be modifiable as 'early'
		 * properties via the set partition property hypercall.
		 */
		for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
			disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i];

#if IS_ENABLED(CONFIG_X86_64)
		disabled_xsave->as_uint64 = args.pt_disabled_xsave;
#else
		/*
		 * In practice this field is ignored on arm64, but it is
		 * safer to zero it in case it is ever used.
		 */
		disabled_xsave->as_uint64 = 0;

		if (mshv_field_nonzero(args, pt_rsvd2))
			return -EINVAL;
#endif
	} else {
		/*
		 * v1 behavior: try to enable everything. The hypervisor will
		 * disable features that are not supported. The banks can be
		 * queried via the get partition property hypercall.
		 */
		for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
			disabled_procs->as_uint64[i] = 0;

		disabled_xsave->as_uint64 = 0;
	}

	/* Only EXO partitions are supported */
	*pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
		    HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;

	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC))
		*pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC))
		*pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES))
		*pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_NESTED_VIRTUALIZATION))
		*pt_flags |= HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE;
	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_SMT_ENABLED_GUEST))
		*pt_flags |= HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST;

	isol_props->as_uint64 = 0;

	switch (args.pt_isolation) {
	case MSHV_PT_ISOLATION_NONE:
		isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE;
		break;
	}

	return 0;
}

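/*
 * Allocate and initialize a partition, create it in the hypervisor and
 * install a new file descriptor for it. On failure, everything set up so
 * far is unwound in reverse order.
 */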
static long
mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
{
	u64 creation_flags;
	struct hv_partition_creation_properties creation_properties;
	union hv_partition_isolation_properties isolation_properties;
	struct mshv_partition *partition;
	long ret;

	ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags,
					  &creation_properties,
					  &isolation_properties);
	if (ret)
		return ret;

	partition = kzalloc_obj(*partition);
	if (!partition)
		return -ENOMEM;

	partition->pt_module_dev = module_dev;
	partition->isolation_type = isolation_properties.isolation_type;

	refcount_set(&partition->pt_ref_count, 1);

	mutex_init(&partition->pt_mutex);

	mutex_init(&partition->pt_irq_lock);

	init_completion(&partition->async_hypercall);

	INIT_HLIST_HEAD(&partition->irq_ack_notifier_list);

	INIT_HLIST_HEAD(&partition->pt_devices);

	spin_lock_init(&partition->pt_mem_regions_lock);
	INIT_HLIST_HEAD(&partition->pt_mem_regions);

	mshv_eventfd_init(partition);

	ret = init_srcu_struct(&partition->pt_irq_srcu);
	if (ret)
		goto free_partition;

	ret = hv_call_create_partition(creation_flags,
				       creation_properties,
				       isolation_properties,
				       &partition->pt_id);
	if (ret)
		goto cleanup_irq_srcu;

	ret = add_partition(partition);
	if (ret)
		goto delete_partition;

	ret = mshv_init_async_handler(partition);
	if (!ret) {
		ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition",
							   &mshv_partition_fops,
							   partition, O_RDWR));
		if (ret >= 0)
			return ret;
	}
	remove_partition(partition);
delete_partition:
	hv_call_delete_partition(partition->pt_id);
cleanup_irq_srcu:
	cleanup_srcu_struct(&partition->pt_irq_srcu);
free_partition:
	kfree(partition);

	return ret;
}

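/*
 * Top-level ioctl handler for /dev/mshv itself (as opposed to partition or
 * vp fds). From userspace, e.g. (sketch):
 *
 *	int mshv = open("/dev/mshv", O_RDWR | O_CLOEXEC);
 *	struct mshv_create_partition args = { ... };
 *	int pt_fd = ioctl(mshv, MSHV_CREATE_PARTITION, &args);
 */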
static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
			   unsigned long arg)
{
	struct miscdevice *misc = filp->private_data;

	switch (ioctl) {
	case MSHV_CREATE_PARTITION:
		return mshv_ioctl_create_partition((void __user *)arg,
						   misc->this_device);
	case MSHV_ROOT_HVCALL:
		return mshv_ioctl_passthru_hvcall(NULL, false,
						  (void __user *)arg);
	}

	return -ENOTTY;
}

static int
mshv_dev_open(struct inode *inode, struct file *filp)
{
	return 0;
}

static int
mshv_dev_release(struct inode *inode, struct file *filp)
{
	return 0;
}

static int mshv_root_sched_online;

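/* Human-readable scheduler name, used only for the probe-time log message */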
static const char *scheduler_type_to_string(enum hv_scheduler_type type)
{
	switch (type) {
	case HV_SCHEDULER_TYPE_LP:
		return "classic scheduler without SMT";
	case HV_SCHEDULER_TYPE_LP_SMT:
		return "classic scheduler with SMT";
	case HV_SCHEDULER_TYPE_CORE_SMT:
		return "core scheduler";
	case HV_SCHEDULER_TYPE_ROOT:
		return "root scheduler";
	default:
		return "unknown scheduler";
	}
}

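/*
 * Derive the scheduler type for an L1VH partition: default to the core
 * scheduler, and report the root scheduler when the integrated scheduler
 * capability is present and enabled.
 */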
static int __init l1vh_retrieve_scheduler_type(enum hv_scheduler_type *out)
{
	u64 integrated_sched_enabled;
	int ret;

	*out = HV_SCHEDULER_TYPE_CORE_SMT;

	if (!mshv_root.vmm_caps.vmm_enable_integrated_scheduler)
		return 0;

	ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
						HV_PARTITION_PROPERTY_INTEGRATED_SCHEDULER_ENABLED,
						0, &integrated_sched_enabled,
						sizeof(integrated_sched_enabled));
	if (ret)
		return ret;

	if (integrated_sched_enabled)
		*out = HV_SCHEDULER_TYPE_ROOT;

	return 0;
}

/* TODO: move this to hv_common.c when it is needed outside this module */
static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out)
{
	struct hv_input_get_system_property *input;
	struct hv_output_get_system_property *output;
	unsigned long flags;
	u64 status;

	local_irq_save(flags);
	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	output = *this_cpu_ptr(hyperv_pcpu_output_arg);

	memset(input, 0, sizeof(*input));
	memset(output, 0, sizeof(*output));
	input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE;

	status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
	if (!hv_result_success(status)) {
		local_irq_restore(flags);
		pr_err("%s: %s\n", __func__, hv_result_to_string(status));
		return hv_result_to_errno(status);
	}

	*out = output->scheduler_type;
	local_irq_restore(flags);

	return 0;
}

/* Retrieve and stash the supported scheduler type */
static int __init mshv_retrieve_scheduler_type(struct device *dev)
{
	int ret;

	if (hv_l1vh_partition())
		ret = l1vh_retrieve_scheduler_type(&hv_scheduler_type);
	else
		ret = hv_retrieve_scheduler_type(&hv_scheduler_type);
	if (ret)
		return ret;

	dev_info(dev, "Hypervisor using %s\n",
		 scheduler_type_to_string(hv_scheduler_type));

	switch (hv_scheduler_type) {
	case HV_SCHEDULER_TYPE_CORE_SMT:
	case HV_SCHEDULER_TYPE_LP_SMT:
	case HV_SCHEDULER_TYPE_ROOT:
	case HV_SCHEDULER_TYPE_LP:
		/* Supported scheduler, nothing to do */
		break;
	default:
		dev_err(dev, "unsupported scheduler 0x%x, bailing.\n",
			hv_scheduler_type);
		return -EOPNOTSUPP;
	}

	return 0;
}

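/*
 * CPU hotplug online callback: allocate this CPU's hypercall input/output
 * buffers for the root scheduler path.
 */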
static int mshv_root_scheduler_init(unsigned int cpu)
{
	void **inputarg, **outputarg, *p;

	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
	outputarg = (void **)this_cpu_ptr(root_scheduler_output);

	/* Allocate two consecutive pages. One for input, one for output. */
	p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	*inputarg = p;
	*outputarg = (char *)p + HV_HYP_PAGE_SIZE;

	return 0;
}

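/* CPU hotplug offline callback: free this CPU's root scheduler buffers */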
static int mshv_root_scheduler_cleanup(unsigned int cpu)
{
	void *p, **inputarg, **outputarg;

	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
	outputarg = (void **)this_cpu_ptr(root_scheduler_output);

	p = *inputarg;

	*inputarg = NULL;
	*outputarg = NULL;

	kfree(p);

	return 0;
}

/* Must be called after retrieving the scheduler type */
static int
root_scheduler_init(struct device *dev)
{
	int ret;

	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
		return 0;

	root_scheduler_input = alloc_percpu(void *);
	root_scheduler_output = alloc_percpu(void *);

	if (!root_scheduler_input || !root_scheduler_output) {
		dev_err(dev, "Failed to allocate root scheduler buffers\n");
		ret = -ENOMEM;
		goto out;
	}

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched",
				mshv_root_scheduler_init,
				mshv_root_scheduler_cleanup);

	if (ret < 0) {
		dev_err(dev, "Failed to setup root scheduler state: %i\n", ret);
		goto out;
	}

	mshv_root_sched_online = ret;

	return 0;

out:
	free_percpu(root_scheduler_input);
	free_percpu(root_scheduler_output);
	return ret;
}

static void
root_scheduler_deinit(void)
{
	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
		return;

	cpuhp_remove_state(mshv_root_sched_online);
	free_percpu(root_scheduler_input);
	free_percpu(root_scheduler_output);
}

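/*
 * Query this partition's VMM capabilities. A failure is fatal only on L1VH
 * partitions; the root partition carries on with an empty capability set.
 */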
static int __init mshv_init_vmm_caps(struct device *dev)
{
	int ret;

	ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
						HV_PARTITION_PROPERTY_VMM_CAPABILITIES,
						0, &mshv_root.vmm_caps,
						sizeof(mshv_root.vmm_caps));
	if (ret && hv_l1vh_partition()) {
		dev_err(dev, "Failed to get VMM capabilities: %d\n", ret);
		return ret;
	}

	dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]);

	return 0;
}

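/*
 * Module init: register /dev/mshv, then bring up the SynIC, VMM caps,
 * scheduler state, debugfs and irqfd infrastructure, unwinding in reverse
 * order on failure.
 */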
static int __init mshv_parent_partition_init(void)
{
	int ret;
	struct device *dev;
	union hv_hypervisor_version_info version_info;

	if (!hv_parent_partition() || is_kdump_kernel())
		return -ENODEV;

	if (hv_get_hypervisor_version(&version_info))
		return -ENODEV;

	ret = misc_register(&mshv_dev);
	if (ret)
		return ret;

	dev = mshv_dev.this_device;

	if (version_info.build_number < MSHV_HV_MIN_VERSION ||
	    version_info.build_number > MSHV_HV_MAX_VERSION) {
		dev_err(dev, "Running on unvalidated Hyper-V version\n");
		dev_err(dev, "Versions: current: %u min: %u max: %u\n",
			version_info.build_number, MSHV_HV_MIN_VERSION,
			MSHV_HV_MAX_VERSION);
	}

	ret = mshv_synic_init(dev);
	if (ret)
		goto device_deregister;

	ret = mshv_init_vmm_caps(dev);
	if (ret)
		goto synic_cleanup;

	ret = mshv_retrieve_scheduler_type(dev);
	if (ret)
		goto synic_cleanup;

	ret = root_scheduler_init(dev);
	if (ret)
		goto synic_cleanup;

	ret = mshv_debugfs_init();
	if (ret)
		goto deinit_root_scheduler;

	ret = mshv_irqfd_wq_init();
	if (ret)
		goto exit_debugfs;

	spin_lock_init(&mshv_root.pt_ht_lock);
	hash_init(mshv_root.pt_htable);

	hv_setup_mshv_handler(mshv_isr);

	return 0;

exit_debugfs:
	mshv_debugfs_exit();
deinit_root_scheduler:
	root_scheduler_deinit();
synic_cleanup:
	mshv_synic_exit();
device_deregister:
	misc_deregister(&mshv_dev);
	return ret;
}

static void __exit mshv_parent_partition_exit(void)
{
	hv_setup_mshv_handler(NULL);
	mshv_port_table_fini();
	mshv_debugfs_exit();
	misc_deregister(&mshv_dev);
	mshv_irqfd_wq_cleanup();
	root_scheduler_deinit();
	mshv_synic_exit();
}

module_init(mshv_parent_partition_init);
module_exit(mshv_parent_partition_exit);