xref: /linux/drivers/hv/mshv_root_main.c (revision feb06d2690bb826fd33798a99ce5cff8d07b38f9)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2024, Microsoft Corporation.
4  *
5  * The main part of the mshv_root module, providing APIs to create
6  * and manage guest partitions.
7  *
8  * Authors: Microsoft Linux virtualization team
9  */
10 
11 #include <linux/entry-virt.h>
12 #include <linux/kernel.h>
13 #include <linux/module.h>
14 #include <linux/fs.h>
15 #include <linux/miscdevice.h>
16 #include <linux/slab.h>
17 #include <linux/file.h>
18 #include <linux/anon_inodes.h>
19 #include <linux/mm.h>
20 #include <linux/io.h>
21 #include <linux/cpuhotplug.h>
22 #include <linux/random.h>
23 #include <asm/mshyperv.h>
24 #include <linux/hyperv.h>
25 #include <linux/notifier.h>
26 #include <linux/reboot.h>
27 #include <linux/kexec.h>
28 #include <linux/page-flags.h>
29 #include <linux/crash_dump.h>
30 #include <linux/panic_notifier.h>
31 #include <linux/vmalloc.h>
32 #include <linux/rseq.h>
33 
34 #include "mshv_eventfd.h"
35 #include "mshv.h"
36 #include "mshv_root.h"
37 
38 MODULE_AUTHOR("Microsoft");
39 MODULE_LICENSE("GPL");
40 MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");
41 
42 /* TODO move this to another file when debugfs code is added */
43 enum hv_stats_vp_counters {			/* HV_THREAD_COUNTER */
44 #if defined(CONFIG_X86)
45 	VpRootDispatchThreadBlocked			= 202,
46 #elif defined(CONFIG_ARM64)
47 	VpRootDispatchThreadBlocked			= 94,
48 #endif
49 	VpStatsMaxCounter
50 };
51 
52 struct hv_stats_page {
53 	union {
54 		u64 vp_cntrs[VpStatsMaxCounter];		/* VP counters */
55 		u8 data[HV_HYP_PAGE_SIZE];
56 	};
57 } __packed;
58 
59 struct mshv_root mshv_root;
60 
61 enum hv_scheduler_type hv_scheduler_type;
62 
63 /* These can go away once the fast extended hypercall ABI is implemented. */
64 static void * __percpu *root_scheduler_input;
65 static void * __percpu *root_scheduler_output;
66 
67 static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
68 static int mshv_dev_open(struct inode *inode, struct file *filp);
69 static int mshv_dev_release(struct inode *inode, struct file *filp);
70 static int mshv_vp_release(struct inode *inode, struct file *filp);
71 static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
72 static int mshv_partition_release(struct inode *inode, struct file *filp);
73 static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
74 static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma);
75 static vm_fault_t mshv_vp_fault(struct vm_fault *vmf);
76 static int mshv_init_async_handler(struct mshv_partition *partition);
77 static void mshv_async_hvcall_handler(void *data, u64 *status);
78 
79 static const union hv_input_vtl input_vtl_zero;
80 static const union hv_input_vtl input_vtl_normal = {
81 	.target_vtl = HV_NORMAL_VTL,
82 	.use_target_vtl = 1,
83 };
84 
85 static const struct vm_operations_struct mshv_vp_vm_ops = {
86 	.fault = mshv_vp_fault,
87 };
88 
89 static const struct file_operations mshv_vp_fops = {
90 	.owner = THIS_MODULE,
91 	.release = mshv_vp_release,
92 	.unlocked_ioctl = mshv_vp_ioctl,
93 	.llseek = noop_llseek,
94 	.mmap = mshv_vp_mmap,
95 };
96 
97 static const struct file_operations mshv_partition_fops = {
98 	.owner = THIS_MODULE,
99 	.release = mshv_partition_release,
100 	.unlocked_ioctl = mshv_partition_ioctl,
101 	.llseek = noop_llseek,
102 };
103 
104 static const struct file_operations mshv_dev_fops = {
105 	.owner = THIS_MODULE,
106 	.open = mshv_dev_open,
107 	.release = mshv_dev_release,
108 	.unlocked_ioctl = mshv_dev_ioctl,
109 	.llseek = noop_llseek,
110 };
111 
112 static struct miscdevice mshv_dev = {
113 	.minor = MISC_DYNAMIC_MINOR,
114 	.name = "mshv",
115 	.fops = &mshv_dev_fops,
116 	.mode = 0600,
117 };
118 
119 /*
120  * Only allow hypercalls that have a u64 partition id as the first member of
121  * the input structure.
122  * These are sorted by value.
123  */
124 static u16 mshv_passthru_hvcalls[] = {
125 	HVCALL_GET_PARTITION_PROPERTY,
126 	HVCALL_GET_PARTITION_PROPERTY_EX,
127 	HVCALL_SET_PARTITION_PROPERTY,
128 	HVCALL_INSTALL_INTERCEPT,
129 	HVCALL_GET_VP_REGISTERS,
130 	HVCALL_SET_VP_REGISTERS,
131 	HVCALL_TRANSLATE_VIRTUAL_ADDRESS,
132 	HVCALL_CLEAR_VIRTUAL_INTERRUPT,
133 	HVCALL_REGISTER_INTERCEPT_RESULT,
134 	HVCALL_ASSERT_VIRTUAL_INTERRUPT,
135 	HVCALL_GET_GPA_PAGES_ACCESS_STATES,
136 	HVCALL_SIGNAL_EVENT_DIRECT,
137 	HVCALL_POST_MESSAGE_DIRECT,
138 	HVCALL_GET_VP_CPUID_VALUES,
139 };
140 
141 /*
142  * Only allow hypercalls that are safe to be called by the VMM with the host
143  * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a
144  * hypercall cannot be misused by the VMM before adding it to this list.
145  */
146 static u16 mshv_self_passthru_hvcalls[] = {
147 	HVCALL_GET_PARTITION_PROPERTY,
148 	HVCALL_GET_PARTITION_PROPERTY_EX,
149 };
150 
151 static bool mshv_hvcall_is_async(u16 code)
152 {
153 	switch (code) {
154 	case HVCALL_SET_PARTITION_PROPERTY:
155 		return true;
156 	default:
157 		break;
158 	}
159 	return false;
160 }
161 
162 static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id)
163 {
164 	int i;
165 	int n = ARRAY_SIZE(mshv_passthru_hvcalls);
166 	u16 *allowed_hvcalls = mshv_passthru_hvcalls;
167 
168 	if (pt_id == HV_PARTITION_ID_SELF) {
169 		n = ARRAY_SIZE(mshv_self_passthru_hvcalls);
170 		allowed_hvcalls = mshv_self_passthru_hvcalls;
171 	}
172 
173 	for (i = 0; i < n; ++i)
174 		if (allowed_hvcalls[i] == code)
175 			return true;
176 
177 	return false;
178 }
179 
180 static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
181 				      bool partition_locked,
182 				      void __user *user_args)
183 {
184 	u64 status;
185 	int ret = 0;
186 	bool is_async;
187 	struct mshv_root_hvcall args;
188 	struct page *page;
189 	unsigned int pages_order;
190 	void *input_pg = NULL;
191 	void *output_pg = NULL;
192 	u16 reps_completed;
193 	u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF;
194 
195 	if (copy_from_user(&args, user_args, sizeof(args)))
196 		return -EFAULT;
197 
198 	if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) ||
199 	    mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE)
200 		return -EINVAL;
201 
202 	if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
203 		return -EINVAL;
204 
205 	if (!mshv_passthru_hvcall_allowed(args.code, pt_id))
206 		return -EINVAL;
207 
208 	is_async = mshv_hvcall_is_async(args.code);
209 	if (is_async) {
210 		/* async hypercalls can only be called from partition fd */
211 		if (!partition || !partition_locked)
212 			return -EINVAL;
213 		ret = mshv_init_async_handler(partition);
214 		if (ret)
215 			return ret;
216 	}
217 
218 	pages_order = args.out_ptr ? 1 : 0;
219 	page = alloc_pages(GFP_KERNEL, pages_order);
220 	if (!page)
221 		return -ENOMEM;
222 	input_pg = page_address(page);
223 
224 	if (args.out_ptr)
225 		output_pg = (char *)input_pg + PAGE_SIZE;
226 	else
227 		output_pg = NULL;
228 
229 	if (copy_from_user(input_pg, (void __user *)args.in_ptr,
230 			   args.in_sz)) {
231 		ret = -EFAULT;
232 		goto free_pages_out;
233 	}
234 
235 	/*
236 	 * NOTE: This only works because all the allowed hypercalls' input
237 	 * structs begin with a u64 partition_id field.
238 	 */
239 	*(u64 *)input_pg = pt_id;
240 
241 	reps_completed = 0;
242 	do {
243 		if (args.reps) {
244 			status = hv_do_rep_hypercall_ex(args.code, args.reps,
245 							0, reps_completed,
246 							input_pg, output_pg);
247 			reps_completed = hv_repcomp(status);
248 		} else {
249 			status = hv_do_hypercall(args.code, input_pg, output_pg);
250 		}
251 
252 		if (hv_result(status) == HV_STATUS_CALL_PENDING) {
253 			if (is_async) {
254 				mshv_async_hvcall_handler(partition, &status);
255 			} else { /* Paranoia check. This shouldn't happen! */
256 				ret = -EBADFD;
257 				goto free_pages_out;
258 			}
259 		}
260 
261 		if (hv_result_success(status))
262 			break;
263 
264 		if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY)
265 			ret = hv_result_to_errno(status);
266 		else
267 			ret = hv_call_deposit_pages(NUMA_NO_NODE,
268 						    pt_id, 1);
269 	} while (!ret);
270 
271 	args.status = hv_result(status);
272 	args.reps = reps_completed;
273 	if (copy_to_user(user_args, &args, sizeof(args)))
274 		ret = -EFAULT;
275 
276 	if (!ret && output_pg &&
277 	    copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
278 		ret = -EFAULT;
279 
280 free_pages_out:
281 	free_pages((unsigned long)input_pg, pages_order);
282 
283 	return ret;
284 }
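
/*
 * Illustrative sketch (editor's addition, not part of the driver): a VMM
 * could issue an allowed passthru hypercall through the partition fd
 * roughly as below. Field names follow struct mshv_root_hvcall as used
 * above; the exact uAPI layout and hypercall input/output structs must be
 * taken from the uAPI and hypervisor headers. Note the kernel overwrites
 * the first u64 of the input buffer with the partition id.
 *
 *	struct {
 *		__u64 partition_id;	(overwritten by the kernel)
 *		__u32 property_code;
 *		__u32 padding;
 *	} in = { .property_code = SOME_PROPERTY_CODE };	(placeholder)
 *	__u64 out = 0;
 *	struct mshv_root_hvcall args = {
 *		.code    = HVCALL_GET_PARTITION_PROPERTY,
 *		.in_sz   = sizeof(in),
 *		.in_ptr  = (__u64)&in,
 *		.out_sz  = sizeof(out),
 *		.out_ptr = (__u64)&out,
 *	};
 *
 *	if (!ioctl(partition_fd, MSHV_ROOT_HVCALL, &args) && !args.status)
 *		use out as the property value;
 */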
285 
286 static inline bool is_ghcb_mapping_available(void)
287 {
288 #if IS_ENABLED(CONFIG_X86_64)
289 	return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE;
290 #else
291 	return false;
292 #endif
293 }
294 
295 static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
296 				 struct hv_register_assoc *registers)
297 {
298 	return hv_call_get_vp_registers(vp_index, partition_id,
299 					count, input_vtl_zero, registers);
300 }
301 
302 static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
303 				 struct hv_register_assoc *registers)
304 {
305 	return hv_call_set_vp_registers(vp_index, partition_id,
306 					count, input_vtl_zero, registers);
307 }
308 
309 /*
310  * Explicit guest vCPU suspend is asynchronous by nature (it is requested
311  * by a dom0 vCPU on behalf of a guest vCPU) and thus can race with
312  * "intercept" suspend, done by the hypervisor.
313  * "Intercept" suspend leads to asynchronous message delivery to dom0, which
314  * must be awaited to keep the VP loop consistent (i.e. no message pending
315  * upon VP resume).
316  * Intercept suspend can't happen while the VP is already explicitly
317  * suspended, so there are only two possible race scenarios:
318  *   1. intercept suspend bit set -> explicit suspend bit set -> message sent
319  *   2. intercept suspend bit set -> message sent -> explicit suspend bit set
320  * In either case, checking the intercept suspend bit after the explicit
321  * suspend request has succeeded reliably tells us whether there is a
322  * message to receive and deliver to the VMM.
323  */
324 static int
325 mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
326 {
327 	struct hv_register_assoc explicit_suspend = {
328 		.name = HV_REGISTER_EXPLICIT_SUSPEND
329 	};
330 	struct hv_register_assoc intercept_suspend = {
331 		.name = HV_REGISTER_INTERCEPT_SUSPEND
332 	};
333 	union hv_explicit_suspend_register *es =
334 		&explicit_suspend.value.explicit_suspend;
335 	union hv_intercept_suspend_register *is =
336 		&intercept_suspend.value.intercept_suspend;
337 	int ret;
338 
339 	es->suspended = 1;
340 
341 	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
342 				    1, &explicit_suspend);
343 	if (ret) {
344 		vp_err(vp, "Failed to explicitly suspend vCPU\n");
345 		return ret;
346 	}
347 
348 	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
349 				    1, &intercept_suspend);
350 	if (ret) {
351 		vp_err(vp, "Failed to get intercept suspend state\n");
352 		return ret;
353 	}
354 
355 	*message_in_flight = is->suspended;
356 
357 	return 0;
358 }
359 
360 /*
361  * This function is used when VPs are scheduled by the hypervisor's
362  * scheduler.
363  *
364  * The caller has to pass cleared HV_REGISTER_INTERCEPT_SUSPEND and
365  * HV_REGISTER_EXPLICIT_SUSPEND registers exactly in this order (the
366  * hypervisor clears them sequentially) to avoid mistakenly clearing a
367  * newly arrived HV_REGISTER_INTERCEPT_SUSPEND after the VP has been
368  * released from HV_REGISTER_EXPLICIT_SUSPEND, which could happen with
369  * the opposite order.
370  */
371 static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
372 {
373 	long ret;
374 	struct hv_register_assoc suspend_regs[2] = {
375 			{ .name = HV_REGISTER_INTERCEPT_SUSPEND },
376 			{ .name = HV_REGISTER_EXPLICIT_SUSPEND }
377 	};
378 	size_t count = ARRAY_SIZE(suspend_regs);
379 
380 	/* Resume VP execution */
381 	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
382 				    count, suspend_regs);
383 	if (ret) {
384 		vp_err(vp, "Failed to resume vp execution. %lx\n", ret);
385 		return ret;
386 	}
387 
388 	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
389 				       vp->run.kicked_by_hv == 1);
390 	if (ret) {
391 		bool message_in_flight;
392 
393 		/*
394 		 * The wait was interrupted by a signal: suspend the vCPU
395 		 * explicitly and pick up any message still in flight.
396 		 */
397 		ret = mshv_suspend_vp(vp, &message_in_flight);
398 		if (ret)
399 			return ret;
400 
401 		/* Return if no message in flight */
402 		if (!message_in_flight)
403 			return -EINTR;
404 
405 		/* Wait for the message in flight. */
406 		wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1);
407 	}
408 
409 	/*
410 	 * Reset the flag to make the wait_event call above work
411 	 * next time.
412 	 */
413 	vp->run.kicked_by_hv = 0;
414 
415 	return 0;
416 }
417 
418 static int
419 mshv_vp_dispatch(struct mshv_vp *vp, u32 flags,
420 		 struct hv_output_dispatch_vp *res)
421 {
422 	struct hv_input_dispatch_vp *input;
423 	struct hv_output_dispatch_vp *output;
424 	u64 status;
425 
426 	preempt_disable();
427 	input = *this_cpu_ptr(root_scheduler_input);
428 	output = *this_cpu_ptr(root_scheduler_output);
429 
430 	memset(input, 0, sizeof(*input));
431 	memset(output, 0, sizeof(*output));
432 
433 	input->partition_id = vp->vp_partition->pt_id;
434 	input->vp_index = vp->vp_index;
435 	input->time_slice = 0; /* Run forever until something happens */
436 	input->spec_ctrl = 0; /* TODO: set sensible flags */
437 	input->flags = flags;
438 
439 	vp->run.flags.root_sched_dispatched = 1;
440 	status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output);
441 	vp->run.flags.root_sched_dispatched = 0;
442 
443 	*res = *output;
444 	preempt_enable();
445 
446 	if (!hv_result_success(status))
447 		vp_err(vp, "%s: status %s\n", __func__,
448 		       hv_result_to_string(status));
449 
450 	return hv_result_to_errno(status);
451 }
452 
453 static int
454 mshv_vp_clear_explicit_suspend(struct mshv_vp *vp)
455 {
456 	struct hv_register_assoc explicit_suspend = {
457 		.name = HV_REGISTER_EXPLICIT_SUSPEND,
458 		.value.explicit_suspend.suspended = 0,
459 	};
460 	int ret;
461 
462 	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
463 				    1, &explicit_suspend);
464 
465 	if (ret)
466 		vp_err(vp, "Failed to unsuspend\n");
467 
468 	return ret;
469 }
470 
471 #if IS_ENABLED(CONFIG_X86_64)
472 static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
473 {
474 	if (!vp->vp_register_page)
475 		return 0;
476 	return vp->vp_register_page->interrupt_vectors.as_uint64;
477 }
478 #else
479 static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
480 {
481 	return 0;
482 }
483 #endif
484 
485 static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp)
486 {
487 	struct hv_stats_page **stats = vp->vp_stats_pages;
488 	u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->vp_cntrs;
489 	u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->vp_cntrs;
490 
491 	if (self_vp_cntrs[VpRootDispatchThreadBlocked])
492 		return self_vp_cntrs[VpRootDispatchThreadBlocked];
493 	return parent_vp_cntrs[VpRootDispatchThreadBlocked];
494 }
495 
496 static int
497 mshv_vp_wait_for_hv_kick(struct mshv_vp *vp)
498 {
499 	int ret;
500 
501 	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
502 				       (vp->run.kicked_by_hv == 1 &&
503 					!mshv_vp_dispatch_thread_blocked(vp)) ||
504 				       mshv_vp_interrupt_pending(vp));
505 	if (ret)
506 		return -EINTR;
507 
508 	vp->run.flags.root_sched_blocked = 0;
509 	vp->run.kicked_by_hv = 0;
510 
511 	return 0;
512 }
513 
514 /* Must be called with interrupts enabled */
515 static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
516 {
517 	long ret;
518 
519 	if (vp->run.flags.root_sched_blocked) {
520 		/*
521 		 * Dispatch state of this VP is blocked. Need to wait
522 		 * for the hypervisor to clear the blocked state before
523 		 * dispatching it.
524 		 */
525 		ret = mshv_vp_wait_for_hv_kick(vp);
526 		if (ret)
527 			return ret;
528 	}
529 
530 	do {
531 		u32 flags = 0;
532 		struct hv_output_dispatch_vp output;
533 
534 		if (__xfer_to_guest_mode_work_pending()) {
535 			ret = xfer_to_guest_mode_handle_work();
536 			if (ret)
537 				break;
538 		}
539 
540 		if (vp->run.flags.intercept_suspend)
541 			flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND;
542 
543 		if (mshv_vp_interrupt_pending(vp))
544 			flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION;
545 
546 		ret = mshv_vp_dispatch(vp, flags, &output);
547 		if (ret)
548 			break;
549 
550 		vp->run.flags.intercept_suspend = 0;
551 
552 		if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) {
553 			if (output.dispatch_event ==
554 						HV_VP_DISPATCH_EVENT_SUSPEND) {
555 				/*
556 				 * TODO: remove the warning once VP canceling
557 				 *	 is supported
558 				 */
559 				WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count),
560 					  "%s: vp#%d: unexpected explicit suspend\n",
561 					  __func__, vp->vp_index);
562 				/*
563 				 * Need to clear explicit suspend before
564 				 * dispatching.
565 				 * Explicit suspend is either:
566 				 * - set right after the first VP dispatch or
567 				 * - set explicitly via hypercall
568 				 * Since the latter case is not yet supported,
569 				 * simply clear it here.
570 				 */
571 				ret = mshv_vp_clear_explicit_suspend(vp);
572 				if (ret)
573 					break;
574 
575 				ret = mshv_vp_wait_for_hv_kick(vp);
576 				if (ret)
577 					break;
578 			} else {
579 				vp->run.flags.root_sched_blocked = 1;
580 				ret = mshv_vp_wait_for_hv_kick(vp);
581 				if (ret)
582 					break;
583 			}
584 		} else {
585 			/* HV_VP_DISPATCH_STATE_READY */
586 			if (output.dispatch_event ==
587 						HV_VP_DISPATCH_EVENT_INTERCEPT)
588 				vp->run.flags.intercept_suspend = 1;
589 		}
590 	} while (!vp->run.flags.intercept_suspend);
591 
592 	rseq_virt_userspace_exit();
593 
594 	return ret;
595 }
596 
597 static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
598 	      "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");
599 
600 static struct mshv_mem_region *
601 mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
602 {
603 	struct mshv_mem_region *region;
604 
605 	hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
606 		if (gfn >= region->start_gfn &&
607 		    gfn < region->start_gfn + region->nr_pages)
608 			return region;
609 	}
610 
611 	return NULL;
612 }
613 
614 #ifdef CONFIG_X86_64
615 static struct mshv_mem_region *
616 mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
617 {
618 	struct mshv_mem_region *region;
619 
620 	spin_lock(&p->pt_mem_regions_lock);
621 	region = mshv_partition_region_by_gfn(p, gfn);
622 	if (!region || !mshv_region_get(region)) {
623 		spin_unlock(&p->pt_mem_regions_lock);
624 		return NULL;
625 	}
626 	spin_unlock(&p->pt_mem_regions_lock);
627 
628 	return region;
629 }
630 
631 /**
632  * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts.
633  * @vp: Pointer to the virtual processor structure.
634  *
635  * This function processes GPA intercepts by identifying the memory region
636  * corresponding to the intercepted GPA, aligning the page offset, and
637  * mapping the required pages. It ensures that the region is valid and
638  * handles faults efficiently by mapping multiple pages at once.
639  *
640  * Return: true if the intercept was handled successfully, false otherwise.
641  */
642 static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
643 {
644 	struct mshv_partition *p = vp->vp_partition;
645 	struct mshv_mem_region *region;
646 	struct hv_x64_memory_intercept_message *msg;
647 	bool ret;
648 	u64 gfn;
649 
650 	msg = (struct hv_x64_memory_intercept_message *)
651 		vp->vp_intercept_msg_page->u.payload;
652 
653 	gfn = HVPFN_DOWN(msg->guest_physical_address);
654 
655 	region = mshv_partition_region_by_gfn_get(p, gfn);
656 	if (!region)
657 		return false;
658 
659 	/* Only movable memory ranges are supported for GPA intercepts */
660 	if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE)
661 		ret = mshv_region_handle_gfn_fault(region, gfn);
662 	else
663 		ret = false;
664 
665 	mshv_region_put(region);
666 
667 	return ret;
668 }
669 #else  /* CONFIG_X86_64 */
670 static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
671 #endif /* CONFIG_X86_64 */
672 
673 static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
674 {
675 	switch (vp->vp_intercept_msg_page->header.message_type) {
676 	case HVMSG_GPA_INTERCEPT:
677 		return mshv_handle_gpa_intercept(vp);
678 	}
679 	return false;
680 }
681 
682 static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
683 {
684 	long rc;
685 
686 	do {
687 		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
688 			rc = mshv_run_vp_with_root_scheduler(vp);
689 		else
690 			rc = mshv_run_vp_with_hyp_scheduler(vp);
691 	} while (rc == 0 && mshv_vp_handle_intercept(vp));
692 
693 	if (rc)
694 		return rc;
695 
696 	if (copy_to_user(ret_msg, vp->vp_intercept_msg_page,
697 			 sizeof(struct hv_message)))
698 		rc = -EFAULT;
699 
700 	return rc;
701 }
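
/*
 * Illustrative sketch (editor's addition): the VMM run loop over a vp fd
 * looks roughly like the following, where handle_intercept() is a
 * hypothetical VMM-side helper that decodes the returned struct hv_message.
 * A failing ioctl (e.g. errno == EINTR after a signal) leaves no intercept
 * message to consume; the VMM simply re-enters the loop when ready.
 *
 *	struct hv_message msg;
 *
 *	while (!ioctl(vp_fd, MSHV_RUN_VP, &msg))
 *		handle_intercept(&msg);
 */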
702 
703 static int
704 mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp,
705 				struct hv_vp_state_data state_data,
706 				unsigned long user_pfn, size_t page_count,
707 				bool is_set)
708 {
709 	int completed, ret = 0;
710 	unsigned long check;
711 	struct page **pages;
712 
713 	if (page_count > INT_MAX)
714 		return -EINVAL;
715 	/*
716 	 * Check the arithmetic for wraparound/overflow.
717 	 * The last page address in the buffer is:
718 	 * (user_pfn + (page_count - 1)) * PAGE_SIZE
719 	 */
720 	if (check_add_overflow(user_pfn, (page_count - 1), &check))
721 		return -EOVERFLOW;
722 	if (check_mul_overflow(check, PAGE_SIZE, &check))
723 		return -EOVERFLOW;
724 
725 	/* Pin user pages so hypervisor can copy directly to them */
726 	pages = kcalloc(page_count, sizeof(struct page *), GFP_KERNEL);
727 	if (!pages)
728 		return -ENOMEM;
729 
730 	for (completed = 0; completed < page_count; completed += ret) {
731 		unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE;
732 		int remaining = page_count - completed;
733 
734 		ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE,
735 					  &pages[completed]);
736 		if (ret < 0) {
737 			vp_err(vp, "%s: Failed to pin user pages error %i\n",
738 			       __func__, ret);
739 			goto unpin_pages;
740 		}
741 	}
742 
743 	if (is_set)
744 		ret = hv_call_set_vp_state(vp->vp_index,
745 					   vp->vp_partition->pt_id,
746 					   state_data, page_count, pages,
747 					   0, NULL);
748 	else
749 		ret = hv_call_get_vp_state(vp->vp_index,
750 					   vp->vp_partition->pt_id,
751 					   state_data, page_count, pages,
752 					   NULL);
753 
754 unpin_pages:
755 	unpin_user_pages(pages, completed);
756 	kfree(pages);
757 	return ret;
758 }
759 
760 static long
761 mshv_vp_ioctl_get_set_state(struct mshv_vp *vp,
762 			    struct mshv_get_set_vp_state __user *user_args,
763 			    bool is_set)
764 {
765 	struct mshv_get_set_vp_state args;
766 	long ret = 0;
767 	union hv_output_get_vp_state vp_state;
768 	u32 data_sz;
769 	struct hv_vp_state_data state_data = {};
770 
771 	if (copy_from_user(&args, user_args, sizeof(args)))
772 		return -EFAULT;
773 
774 	if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) ||
775 	    !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) ||
776 	    !PAGE_ALIGNED(args.buf_ptr))
777 		return -EINVAL;
778 
779 	if (!access_ok((void __user *)args.buf_ptr, args.buf_sz))
780 		return -EFAULT;
781 
782 	switch (args.type) {
783 	case MSHV_VP_STATE_LAPIC:
784 		state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE;
785 		data_sz = HV_HYP_PAGE_SIZE;
786 		break;
787 	case MSHV_VP_STATE_XSAVE:
788 	{
789 		u64 data_sz_64;
790 
791 		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
792 						     HV_PARTITION_PROPERTY_XSAVE_STATES,
793 						     &state_data.xsave.states.as_uint64);
794 		if (ret)
795 			return ret;
796 
797 		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
798 						     HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE,
799 						     &data_sz_64);
800 		if (ret)
801 			return ret;
802 
803 		data_sz = (u32)data_sz_64;
804 		state_data.xsave.flags = 0;
805 		/* Always request legacy states */
806 		state_data.xsave.states.legacy_x87 = 1;
807 		state_data.xsave.states.legacy_sse = 1;
808 		state_data.type = HV_GET_SET_VP_STATE_XSAVE;
809 		break;
810 	}
811 	case MSHV_VP_STATE_SIMP:
812 		state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE;
813 		data_sz = HV_HYP_PAGE_SIZE;
814 		break;
815 	case MSHV_VP_STATE_SIEFP:
816 		state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE;
817 		data_sz = HV_HYP_PAGE_SIZE;
818 		break;
819 	case MSHV_VP_STATE_SYNTHETIC_TIMERS:
820 		state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS;
821 		data_sz = sizeof(vp_state.synthetic_timers_state);
822 		break;
823 	default:
824 		return -EINVAL;
825 	}
826 
827 	if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz)))
828 		return -EFAULT;
829 
830 	if (data_sz > args.buf_sz)
831 		return -EINVAL;
832 
833 	/* If the data is transmitted via pfns, delegate to helper */
834 	if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) {
835 		unsigned long user_pfn = PFN_DOWN(args.buf_ptr);
836 		size_t page_count = PFN_DOWN(args.buf_sz);
837 
838 		return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn,
839 						       page_count, is_set);
840 	}
841 
842 	/* Paranoia check - this shouldn't happen! */
843 	if (data_sz > sizeof(vp_state)) {
844 		vp_err(vp, "Invalid vp state data size!\n");
845 		return -EINVAL;
846 	}
847 
848 	if (is_set) {
849 		if (copy_from_user(&vp_state, (void __user *)args.buf_ptr, data_sz))
850 			return -EFAULT;
851 
852 		return hv_call_set_vp_state(vp->vp_index,
853 					    vp->vp_partition->pt_id,
854 					    state_data, 0, NULL,
855 					    sizeof(vp_state), (u8 *)&vp_state);
856 	}
857 
858 	ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id,
859 				   state_data, 0, NULL, &vp_state);
860 	if (ret)
861 		return ret;
862 
863 	if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz))
864 		return -EFAULT;
865 
866 	return 0;
867 }
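
/*
 * Illustrative sketch (editor's addition): fetching the LAPIC state of a
 * vCPU with MSHV_GET_VP_STATE, assuming 4K pages and a page-aligned,
 * page-sized buffer as required by the checks above. On success the
 * buffer holds the state and st.buf_sz is updated to the size the
 * hypervisor actually uses for this state type.
 *
 *	void *buf = aligned_alloc(4096, 4096);
 *	struct mshv_get_set_vp_state st = {
 *		.type    = MSHV_VP_STATE_LAPIC,
 *		.buf_sz  = 4096,
 *		.buf_ptr = (__u64)buf,
 *	};
 *
 *	ret = ioctl(vp_fd, MSHV_GET_VP_STATE, &st);
 */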
868 
869 static long
870 mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
871 {
872 	struct mshv_vp *vp = filp->private_data;
873 	long r = -ENOTTY;
874 
875 	if (mutex_lock_killable(&vp->vp_mutex))
876 		return -EINTR;
877 
878 	switch (ioctl) {
879 	case MSHV_RUN_VP:
880 		r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg);
881 		break;
882 	case MSHV_GET_VP_STATE:
883 		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false);
884 		break;
885 	case MSHV_SET_VP_STATE:
886 		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true);
887 		break;
888 	case MSHV_ROOT_HVCALL:
889 		r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false,
890 					       (void __user *)arg);
891 		break;
892 	default:
893 		vp_warn(vp, "Invalid ioctl: %#x\n", ioctl);
894 		break;
895 	}
896 	mutex_unlock(&vp->vp_mutex);
897 
898 	return r;
899 }
900 
901 static vm_fault_t mshv_vp_fault(struct vm_fault *vmf)
902 {
903 	struct mshv_vp *vp = vmf->vma->vm_file->private_data;
904 
905 	switch (vmf->vma->vm_pgoff) {
906 	case MSHV_VP_MMAP_OFFSET_REGISTERS:
907 		vmf->page = virt_to_page(vp->vp_register_page);
908 		break;
909 	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
910 		vmf->page = virt_to_page(vp->vp_intercept_msg_page);
911 		break;
912 	case MSHV_VP_MMAP_OFFSET_GHCB:
913 		vmf->page = virt_to_page(vp->vp_ghcb_page);
914 		break;
915 	default:
916 		return VM_FAULT_SIGBUS;
917 	}
918 
919 	get_page(vmf->page);
920 
921 	return 0;
922 }
923 
924 static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma)
925 {
926 	struct mshv_vp *vp = file->private_data;
927 
928 	switch (vma->vm_pgoff) {
929 	case MSHV_VP_MMAP_OFFSET_REGISTERS:
930 		if (!vp->vp_register_page)
931 			return -ENODEV;
932 		break;
933 	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
934 		if (!vp->vp_intercept_msg_page)
935 			return -ENODEV;
936 		break;
937 	case MSHV_VP_MMAP_OFFSET_GHCB:
938 		if (!vp->vp_ghcb_page)
939 			return -ENODEV;
940 		break;
941 	default:
942 		return -EINVAL;
943 	}
944 
945 	vma->vm_ops = &mshv_vp_vm_ops;
946 	return 0;
947 }
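
/*
 * Illustrative sketch (editor's addition): user space maps the per-VP
 * shared pages by passing the page offsets above, scaled to bytes, to
 * mmap() on the vp fd, e.g. for the register page:
 *
 *	regs = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		    vp_fd, MSHV_VP_MMAP_OFFSET_REGISTERS * page_size);
 *
 * mshv_vp_mmap() rejects offsets whose backing page was never mapped for
 * this partition (e.g. the register page of an encrypted partition), and
 * mshv_vp_fault() then backs the VMA with the corresponding kernel page.
 */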
948 
949 static int
950 mshv_vp_release(struct inode *inode, struct file *filp)
951 {
952 	struct mshv_vp *vp = filp->private_data;
953 
954 	/* Rest of VP cleanup happens in destroy_partition() */
955 	mshv_partition_put(vp->vp_partition);
956 	return 0;
957 }
958 
959 static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
960 				void *stats_pages[])
961 {
962 	union hv_stats_object_identity identity = {
963 		.vp.partition_id = partition_id,
964 		.vp.vp_index = vp_index,
965 	};
966 
967 	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
968 	hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
969 
970 	identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
971 	hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
972 }
973 
974 static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
975 			     void *stats_pages[])
976 {
977 	union hv_stats_object_identity identity = {
978 		.vp.partition_id = partition_id,
979 		.vp.vp_index = vp_index,
980 	};
981 	int err;
982 
983 	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
984 	err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
985 				&stats_pages[HV_STATS_AREA_SELF]);
986 	if (err)
987 		return err;
988 
989 	identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
990 	err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
991 				&stats_pages[HV_STATS_AREA_PARENT]);
992 	if (err)
993 		goto unmap_self;
994 
995 	return 0;
996 
997 unmap_self:
998 	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
999 	hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
1000 	return err;
1001 }
1002 
1003 static long
1004 mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
1005 			       void __user *arg)
1006 {
1007 	struct mshv_create_vp args;
1008 	struct mshv_vp *vp;
1009 	struct page *intercept_msg_page, *register_page, *ghcb_page;
1010 	void *stats_pages[2];
1011 	long ret;
1012 
1013 	if (copy_from_user(&args, arg, sizeof(args)))
1014 		return -EFAULT;
1015 
1016 	if (args.vp_index >= MSHV_MAX_VPS)
1017 		return -EINVAL;
1018 
1019 	if (partition->pt_vp_array[args.vp_index])
1020 		return -EEXIST;
1021 
1022 	ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index,
1023 				0 /* Only valid for root partition VPs */);
1024 	if (ret)
1025 		return ret;
1026 
1027 	ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
1028 				   HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
1029 				   input_vtl_zero, &intercept_msg_page);
1030 	if (ret)
1031 		goto destroy_vp;
1032 
1033 	if (!mshv_partition_encrypted(partition)) {
1034 		ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
1035 					   HV_VP_STATE_PAGE_REGISTERS,
1036 					   input_vtl_zero, &register_page);
1037 		if (ret)
1038 			goto unmap_intercept_message_page;
1039 	}
1040 
1041 	if (mshv_partition_encrypted(partition) &&
1042 	    is_ghcb_mapping_available()) {
1043 		ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
1044 					   HV_VP_STATE_PAGE_GHCB,
1045 					   input_vtl_normal, &ghcb_page);
1046 		if (ret)
1047 			goto unmap_register_page;
1048 	}
1049 
1050 	/*
1051 	 * This mapping of the stats pages is used to detect whether the
1052 	 * dispatch thread is blocked - only relevant for the root scheduler.
1053 	 */
1054 	if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) {
1055 		ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
1056 					stats_pages);
1057 		if (ret)
1058 			goto unmap_ghcb_page;
1059 	}
1060 
1061 	vp = kzalloc(sizeof(*vp), GFP_KERNEL);
1062 	if (!vp) {
1063 		ret = -ENOMEM;
1064 		goto unmap_stats_pages;
1065 	}
1064 
1065 	vp->vp_partition = mshv_partition_get(partition);
1066 	if (!vp->vp_partition) {
1067 		ret = -EBADF;
1068 		goto free_vp;
1069 	}
1070 
1071 	mutex_init(&vp->vp_mutex);
1072 	init_waitqueue_head(&vp->run.vp_suspend_queue);
1073 	atomic64_set(&vp->run.vp_signaled_count, 0);
1074 
1075 	vp->vp_index = args.vp_index;
1076 	vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page);
1077 	if (!mshv_partition_encrypted(partition))
1078 		vp->vp_register_page = page_to_virt(register_page);
1079 
1080 	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
1081 		vp->vp_ghcb_page = page_to_virt(ghcb_page);
1082 
1083 	if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
1084 		memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));
1085 
1086 	/*
1087 	 * Keep anon_inode_getfd last: it installs the fd in the fd table and
1088 	 * thus makes the VP state reachable from user space.
1089 	 */
1090 	ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp,
1091 			       O_RDWR | O_CLOEXEC);
1092 	if (ret < 0)
1093 		goto put_partition;
1094 
1095 	/* already exclusive with the partition mutex for all ioctls */
1096 	/* Already serialized: the partition mutex is held for all partition ioctls */
1097 	partition->pt_vp_array[args.vp_index] = vp;
1098 
1099 	return ret;
1100 
1101 put_partition:
1102 	mshv_partition_put(partition);
1103 free_vp:
1104 	kfree(vp);
1105 unmap_stats_pages:
1106 	if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
1107 		mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages);
1108 unmap_ghcb_page:
1109 	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
1110 		hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
1111 				       HV_VP_STATE_PAGE_GHCB, ghcb_page,
1112 				       input_vtl_normal);
1113 unmap_register_page:
1114 	if (!mshv_partition_encrypted(partition))
1115 		hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
1116 				       HV_VP_STATE_PAGE_REGISTERS,
1117 				       register_page, input_vtl_zero);
1118 unmap_intercept_message_page:
1119 	hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
1120 			       HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
1121 			       intercept_msg_page, input_vtl_zero);
1122 destroy_vp:
1123 	hv_call_delete_vp(partition->pt_id, args.vp_index);
1124 	return ret;
1125 }
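
/*
 * Illustrative sketch (editor's addition): creating a vCPU returns a new
 * anonymous-inode fd for the VP, which is then used for MSHV_RUN_VP, the
 * state ioctls and mmap() of the shared pages:
 *
 *	struct mshv_create_vp vp_args = { .vp_index = 0 };
 *	int vp_fd = ioctl(partition_fd, MSHV_CREATE_VP, &vp_args);
 *
 *	if (vp_fd >= 0)
 *		run the vCPU via ioctl(vp_fd, MSHV_RUN_VP, ...);
 */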
1126 
1127 static int mshv_init_async_handler(struct mshv_partition *partition)
1128 {
1129 	if (completion_done(&partition->async_hypercall)) {
1130 		pt_err(partition,
1131 		       "Cannot issue async hypercall while another one in progress!\n");
1132 		return -EPERM;
1133 	}
1134 
1135 	reinit_completion(&partition->async_hypercall);
1136 	return 0;
1137 }
1138 
1139 static void mshv_async_hvcall_handler(void *data, u64 *status)
1140 {
1141 	struct mshv_partition *partition = data;
1142 
1143 	wait_for_completion(&partition->async_hypercall);
1144 	pt_dbg(partition, "Async hypercall completed!\n");
1145 
1146 	*status = partition->async_hypercall_status;
1147 }
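
/*
 * Editor's note (hedged): the completion waited on above is expected to be
 * signalled from the message-handling path outside this file, once the
 * hypervisor delivers the async-completion message for the partition and
 * partition->async_hypercall_status has been recorded. The overall flow is
 * roughly:
 *
 *	mshv_init_async_handler()	   reserve the single async slot
 *	hv_do_hypercall() -> CALL_PENDING  hypervisor completes it later
 *	mshv_async_hvcall_handler()	   wait_for_completion(), read status
 */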
1148 
1149 /*
1150  * NB: caller checks and makes sure mem->size is page aligned
1151  * Returns: 0 with regionpp updated on success, or -errno
1152  */
1153 static int mshv_partition_create_region(struct mshv_partition *partition,
1154 					struct mshv_user_mem_region *mem,
1155 					struct mshv_mem_region **regionpp,
1156 					bool is_mmio)
1157 {
1158 	struct mshv_mem_region *rg;
1159 	u64 nr_pages = HVPFN_DOWN(mem->size);
1160 
1161 	/* Reject overlapping regions */
1162 	spin_lock(&partition->pt_mem_regions_lock);
1163 	hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) {
1164 		if (mem->guest_pfn + nr_pages <= rg->start_gfn ||
1165 		    rg->start_gfn + rg->nr_pages <= mem->guest_pfn)
1166 			continue;
1167 		spin_unlock(&partition->pt_mem_regions_lock);
1168 		return -EEXIST;
1169 	}
1170 	spin_unlock(&partition->pt_mem_regions_lock);
1171 
1172 	rg = mshv_region_create(mem->guest_pfn, nr_pages,
1173 				mem->userspace_addr, mem->flags);
1174 	if (IS_ERR(rg))
1175 		return PTR_ERR(rg);
1176 
1177 	if (is_mmio)
1178 		rg->type = MSHV_REGION_TYPE_MMIO;
1179 	else if (mshv_partition_encrypted(partition) ||
1180 		 !mshv_region_movable_init(rg))
1181 		rg->type = MSHV_REGION_TYPE_MEM_PINNED;
1182 	else
1183 		rg->type = MSHV_REGION_TYPE_MEM_MOVABLE;
1184 
1185 	rg->partition = partition;
1186 
1187 	*regionpp = rg;
1188 
1189 	return 0;
1190 }
1191 
1192 /**
1193  * mshv_prepare_pinned_region - Pin and map memory regions
1194  * @region: Pointer to the memory region structure
1195  *
1196  * This function processes memory regions that are explicitly marked as pinned.
1197  * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based
1198  * population. The function ensures the region is properly populated, handles
1199  * encryption requirements for SNP partitions if applicable, maps the region,
1200  * and performs necessary sharing or eviction operations based on the mapping
1201  * result.
1202  *
1203  * Return: 0 on success, negative error code on failure.
1204  */
1205 static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
1206 {
1207 	struct mshv_partition *partition = region->partition;
1208 	int ret;
1209 
1210 	ret = mshv_region_pin(region);
1211 	if (ret) {
1212 		pt_err(partition, "Failed to pin memory region: %d\n",
1213 		       ret);
1214 		goto err_out;
1215 	}
1216 
1217 	/*
1218 	 * For an SNP partition, host access to every memory region that is
1219 	 * going to be mapped into the partition must first be released. This
1220 	 * is ensured by an additional hypercall which updates the SLAT to
1221 	 * revoke host access to those guest memory regions before they are
1222 	 * mapped below.
1223 	 */
1224 	if (mshv_partition_encrypted(partition)) {
1225 		ret = mshv_region_unshare(region);
1226 		if (ret) {
1227 			pt_err(partition,
1228 			       "Failed to unshare memory region (guest_pfn: %llu): %d\n",
1229 			       region->start_gfn, ret);
1230 			goto invalidate_region;
1231 		}
1232 	}
1233 
1234 	ret = mshv_region_map(region);
1235 	if (ret && mshv_partition_encrypted(partition)) {
1236 		int shrc;
1237 
1238 		shrc = mshv_region_share(region);
1239 		if (!shrc)
1240 			goto invalidate_region;
1241 
1242 		pt_err(partition,
1243 		       "Failed to share memory region (guest_pfn: %llu): %d\n",
1244 		       region->start_gfn, shrc);
1245 		/*
1246 		 * Don't unpin if marking the region shared failed: the pages
1247 		 * are no longer mapped in the host, i.e. the root partition.
1248 		 */
1249 		goto err_out;
1250 	}
1251 
1252 	return 0;
1253 
1254 invalidate_region:
1255 	mshv_region_invalidate(region);
1256 err_out:
1257 	return ret;
1258 }
1259 
1260 /*
1261  * This maps two things: guest RAM and, for PCI passthrough, MMIO space.
1262  *
1263  * mmio:
1264  *  - vfio overloads vm_pgoff to store the mmio start pfn/spa.
1265  *  - Two things need to happen for mapping mmio range:
1266  *	1. mapped at the uaddr so the VMM can access it.
1267  *	2. mapped in the hwpt (gfn <-> mmio phys addr) so the guest can access it.
1268  *
1269  *   This function takes care of the second. The first one is managed by vfio,
1270  *   and hence is taken care of via vfio_pci_mmap_fault().
1271  */
1272 static long
1273 mshv_map_user_memory(struct mshv_partition *partition,
1274 		     struct mshv_user_mem_region mem)
1275 {
1276 	struct mshv_mem_region *region;
1277 	struct vm_area_struct *vma;
1278 	bool is_mmio;
1279 	ulong mmio_pfn;
1280 	long ret;
1281 
1282 	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
1283 	    !access_ok((const void *)mem.userspace_addr, mem.size))
1284 		return -EINVAL;
1285 
1286 	mmap_read_lock(current->mm);
1287 	vma = vma_lookup(current->mm, mem.userspace_addr);
1288 	is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
1289 	mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
1290 	mmap_read_unlock(current->mm);
1291 
1292 	if (!vma)
1293 		return -EINVAL;
1294 
1295 	ret = mshv_partition_create_region(partition, &mem, &region,
1296 					   is_mmio);
1297 	if (ret)
1298 		return ret;
1299 
1300 	switch (region->type) {
1301 	case MSHV_REGION_TYPE_MEM_PINNED:
1302 		ret = mshv_prepare_pinned_region(region);
1303 		break;
1304 	case MSHV_REGION_TYPE_MEM_MOVABLE:
1305 		/*
1306 		 * For movable memory regions, remap with no access to let
1307 		 * the hypervisor track dirty pages, enabling pre-copy live
1308 		 * migration.
1309 		 */
1310 		ret = hv_call_map_gpa_pages(partition->pt_id,
1311 					    region->start_gfn,
1312 					    region->nr_pages,
1313 					    HV_MAP_GPA_NO_ACCESS, NULL);
1314 		break;
1315 	case MSHV_REGION_TYPE_MMIO:
1316 		ret = hv_call_map_mmio_pages(partition->pt_id,
1317 					     region->start_gfn,
1318 					     mmio_pfn,
1319 					     region->nr_pages);
1320 		break;
1321 	}
1322 
1323 	if (ret)
1324 		goto errout;
1325 
1326 	spin_lock(&partition->pt_mem_regions_lock);
1327 	hlist_add_head(&region->hnode, &partition->pt_mem_regions);
1328 	spin_unlock(&partition->pt_mem_regions_lock);
1329 
1330 	return 0;
1331 
1332 errout:
1333 	vfree(region);
1334 	return ret;
1335 }
1336 
1337 /* Called for unmapping both the guest ram and the mmio space */
1338 static long
1339 mshv_unmap_user_memory(struct mshv_partition *partition,
1340 		       struct mshv_user_mem_region mem)
1341 {
1342 	struct mshv_mem_region *region;
1343 
1344 	if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
1345 		return -EINVAL;
1346 
1347 	spin_lock(&partition->pt_mem_regions_lock);
1348 
1349 	region = mshv_partition_region_by_gfn(partition, mem.guest_pfn);
1350 	if (!region) {
1351 		spin_unlock(&partition->pt_mem_regions_lock);
1352 		return -ENOENT;
1353 	}
1354 
1355 	/* Paranoia check */
1356 	if (region->start_uaddr != mem.userspace_addr ||
1357 	    region->start_gfn != mem.guest_pfn ||
1358 	    region->nr_pages != HVPFN_DOWN(mem.size)) {
1359 		spin_unlock(&partition->pt_mem_regions_lock);
1360 		return -EINVAL;
1361 	}
1362 
1363 	hlist_del(&region->hnode);
1364 
1365 	spin_unlock(&partition->pt_mem_regions_lock);
1366 
1367 	mshv_region_put(region);
1368 
1369 	return 0;
1370 }
1371 
1372 static long
1373 mshv_partition_ioctl_set_memory(struct mshv_partition *partition,
1374 				struct mshv_user_mem_region __user *user_mem)
1375 {
1376 	struct mshv_user_mem_region mem;
1377 
1378 	if (copy_from_user(&mem, user_mem, sizeof(mem)))
1379 		return -EFAULT;
1380 
1381 	if (!mem.size ||
1382 	    !PAGE_ALIGNED(mem.size) ||
1383 	    !PAGE_ALIGNED(mem.userspace_addr) ||
1384 	    (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) ||
1385 	    mshv_field_nonzero(mem, rsvd))
1386 		return -EINVAL;
1387 
1388 	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))
1389 		return mshv_unmap_user_memory(partition, mem);
1390 
1391 	return mshv_map_user_memory(partition, mem);
1392 }
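
/*
 * Illustrative sketch (editor's addition): mapping anonymous memory as
 * guest RAM with MSHV_SET_GUEST_MEMORY. Both size and userspace_addr must
 * be page aligned; the writable flag bit name below is taken from the uAPI
 * header and should be verified there. Setting BIT(MSHV_SET_MEM_BIT_UNMAP)
 * with the same guest_pfn/userspace_addr/size tears the region down again.
 *
 *	void *ram = mmap(NULL, ram_size, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	struct mshv_user_mem_region mem = {
 *		.size           = ram_size,
 *		.guest_pfn      = gpa_base / 4096,	(4K hypervisor pages)
 *		.userspace_addr = (__u64)ram,
 *		.flags          = BIT(MSHV_SET_MEM_BIT_WRITABLE),
 *	};
 *
 *	ret = ioctl(partition_fd, MSHV_SET_GUEST_MEMORY, &mem);
 */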
1393 
1394 static long
1395 mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition,
1396 			       void __user *user_args)
1397 {
1398 	struct mshv_user_ioeventfd args;
1399 
1400 	if (copy_from_user(&args, user_args, sizeof(args)))
1401 		return -EFAULT;
1402 
1403 	return mshv_set_unset_ioeventfd(partition, &args);
1404 }
1405 
1406 static long
1407 mshv_partition_ioctl_irqfd(struct mshv_partition *partition,
1408 			   void __user *user_args)
1409 {
1410 	struct mshv_user_irqfd args;
1411 
1412 	if (copy_from_user(&args, user_args, sizeof(args)))
1413 		return -EFAULT;
1414 
1415 	return mshv_set_unset_irqfd(partition, &args);
1416 }
1417 
1418 static long
1419 mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition,
1420 					    void __user *user_args)
1421 {
1422 	struct mshv_gpap_access_bitmap args;
1423 	union hv_gpa_page_access_state *states;
1424 	long ret, i;
1425 	union hv_gpa_page_access_state_flags hv_flags = {};
1426 	u8 hv_type_mask;
1427 	ulong bitmap_buf_sz, states_buf_sz;
1428 	int written = 0;
1429 
1430 	if (copy_from_user(&args, user_args, sizeof(args)))
1431 		return -EFAULT;
1432 
1433 	if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT ||
1434 	    args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT ||
1435 	    mshv_field_nonzero(args, rsvd) || !args.page_count ||
1436 	    !args.bitmap_ptr)
1437 		return -EINVAL;
1438 
1439 	if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz))
1440 		return -E2BIG;
1441 
1442 	/* Num bytes needed to store bitmap; one bit per page rounded up */
1443 	bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8);
1444 
1445 	/* Sanity check */
1446 	if (bitmap_buf_sz > states_buf_sz)
1447 		return -EBADFD;
1448 
1449 	switch (args.access_type) {
1450 	case MSHV_GPAP_ACCESS_TYPE_ACCESSED:
1451 		hv_type_mask = 1;
1452 		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
1453 			hv_flags.clear_accessed = 1;
1454 			/* not accessed implies not dirty */
1455 			hv_flags.clear_dirty = 1;
1456 		} else { /* MSHV_GPAP_ACCESS_OP_SET */
1457 			hv_flags.set_accessed = 1;
1458 		}
1459 		break;
1460 	case MSHV_GPAP_ACCESS_TYPE_DIRTY:
1461 		hv_type_mask = 2;
1462 		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
1463 			hv_flags.clear_dirty = 1;
1464 		} else { /* MSHV_GPAP_ACCESS_OP_SET */
1465 			hv_flags.set_dirty = 1;
1466 			/* dirty implies accessed */
1467 			hv_flags.set_accessed = 1;
1468 		}
1469 		break;
1470 	}
1471 
1472 	states = vzalloc(states_buf_sz);
1473 	if (!states)
1474 		return -ENOMEM;
1475 
1476 	ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count,
1477 					    args.gpap_base, hv_flags, &written,
1478 					    states);
1479 	if (ret)
1480 		goto free_return;
1481 
1482 	/*
1483 	 * Overwrite states buffer with bitmap - the bits in hv_type_mask
1484 	 * correspond to bitfields in hv_gpa_page_access_state
1485 	 */
1486 	for (i = 0; i < written; ++i)
1487 		__assign_bit(i, (ulong *)states,
1488 			     states[i].as_uint8 & hv_type_mask);
1489 
1490 	/* zero the unused bits in the last byte(s) of the returned bitmap */
1491 	for (i = written; i < bitmap_buf_sz * 8; ++i)
1492 		__clear_bit(i, (ulong *)states);
1493 
1494 	if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz))
1495 		ret = -EFAULT;
1496 
1497 free_return:
1498 	vfree(states);
1499 	return ret;
1500 }
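
/*
 * Illustrative sketch (editor's addition): a pre-copy live-migration pass
 * could fetch and clear the dirty bits for a range of guest pages like
 * this; one bit per page is written to bitmap_ptr, as computed above.
 *
 *	__u8 bitmap[(nr_pages + 7) / 8];	(nr_pages chosen by the VMM)
 *	struct mshv_gpap_access_bitmap args = {
 *		.access_type = MSHV_GPAP_ACCESS_TYPE_DIRTY,
 *		.access_op   = MSHV_GPAP_ACCESS_OP_CLEAR,
 *		.gpap_base   = first_gfn,
 *		.page_count  = nr_pages,
 *		.bitmap_ptr  = (__u64)bitmap,
 *	};
 *
 *	ret = ioctl(partition_fd, MSHV_GET_GPAP_ACCESS_BITMAP, &args);
 */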
1501 
1502 static long
1503 mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition,
1504 				     void __user *user_args)
1505 {
1506 	struct mshv_user_irq_entry *entries = NULL;
1507 	struct mshv_user_irq_table args;
1508 	long ret;
1509 
1510 	if (copy_from_user(&args, user_args, sizeof(args)))
1511 		return -EFAULT;
1512 
1513 	if (args.nr > MSHV_MAX_GUEST_IRQS ||
1514 	    mshv_field_nonzero(args, rsvd))
1515 		return -EINVAL;
1516 
1517 	if (args.nr) {
1518 		struct mshv_user_irq_table __user *urouting = user_args;
1519 
1520 		entries = vmemdup_user(urouting->entries,
1521 				       array_size(sizeof(*entries),
1522 						  args.nr));
1523 		if (IS_ERR(entries))
1524 			return PTR_ERR(entries);
1525 	}
1526 	ret = mshv_update_routing_table(partition, entries, args.nr);
1527 	kvfree(entries);
1528 
1529 	return ret;
1530 }
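
/*
 * Illustrative sketch (editor's addition): the VMM replaces the whole MSI
 * routing table in one call. The entries[] member follows the header
 * struct directly; the per-entry fields live in the uAPI header and are
 * not shown in this file.
 *
 *	size_t sz = sizeof(struct mshv_user_irq_table) +
 *		    nr * sizeof(struct mshv_user_irq_entry);
 *	struct mshv_user_irq_table *table = calloc(1, sz);
 *
 *	table->nr = nr;
 *	fill table->entries[0..nr-1] per the uAPI header;
 *	ret = ioctl(partition_fd, MSHV_SET_MSI_ROUTING, table);
 */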
1531 
1532 static long
1533 mshv_partition_ioctl_initialize(struct mshv_partition *partition)
1534 {
1535 	long ret;
1536 
1537 	if (partition->pt_initialized)
1538 		return 0;
1539 
1540 	ret = hv_call_initialize_partition(partition->pt_id);
1541 	if (ret)
1542 		goto withdraw_mem;
1543 
1544 	partition->pt_initialized = true;
1545 
1546 	return 0;
1547 
1548 withdraw_mem:
1549 	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
1550 
1551 	return ret;
1552 }
1553 
1554 static long
1555 mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
1556 {
1557 	struct mshv_partition *partition = filp->private_data;
1558 	long ret;
1559 	void __user *uarg = (void __user *)arg;
1560 
1561 	if (mutex_lock_killable(&partition->pt_mutex))
1562 		return -EINTR;
1563 
1564 	switch (ioctl) {
1565 	case MSHV_INITIALIZE_PARTITION:
1566 		ret = mshv_partition_ioctl_initialize(partition);
1567 		break;
1568 	case MSHV_SET_GUEST_MEMORY:
1569 		ret = mshv_partition_ioctl_set_memory(partition, uarg);
1570 		break;
1571 	case MSHV_CREATE_VP:
1572 		ret = mshv_partition_ioctl_create_vp(partition, uarg);
1573 		break;
1574 	case MSHV_IRQFD:
1575 		ret = mshv_partition_ioctl_irqfd(partition, uarg);
1576 		break;
1577 	case MSHV_IOEVENTFD:
1578 		ret = mshv_partition_ioctl_ioeventfd(partition, uarg);
1579 		break;
1580 	case MSHV_SET_MSI_ROUTING:
1581 		ret = mshv_partition_ioctl_set_msi_routing(partition, uarg);
1582 		break;
1583 	case MSHV_GET_GPAP_ACCESS_BITMAP:
1584 		ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition,
1585 								  uarg);
1586 		break;
1587 	case MSHV_ROOT_HVCALL:
1588 		ret = mshv_ioctl_passthru_hvcall(partition, true, uarg);
1589 		break;
1590 	default:
1591 		ret = -ENOTTY;
1592 	}
1593 
1594 	mutex_unlock(&partition->pt_mutex);
1595 	return ret;
1596 }
1597 
1598 static int
1599 disable_vp_dispatch(struct mshv_vp *vp)
1600 {
1601 	int ret;
1602 	struct hv_register_assoc dispatch_suspend = {
1603 		.name = HV_REGISTER_DISPATCH_SUSPEND,
1604 		.value.dispatch_suspend.suspended = 1,
1605 	};
1606 
1607 	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
1608 				    1, &dispatch_suspend);
1609 	if (ret)
1610 		vp_err(vp, "failed to suspend\n");
1611 
1612 	return ret;
1613 }
1614 
1615 static int
1616 get_vp_signaled_count(struct mshv_vp *vp, u64 *count)
1617 {
1618 	int ret;
1619 	struct hv_register_assoc root_signal_count = {
1620 		.name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT,
1621 	};
1622 
1623 	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
1624 				    1, &root_signal_count);
1625 
1626 	if (ret) {
1627 		vp_err(vp, "Failed to get root signal count");
1628 		*count = 0;
1629 		return ret;
1630 	}
1631 
1632 	*count = root_signal_count.value.reg64;
1633 
1634 	return ret;
1635 }
1636 
1637 static void
1638 drain_vp_signals(struct mshv_vp *vp)
1639 {
1640 	u64 hv_signal_count;
1641 	u64 vp_signal_count;
1642 
1643 	get_vp_signaled_count(vp, &hv_signal_count);
1644 
1645 	vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
1646 
1647 	/*
1648 	 * There should be at most 1 outstanding notification, but be extra
1649 	 * careful anyway.
1650 	 */
1651 	while (hv_signal_count != vp_signal_count) {
1652 		WARN_ON(hv_signal_count - vp_signal_count != 1);
1653 
1654 		if (wait_event_interruptible(vp->run.vp_suspend_queue,
1655 					     vp->run.kicked_by_hv == 1))
1656 			break;
1657 		vp->run.kicked_by_hv = 0;
1658 		vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
1659 	}
1660 }
1661 
1662 static void drain_all_vps(const struct mshv_partition *partition)
1663 {
1664 	int i;
1665 	struct mshv_vp *vp;
1666 
1667 	/*
1668 	 * VPs are reachable from ISR. It is safe to not take the partition
1669 	 * lock because nobody else can enter this function and drop the
1670 	 * partition from the list.
1671 	 */
1672 	for (i = 0; i < MSHV_MAX_VPS; i++) {
1673 		vp = partition->pt_vp_array[i];
1674 		if (!vp)
1675 			continue;
1676 		/*
1677 		 * Disable dispatching of the VP in the hypervisor. After this
1678 		 * the hypervisor guarantees it won't generate any signals for
1679 		 * the VP and the hypervisor's VP signal count won't change.
1680 		 */
1681 		disable_vp_dispatch(vp);
1682 		drain_vp_signals(vp);
1683 	}
1684 }
1685 
1686 static void
1687 remove_partition(struct mshv_partition *partition)
1688 {
1689 	spin_lock(&mshv_root.pt_ht_lock);
1690 	hlist_del_rcu(&partition->pt_hnode);
1691 	spin_unlock(&mshv_root.pt_ht_lock);
1692 
1693 	synchronize_rcu();
1694 }
1695 
1696 /*
1697  * Tear down a partition and remove it from the list.
1698  * Partition's refcount must be 0
1699  */
1700 static void destroy_partition(struct mshv_partition *partition)
1701 {
1702 	struct mshv_vp *vp;
1703 	struct mshv_mem_region *region;
1704 	struct hlist_node *n;
1705 	int i;
1706 
1707 	if (refcount_read(&partition->pt_ref_count)) {
1708 		pt_err(partition,
1709 		       "Attempt to destroy partition but refcount > 0\n");
1710 		return;
1711 	}
1712 
1713 	if (partition->pt_initialized) {
1714 		/*
1715 		 * We only need to drain signals for root scheduler. This should be
1716 		 * done before removing the partition from the partition list.
1717 		 */
1718 		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
1719 			drain_all_vps(partition);
1720 
1721 		/* Remove vps */
1722 		for (i = 0; i < MSHV_MAX_VPS; ++i) {
1723 			vp = partition->pt_vp_array[i];
1724 			if (!vp)
1725 				continue;
1726 
1727 			if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
1728 				mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
1729 						    (void **)vp->vp_stats_pages);
1730 
1731 			if (vp->vp_register_page) {
1732 				(void)hv_unmap_vp_state_page(partition->pt_id,
1733 							     vp->vp_index,
1734 							     HV_VP_STATE_PAGE_REGISTERS,
1735 							     virt_to_page(vp->vp_register_page),
1736 							     input_vtl_zero);
1737 				vp->vp_register_page = NULL;
1738 			}
1739 
1740 			(void)hv_unmap_vp_state_page(partition->pt_id,
1741 						     vp->vp_index,
1742 						     HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
1743 						     virt_to_page(vp->vp_intercept_msg_page),
1744 						     input_vtl_zero);
1745 			vp->vp_intercept_msg_page = NULL;
1746 
1747 			if (vp->vp_ghcb_page) {
1748 				(void)hv_unmap_vp_state_page(partition->pt_id,
1749 							     vp->vp_index,
1750 							     HV_VP_STATE_PAGE_GHCB,
1751 							     virt_to_page(vp->vp_ghcb_page),
1752 							     input_vtl_normal);
1753 				vp->vp_ghcb_page = NULL;
1754 			}
1755 
1756 			kfree(vp);
1757 
1758 			partition->pt_vp_array[i] = NULL;
1759 		}
1760 
1761 		/* Deallocates and unmaps everything including vcpus, GPA mappings etc */
1762 		hv_call_finalize_partition(partition->pt_id);
1763 
1764 		partition->pt_initialized = false;
1765 	}
1766 
1767 	remove_partition(partition);
1768 
1769 	hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
1770 				  hnode) {
1771 		hlist_del(&region->hnode);
1772 		mshv_region_put(region);
1773 	}
1774 
1775 	/* Withdraw and free all pages we deposited */
1776 	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
1777 	hv_call_delete_partition(partition->pt_id);
1778 
1779 	mshv_free_routing_table(partition);
1780 	kfree(partition);
1781 }
1782 
1783 struct
1784 mshv_partition *mshv_partition_get(struct mshv_partition *partition)
1785 {
1786 	if (refcount_inc_not_zero(&partition->pt_ref_count))
1787 		return partition;
1788 	return NULL;
1789 }
1790 
1791 struct
1792 mshv_partition *mshv_partition_find(u64 partition_id)
1793 	__must_hold(RCU)
1794 {
1795 	struct mshv_partition *p;
1796 
1797 	hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode,
1798 				   partition_id)
1799 		if (p->pt_id == partition_id)
1800 			return p;
1801 
1802 	return NULL;
1803 }
1804 
1805 void
1806 mshv_partition_put(struct mshv_partition *partition)
1807 {
1808 	if (refcount_dec_and_test(&partition->pt_ref_count))
1809 		destroy_partition(partition);
1810 }
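
/*
 * Illustrative usage sketch (not a verbatim call site from this file): the
 * find/get/put helpers above are meant to be combined under RCU, roughly:
 *
 *	rcu_read_lock();
 *	p = mshv_partition_find(pt_id);
 *	if (p)
 *		p = mshv_partition_get(p);
 *	rcu_read_unlock();
 *
 * mshv_partition_get() returns NULL if the refcount has already dropped to
 * zero; otherwise the caller owns a reference and releases it with
 * mshv_partition_put(), whose final put runs destroy_partition().
 */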
1811 
1812 static int
1813 mshv_partition_release(struct inode *inode, struct file *filp)
1814 {
1815 	struct mshv_partition *partition = filp->private_data;
1816 
1817 	mshv_eventfd_release(partition);
1818 
1819 	cleanup_srcu_struct(&partition->pt_irq_srcu);
1820 
1821 	mshv_partition_put(partition);
1822 
1823 	return 0;
1824 }
1825 
1826 static int
1827 add_partition(struct mshv_partition *partition)
1828 {
1829 	spin_lock(&mshv_root.pt_ht_lock);
1830 
1831 	hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode,
1832 		     partition->pt_id);
1833 
1834 	spin_unlock(&mshv_root.pt_ht_lock);
1835 
1836 	return 0;
1837 }
1838 
1839 static_assert(MSHV_NUM_CPU_FEATURES_BANKS ==
1840 	      HV_PARTITION_PROCESSOR_FEATURES_BANKS);
1841 
1842 static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags,
1843 					struct hv_partition_creation_properties *cr_props,
1844 					union hv_partition_isolation_properties *isol_props)
1845 {
1846 	int i;
1847 	struct mshv_create_partition_v2 args;
1848 	union hv_partition_processor_features *disabled_procs;
1849 	union hv_partition_processor_xsave_features *disabled_xsave;
1850 
1851 	/* First, copy v1 struct in case user is on previous versions */
1852 	if (copy_from_user(&args, user_arg,
1853 			   sizeof(struct mshv_create_partition)))
1854 		return -EFAULT;
1855 
1856 	if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
1857 	    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
1858 		return -EINVAL;
1859 
1860 	disabled_procs = &cr_props->disabled_processor_features;
1861 	disabled_xsave = &cr_props->disabled_processor_xsave_features;
1862 
1863 	/* Check if user provided newer struct with feature fields */
1864 	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) {
1865 		if (copy_from_user(&args, user_arg, sizeof(args)))
1866 			return -EFAULT;
1867 
1868 		/* Re-validate v1 fields after second copy_from_user() */
1869 		if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
1870 		    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
1871 			return -EINVAL;
1872 
1873 		if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS ||
1874 		    mshv_field_nonzero(args, pt_rsvd) ||
1875 		    mshv_field_nonzero(args, pt_rsvd1))
1876 			return -EINVAL;
1877 
1878 		/*
1879 		 * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never
1880 		 * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS
1881 		 * (i.e. 2).
1882 		 *
1883 		 * Further banks (index >= 2) will be modifiable as 'early'
1884 		 * properties via the set partition property hypercall.
1885 		 */
1886 		for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
1887 			disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i];
1888 
1889 #if IS_ENABLED(CONFIG_X86_64)
1890 		disabled_xsave->as_uint64 = args.pt_disabled_xsave;
1891 #else
1892 		/*
1893 		 * In practice this field is ignored on arm64, but safer to
1894 		 * zero it in case it is ever used.
1895 		 */
1896 		disabled_xsave->as_uint64 = 0;
1897 
1898 		if (mshv_field_nonzero(args, pt_rsvd2))
1899 			return -EINVAL;
1900 #endif
1901 	} else {
1902 		/*
1903 		 * v1 behavior: try to enable everything. The hypervisor will
1904 		 * disable features that are not supported. The banks can be
1905 		 * queried via the get partition property hypercall.
1906 		 */
1907 		for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
1908 			disabled_procs->as_uint64[i] = 0;
1909 
1910 		disabled_xsave->as_uint64 = 0;
1911 	}
1912 
1913 	/* Only support EXO partitions */
1914 	*pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
1915 		    HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;
1916 
1917 	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC))
1918 		*pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
1919 	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC))
1920 		*pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
1921 	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES))
1922 		*pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
1923 
1924 	isol_props->as_uint64 = 0;
1925 
1926 	switch (args.pt_isolation) {
1927 	case MSHV_PT_ISOLATION_NONE:
1928 		isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE;
1929 		break;
1930 	}
1931 
1932 	return 0;
1933 }
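
/*
 * Illustrative sketch of the userspace side (an assumption, not code from
 * this file): a VMM opting in to the v2 layout sets
 * MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES and supplies the banks of processor
 * features it wants disabled, roughly:
 *
 *	struct mshv_create_partition_v2 args = {
 *		.pt_flags = BIT_ULL(MSHV_PT_BIT_LAPIC) |
 *			    BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES),
 *		.pt_isolation = MSHV_PT_ISOLATION_NONE,
 *		.pt_num_cpu_fbanks = MSHV_NUM_CPU_FEATURES_BANKS,
 *		.pt_cpu_fbanks = { 0 },
 *	};
 *	int pt_fd = ioctl(dev_fd, MSHV_CREATE_PARTITION, &args);
 *
 * where pt_cpu_fbanks carries the features to disable and dev_fd is an open
 * handle to the mshv device. Passing the original struct
 * mshv_create_partition (without that bit set) keeps the v1 behaviour: no
 * features are explicitly disabled.
 */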
1934 
1935 static long
1936 mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
1937 {
1938 	u64 creation_flags;
1939 	struct hv_partition_creation_properties creation_properties;
1940 	union hv_partition_isolation_properties isolation_properties;
1941 	struct mshv_partition *partition;
1942 	long ret;
1943 
1944 	ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags,
1945 					  &creation_properties,
1946 					  &isolation_properties);
1947 	if (ret)
1948 		return ret;
1949 
1950 	partition = kzalloc(sizeof(*partition), GFP_KERNEL);
1951 	if (!partition)
1952 		return -ENOMEM;
1953 
1954 	partition->pt_module_dev = module_dev;
1955 	partition->isolation_type = isolation_properties.isolation_type;
1956 
1957 	refcount_set(&partition->pt_ref_count, 1);
1958 
1959 	mutex_init(&partition->pt_mutex);
1960 
1961 	mutex_init(&partition->pt_irq_lock);
1962 
1963 	init_completion(&partition->async_hypercall);
1964 
1965 	INIT_HLIST_HEAD(&partition->irq_ack_notifier_list);
1966 
1967 	INIT_HLIST_HEAD(&partition->pt_devices);
1968 
1969 	spin_lock_init(&partition->pt_mem_regions_lock);
1970 	INIT_HLIST_HEAD(&partition->pt_mem_regions);
1971 
1972 	mshv_eventfd_init(partition);
1973 
1974 	ret = init_srcu_struct(&partition->pt_irq_srcu);
1975 	if (ret)
1976 		goto free_partition;
1977 
1978 	ret = hv_call_create_partition(creation_flags,
1979 				       creation_properties,
1980 				       isolation_properties,
1981 				       &partition->pt_id);
1982 	if (ret)
1983 		goto cleanup_irq_srcu;
1984 
1985 	ret = add_partition(partition);
1986 	if (ret)
1987 		goto delete_partition;
1988 
1989 	ret = mshv_init_async_handler(partition);
1990 	if (!ret) {
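		/*
		 * Hand the partition to userspace as an anonymous-inode fd.
		 * The reference taken at creation is then dropped by
		 * mshv_partition_release() when the fd is closed, which
		 * frees the partition through mshv_partition_put().
		 */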
1991 		ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition",
1992 							   &mshv_partition_fops,
1993 							   partition, O_RDWR));
1994 		if (ret >= 0)
1995 			return ret;
1996 	}
1997 	remove_partition(partition);
1998 delete_partition:
1999 	hv_call_delete_partition(partition->pt_id);
2000 cleanup_irq_srcu:
2001 	cleanup_srcu_struct(&partition->pt_irq_srcu);
2002 free_partition:
2003 	kfree(partition);
2004 
2005 	return ret;
2006 }
2007 
2008 static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
2009 			   unsigned long arg)
2010 {
2011 	struct miscdevice *misc = filp->private_data;
2012 
2013 	switch (ioctl) {
2014 	case MSHV_CREATE_PARTITION:
2015 		return mshv_ioctl_create_partition((void __user *)arg,
2016 						misc->this_device);
2017 	case MSHV_ROOT_HVCALL:
2018 		return mshv_ioctl_passthru_hvcall(NULL, false,
2019 					(void __user *)arg);
2020 	}
2021 
2022 	return -ENOTTY;
2023 }
2024 
2025 static int
2026 mshv_dev_open(struct inode *inode, struct file *filp)
2027 {
2028 	return 0;
2029 }
2030 
2031 static int
2032 mshv_dev_release(struct inode *inode, struct file *filp)
2033 {
2034 	return 0;
2035 }
2036 
2037 static int mshv_cpuhp_online;
2038 static int mshv_root_sched_online;
2039 
2040 static const char *scheduler_type_to_string(enum hv_scheduler_type type)
2041 {
2042 	switch (type) {
2043 	case HV_SCHEDULER_TYPE_LP:
2044 		return "classic scheduler without SMT";
2045 	case HV_SCHEDULER_TYPE_LP_SMT:
2046 		return "classic scheduler with SMT";
2047 	case HV_SCHEDULER_TYPE_CORE_SMT:
2048 		return "core scheduler";
2049 	case HV_SCHEDULER_TYPE_ROOT:
2050 		return "root scheduler";
2051 	default:
2052 		return "unknown scheduler";
2053 	};
2054 }
2055 
2056 /* TODO move this to hv_common.c when needed outside */
2057 static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out)
2058 {
2059 	struct hv_input_get_system_property *input;
2060 	struct hv_output_get_system_property *output;
2061 	unsigned long flags;
2062 	u64 status;
2063 
2064 	local_irq_save(flags);
2065 	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
2066 	output = *this_cpu_ptr(hyperv_pcpu_output_arg);
2067 
2068 	memset(input, 0, sizeof(*input));
2069 	memset(output, 0, sizeof(*output));
2070 	input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE;
2071 
2072 	status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
2073 	if (!hv_result_success(status)) {
2074 		local_irq_restore(flags);
2075 		pr_err("%s: %s\n", __func__, hv_result_to_string(status));
2076 		return hv_result_to_errno(status);
2077 	}
2078 
2079 	*out = output->scheduler_type;
2080 	local_irq_restore(flags);
2081 
2082 	return 0;
2083 }
2084 
2085 /* Retrieve and stash the supported scheduler type */
2086 static int __init mshv_retrieve_scheduler_type(struct device *dev)
2087 {
2088 	int ret = 0;
2089 
2090 	if (hv_l1vh_partition())
2091 		hv_scheduler_type = HV_SCHEDULER_TYPE_CORE_SMT;
2092 	else
2093 		ret = hv_retrieve_scheduler_type(&hv_scheduler_type);
2094 
2095 	if (ret)
2096 		return ret;
2097 
2098 	dev_info(dev, "Hypervisor using %s\n",
2099 		 scheduler_type_to_string(hv_scheduler_type));
2100 
2101 	switch (hv_scheduler_type) {
2102 	case HV_SCHEDULER_TYPE_CORE_SMT:
2103 	case HV_SCHEDULER_TYPE_LP_SMT:
2104 	case HV_SCHEDULER_TYPE_ROOT:
2105 	case HV_SCHEDULER_TYPE_LP:
2106 		/* Supported scheduler, nothing to do */
2107 		break;
2108 	default:
2109 		dev_err(dev, "unsupported scheduler 0x%x, bailing.\n",
2110 			hv_scheduler_type);
2111 		return -EOPNOTSUPP;
2112 	}
2113 
2114 	return 0;
2115 }
2116 
2117 static int mshv_root_scheduler_init(unsigned int cpu)
2118 {
2119 	void **inputarg, **outputarg, *p;
2120 
2121 	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
2122 	outputarg = (void **)this_cpu_ptr(root_scheduler_output);
2123 
2124 	/* Allocate two consecutive pages. One for input, one for output. */
2125 	p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL);
2126 	if (!p)
2127 		return -ENOMEM;
2128 
2129 	*inputarg = p;
2130 	*outputarg = (char *)p + HV_HYP_PAGE_SIZE;
2131 
2132 	return 0;
2133 }
2134 
2135 static int mshv_root_scheduler_cleanup(unsigned int cpu)
2136 {
2137 	void *p, **inputarg, **outputarg;
2138 
2139 	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
2140 	outputarg = (void **)this_cpu_ptr(root_scheduler_output);
2141 
2142 	p = *inputarg;
2143 
2144 	*inputarg = NULL;
2145 	*outputarg = NULL;
2146 
2147 	kfree(p);
2148 
2149 	return 0;
2150 }
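
/*
 * Illustrative sketch (an assumption about call sites outside this file):
 * hypercall paths running under the root scheduler are expected to use
 * these per-cpu buffers the same way hv_retrieve_scheduler_type() above
 * uses the generic hyperv_pcpu_input_arg/hyperv_pcpu_output_arg pages:
 *
 *	local_irq_save(flags);
 *	input = *this_cpu_ptr(root_scheduler_input);
 *	output = *this_cpu_ptr(root_scheduler_output);
 *	... fill the input page, issue hv_do_hypercall(), read the output ...
 *	local_irq_restore(flags);
 */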
2151 
2152 /* Must be called after retrieving the scheduler type */
2153 static int
2154 root_scheduler_init(struct device *dev)
2155 {
2156 	int ret;
2157 
2158 	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
2159 		return 0;
2160 
2161 	root_scheduler_input = alloc_percpu(void *);
2162 	root_scheduler_output = alloc_percpu(void *);
2163 
2164 	if (!root_scheduler_input || !root_scheduler_output) {
2165 		dev_err(dev, "Failed to allocate root scheduler buffers\n");
2166 		ret = -ENOMEM;
2167 		goto out;
2168 	}
2169 
2170 	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched",
2171 				mshv_root_scheduler_init,
2172 				mshv_root_scheduler_cleanup);
2173 
2174 	if (ret < 0) {
2175 		dev_err(dev, "Failed to setup root scheduler state: %i\n", ret);
2176 		goto out;
2177 	}
2178 
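	/*
	 * With CPUHP_AP_ONLINE_DYN, a non-negative return value is the
	 * dynamically allocated hotplug state; keep it so that
	 * root_scheduler_deinit() can pass it to cpuhp_remove_state().
	 */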
2179 	mshv_root_sched_online = ret;
2180 
2181 	return 0;
2182 
2183 out:
2184 	free_percpu(root_scheduler_input);
2185 	free_percpu(root_scheduler_output);
2186 	return ret;
2187 }
2188 
2189 static void
2190 root_scheduler_deinit(void)
2191 {
2192 	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
2193 		return;
2194 
2195 	cpuhp_remove_state(mshv_root_sched_online);
2196 	free_percpu(root_scheduler_input);
2197 	free_percpu(root_scheduler_output);
2198 }
2199 
2200 static int mshv_reboot_notify(struct notifier_block *nb,
2201 			      unsigned long code, void *unused)
2202 {
2203 	cpuhp_remove_state(mshv_cpuhp_online);
2204 	return 0;
2205 }
2206 
2207 struct notifier_block mshv_reboot_nb = {
2208 	.notifier_call = mshv_reboot_notify,
2209 };
2210 
2211 static void mshv_root_partition_exit(void)
2212 {
2213 	unregister_reboot_notifier(&mshv_reboot_nb);
2214 	root_scheduler_deinit();
2215 }
2216 
2217 static int __init mshv_root_partition_init(struct device *dev)
2218 {
2219 	int err;
2220 
2221 	err = root_scheduler_init(dev);
2222 	if (err)
2223 		return err;
2224 
2225 	err = register_reboot_notifier(&mshv_reboot_nb);
2226 	if (err)
2227 		goto root_sched_deinit;
2228 
2229 	return 0;
2230 
2231 root_sched_deinit:
2232 	root_scheduler_deinit();
2233 	return err;
2234 }
2235 
2236 static void mshv_init_vmm_caps(struct device *dev)
2237 {
2238 	/*
2239 	 * This can only fail here if HVCALL_GET_PARTITION_PROPERTY_EX or
2240 	 * HV_PARTITION_PROPERTY_VMM_CAPABILITIES are not supported. In that
2241 	 * case it's valid to proceed as if all vmm_caps are disabled (zero).
2242 	 */
2243 	if (hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
2244 					      HV_PARTITION_PROPERTY_VMM_CAPABILITIES,
2245 					      0, &mshv_root.vmm_caps,
2246 					      sizeof(mshv_root.vmm_caps)))
2247 		dev_warn(dev, "Unable to get VMM capabilities\n");
2248 
2249 	dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]);
2250 }
2251 
2252 static int __init mshv_parent_partition_init(void)
2253 {
2254 	int ret;
2255 	struct device *dev;
2256 	union hv_hypervisor_version_info version_info;
2257 
2258 	if (!hv_parent_partition() || is_kdump_kernel())
2259 		return -ENODEV;
2260 
2261 	if (hv_get_hypervisor_version(&version_info))
2262 		return -ENODEV;
2263 
2264 	ret = misc_register(&mshv_dev);
2265 	if (ret)
2266 		return ret;
2267 
2268 	dev = mshv_dev.this_device;
2269 
2270 	if (version_info.build_number < MSHV_HV_MIN_VERSION ||
2271 	    version_info.build_number > MSHV_HV_MAX_VERSION) {
2272 		dev_err(dev, "Running on unvalidated Hyper-V version\n");
2273 		dev_err(dev, "Versions: current: %u  min: %u  max: %u\n",
2274 			version_info.build_number, MSHV_HV_MIN_VERSION,
2275 			MSHV_HV_MAX_VERSION);
2276 	}
2277 
2278 	mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages);
2279 	if (!mshv_root.synic_pages) {
2280 		dev_err(dev, "Failed to allocate percpu synic page\n");
2281 		ret = -ENOMEM;
2282 		goto device_deregister;
2283 	}
2284 
2285 	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
2286 				mshv_synic_init,
2287 				mshv_synic_cleanup);
2288 	if (ret < 0) {
2289 		dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret);
2290 		goto free_synic_pages;
2291 	}
2292 
2293 	mshv_cpuhp_online = ret;
2294 
2295 	ret = mshv_retrieve_scheduler_type(dev);
2296 	if (ret)
2297 		goto remove_cpu_state;
2298 
2299 	if (hv_root_partition())
2300 		ret = mshv_root_partition_init(dev);
2301 	if (ret)
2302 		goto remove_cpu_state;
2303 
2304 	mshv_init_vmm_caps(dev);
2305 
2306 	ret = mshv_irqfd_wq_init();
2307 	if (ret)
2308 		goto exit_partition;
2309 
2310 	spin_lock_init(&mshv_root.pt_ht_lock);
2311 	hash_init(mshv_root.pt_htable);
2312 
2313 	hv_setup_mshv_handler(mshv_isr);
2314 
2315 	return 0;
2316 
2317 exit_partition:
2318 	if (hv_root_partition())
2319 		mshv_root_partition_exit();
2320 remove_cpu_state:
2321 	cpuhp_remove_state(mshv_cpuhp_online);
2322 free_synic_pages:
2323 	free_percpu(mshv_root.synic_pages);
2324 device_deregister:
2325 	misc_deregister(&mshv_dev);
2326 	return ret;
2327 }
2328 
2329 static void __exit mshv_parent_partition_exit(void)
2330 {
2331 	hv_setup_mshv_handler(NULL);
2332 	mshv_port_table_fini();
2333 	misc_deregister(&mshv_dev);
2334 	mshv_irqfd_wq_cleanup();
2335 	if (hv_root_partition())
2336 		mshv_root_partition_exit();
2337 	cpuhp_remove_state(mshv_cpuhp_online);
2338 	free_percpu(mshv_root.synic_pages);
2339 }
2340 
2341 module_init(mshv_parent_partition_init);
2342 module_exit(mshv_parent_partition_exit);
2343