xref: /linux/drivers/hv/mshv_root_main.c (revision bf4afc53b77aeaa48b5409da5c8da6bb4eff7f43)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2024, Microsoft Corporation.
4  *
5  * The main part of the mshv_root module, providing APIs to create
6  * and manage guest partitions.
7  *
8  * Authors: Microsoft Linux virtualization team
9  */
10 
11 #include <linux/entry-virt.h>
12 #include <linux/kernel.h>
13 #include <linux/module.h>
14 #include <linux/fs.h>
15 #include <linux/miscdevice.h>
16 #include <linux/slab.h>
17 #include <linux/file.h>
18 #include <linux/anon_inodes.h>
19 #include <linux/mm.h>
20 #include <linux/io.h>
21 #include <linux/cpuhotplug.h>
22 #include <linux/random.h>
23 #include <asm/mshyperv.h>
24 #include <linux/hyperv.h>
25 #include <linux/notifier.h>
26 #include <linux/reboot.h>
27 #include <linux/kexec.h>
28 #include <linux/page-flags.h>
29 #include <linux/crash_dump.h>
30 #include <linux/panic_notifier.h>
31 #include <linux/vmalloc.h>
32 #include <linux/rseq.h>
33 
34 #include "mshv_eventfd.h"
35 #include "mshv.h"
36 #include "mshv_root.h"
37 
38 MODULE_AUTHOR("Microsoft");
39 MODULE_LICENSE("GPL");
40 MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");
41 
/* HV_THREAD_COUNTER */
/*
 * Index into the per-VP stats page data[] at which the hypervisor reports
 * whether the root dispatch thread is blocked. The counter layout differs
 * per architecture, hence the distinct indices.
 */
#if defined(CONFIG_X86_64)
#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 202
#elif defined(CONFIG_ARM64)
#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 95
#endif
48 
/* Module-wide root partition state (partition list, synchronization). */
struct mshv_root mshv_root;

/* Scheduler type in use by the hypervisor (root vs. hypervisor scheduler). */
enum hv_scheduler_type hv_scheduler_type;

/* Once we implement the fast extended hypercall ABI they can go away. */
/* Per-CPU pre-sized hypercall input/output pages used by VP dispatch. */
static void * __percpu *root_scheduler_input;
static void * __percpu *root_scheduler_output;
56 
57 static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
58 static int mshv_dev_open(struct inode *inode, struct file *filp);
59 static int mshv_dev_release(struct inode *inode, struct file *filp);
60 static int mshv_vp_release(struct inode *inode, struct file *filp);
61 static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
62 static int mshv_partition_release(struct inode *inode, struct file *filp);
63 static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
64 static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma);
65 static vm_fault_t mshv_vp_fault(struct vm_fault *vmf);
66 static int mshv_init_async_handler(struct mshv_partition *partition);
67 static void mshv_async_hvcall_handler(void *data, u64 *status);
68 
/* All-zero input VTL: implicitly targets VTL0. */
static const union hv_input_vtl input_vtl_zero;
/* Explicitly target the normal (non-secure) VTL. */
static const union hv_input_vtl input_vtl_normal = {
	.target_vtl = HV_NORMAL_VTL,
	.use_target_vtl = 1,
};

/* Fault handler backing mmap() of the per-VP shared state pages. */
static const struct vm_operations_struct mshv_vp_vm_ops = {
	.fault = mshv_vp_fault,
};
78 
/* File operations for a VP fd (created via the partition fd). */
static const struct file_operations mshv_vp_fops = {
	.owner = THIS_MODULE,
	.release = mshv_vp_release,
	.unlocked_ioctl = mshv_vp_ioctl,
	.llseek = noop_llseek,
	.mmap = mshv_vp_mmap,
};

/* File operations for a partition fd (created via /dev/mshv). */
static const struct file_operations mshv_partition_fops = {
	.owner = THIS_MODULE,
	.release = mshv_partition_release,
	.unlocked_ioctl = mshv_partition_ioctl,
	.llseek = noop_llseek,
};

/* File operations for the top-level /dev/mshv device node. */
static const struct file_operations mshv_dev_fops = {
	.owner = THIS_MODULE,
	.open = mshv_dev_open,
	.release = mshv_dev_release,
	.unlocked_ioctl = mshv_dev_ioctl,
	.llseek = noop_llseek,
};

/* /dev/mshv misc device; 0600 restricts access to root by default. */
static struct miscdevice mshv_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "mshv",
	.fops = &mshv_dev_fops,
	.mode = 0600,
};
108 
/*
 * Only allow hypercalls that have a u64 partition id as the first member of
 * the input structure.
 * These are sorted by value.
 *
 * mshv_ioctl_passthru_hvcall() relies on this layout: it overwrites the
 * first u64 of the input page with the partition id the fd is bound to.
 */
static u16 mshv_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_GET_PARTITION_PROPERTY_EX,
	HVCALL_SET_PARTITION_PROPERTY,
	HVCALL_INSTALL_INTERCEPT,
	HVCALL_GET_VP_REGISTERS,
	HVCALL_SET_VP_REGISTERS,
	HVCALL_TRANSLATE_VIRTUAL_ADDRESS,
	HVCALL_CLEAR_VIRTUAL_INTERRUPT,
	HVCALL_SCRUB_PARTITION,
	HVCALL_REGISTER_INTERCEPT_RESULT,
	HVCALL_ASSERT_VIRTUAL_INTERRUPT,
	HVCALL_GET_GPA_PAGES_ACCESS_STATES,
	HVCALL_SIGNAL_EVENT_DIRECT,
	HVCALL_POST_MESSAGE_DIRECT,
	HVCALL_GET_VP_CPUID_VALUES,
};

/*
 * Only allow hypercalls that are safe to be called by the VMM with the host
 * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a
 * hypercall cannot be misused by the VMM before adding it to this list.
 */
static u16 mshv_self_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_GET_PARTITION_PROPERTY_EX,
};
141 
mshv_hvcall_is_async(u16 code)142 static bool mshv_hvcall_is_async(u16 code)
143 {
144 	switch (code) {
145 	case HVCALL_SET_PARTITION_PROPERTY:
146 		return true;
147 	default:
148 		break;
149 	}
150 	return false;
151 }
152 
mshv_passthru_hvcall_allowed(u16 code,u64 pt_id)153 static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id)
154 {
155 	int i;
156 	int n = ARRAY_SIZE(mshv_passthru_hvcalls);
157 	u16 *allowed_hvcalls = mshv_passthru_hvcalls;
158 
159 	if (pt_id == HV_PARTITION_ID_SELF) {
160 		n = ARRAY_SIZE(mshv_self_passthru_hvcalls);
161 		allowed_hvcalls = mshv_self_passthru_hvcalls;
162 	}
163 
164 	for (i = 0; i < n; ++i)
165 		if (allowed_hvcalls[i] == code)
166 			return true;
167 
168 	return false;
169 }
170 
/*
 * Execute an allow-listed hypercall on behalf of the VMM.
 *
 * @partition: target partition, or NULL to target the host
 *             (HV_PARTITION_ID_SELF).
 * @partition_locked: true when the caller holds the partition lock
 *                    (required for async hypercalls).
 * @user_args: userspace struct mshv_root_hvcall describing code, reps and
 *             input/output buffers (each at most one hyp page).
 *
 * Returns 0 on success or a negative errno; the hypervisor status and the
 * completed rep count are copied back into @user_args.
 */
static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
				      bool partition_locked,
				      void __user *user_args)
{
	u64 status;
	int ret = 0;
	bool is_async;
	struct mshv_root_hvcall args;
	struct page *page;
	unsigned int pages_order;
	void *input_pg = NULL;
	void *output_pg = NULL;
	u16 reps_completed;
	u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	/*
	 * in_sz must cover at least the leading u64 partition id that gets
	 * overwritten below, and neither buffer may exceed one hyp page.
	 */
	if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) ||
	    mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE)
		return -EINVAL;

	if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
		return -EINVAL;

	if (!mshv_passthru_hvcall_allowed(args.code, pt_id))
		return -EINVAL;

	is_async = mshv_hvcall_is_async(args.code);
	if (is_async) {
		/* async hypercalls can only be called from partition fd */
		if (!partition || !partition_locked)
			return -EINVAL;
		ret = mshv_init_async_handler(partition);
		if (ret)
			return ret;
	}

	/* One page for input only; two contiguous pages if output is used. */
	pages_order = args.out_ptr ? 1 : 0;
	page = alloc_pages(GFP_KERNEL, pages_order);
	if (!page)
		return -ENOMEM;
	input_pg = page_address(page);

	if (args.out_ptr)
		output_pg = (char *)input_pg + PAGE_SIZE;
	else
		output_pg = NULL;

	if (copy_from_user(input_pg, (void __user *)args.in_ptr,
			   args.in_sz)) {
		ret = -EFAULT;
		goto free_pages_out;
	}

	/*
	 * NOTE: This only works because all the allowed hypercalls' input
	 * structs begin with a u64 partition_id field.
	 */
	*(u64 *)input_pg = pt_id;

	reps_completed = 0;
	do {
		if (args.reps) {
			/* Rep hypercall: resume from the last completed rep. */
			status = hv_do_rep_hypercall_ex(args.code, args.reps,
							0, reps_completed,
							input_pg, output_pg);
			reps_completed = hv_repcomp(status);
		} else {
			status = hv_do_hypercall(args.code, input_pg, output_pg);
		}

		if (hv_result(status) == HV_STATUS_CALL_PENDING) {
			if (is_async) {
				/* Wait for completion; updates status. */
				mshv_async_hvcall_handler(partition, &status);
			} else { /* Paranoia check. This shouldn't happen! */
				ret = -EBADFD;
				goto free_pages_out;
			}
		}

		if (hv_result_success(status))
			break;

		/*
		 * On insufficient-memory status, deposit pages into the
		 * partition and retry; any other failure ends the loop.
		 */
		if (!hv_result_needs_memory(status))
			ret = hv_result_to_errno(status);
		else
			ret = hv_deposit_memory(pt_id, status);
	} while (!ret);

	/* Report final hypervisor status and rep progress back to the VMM. */
	args.status = hv_result(status);
	args.reps = reps_completed;
	if (copy_to_user(user_args, &args, sizeof(args)))
		ret = -EFAULT;

	if (!ret && output_pg &&
	    copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
		ret = -EFAULT;

free_pages_out:
	free_pages((unsigned long)input_pg, pages_order);

	return ret;
}
275 
is_ghcb_mapping_available(void)276 static inline bool is_ghcb_mapping_available(void)
277 {
278 #if IS_ENABLED(CONFIG_X86_64)
279 	return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE;
280 #else
281 	return 0;
282 #endif
283 }
284 
/* Read @count VP registers via the hypervisor, always targeting VTL0. */
static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_get_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}
291 
/* Write @count VP registers via the hypervisor, always targeting VTL0. */
static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_set_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}
298 
299 /*
300  * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by
301  * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend,
302  * done by the hypervisor.
303  * "Intercept" suspend leads to asynchronous message delivery to dom0 which
304  * should be awaited to keep the VP loop consistent (i.e. no message pending
305  * upon VP resume).
306  * VP intercept suspend can't be done when the VP is explicitly suspended
307  * already, and thus can be only two possible race scenarios:
308  *   1. implicit suspend bit set -> explicit suspend bit set -> message sent
309  *   2. implicit suspend bit set -> message sent -> explicit suspend bit set
310  * Checking for implicit suspend bit set after explicit suspend request has
311  * succeeded in either case allows us to reliably identify, if there is a
312  * message to receive and deliver to VMM.
313  */
static int
mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
{
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND
	};
	struct hv_register_assoc intercept_suspend = {
		.name = HV_REGISTER_INTERCEPT_SUSPEND
	};
	union hv_explicit_suspend_register *es =
		&explicit_suspend.value.explicit_suspend;
	union hv_intercept_suspend_register *is =
		&intercept_suspend.value.intercept_suspend;
	int ret;

	/* Request explicit suspend first; see the race discussion above. */
	es->suspended = 1;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &explicit_suspend);
	if (ret) {
		vp_err(vp, "Failed to explicitly suspend vCPU\n");
		return ret;
	}

	/*
	 * Only after the explicit suspend succeeded does a set intercept
	 * suspend bit reliably indicate a message in flight for the VMM.
	 */
	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &intercept_suspend);
	if (ret) {
		vp_err(vp, "Failed to get intercept suspend state\n");
		return ret;
	}

	*message_in_flight = is->suspended;

	return 0;
}
349 
350 /*
351  * This function is used when VPs are scheduled by the hypervisor's
352  * scheduler.
353  *
354  * Caller has to make sure the registers contain cleared
355  * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers
356  * exactly in this order (the hypervisor clears them sequentially) to avoid
357  * potential invalid clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND
358  * after VP is released from HV_REGISTER_EXPLICIT_SUSPEND in case of the
359  * opposite order.
360  */
static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
{
	long ret;
	/*
	 * Order matters: intercept suspend must be cleared before explicit
	 * suspend (see the comment above this function).
	 */
	struct hv_register_assoc suspend_regs[2] = {
			{ .name = HV_REGISTER_INTERCEPT_SUSPEND },
			{ .name = HV_REGISTER_EXPLICIT_SUSPEND }
	};
	size_t count = ARRAY_SIZE(suspend_regs);

	/* Resume VP execution */
	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    count, suspend_regs);
	if (ret) {
		vp_err(vp, "Failed to resume vp execution. %lx\n", ret);
		return ret;
	}

	/* Sleep until the hypervisor kicks us (intercept arrived). */
	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       vp->run.kicked_by_hv == 1);
	if (ret) {
		bool message_in_flight;

		/*
		 * Otherwise the waiting was interrupted by a signal: suspend
		 * the vCPU explicitly and copy message in flight (if any).
		 */
		ret = mshv_suspend_vp(vp, &message_in_flight);
		if (ret)
			return ret;

		/* Return if no message in flight */
		if (!message_in_flight)
			return -EINTR;

		/* Wait for the message in flight. */
		wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1);
	}

	/*
	 * Reset the flag to make the wait_event call above work
	 * next time.
	 */
	vp->run.kicked_by_hv = 0;

	return 0;
}
407 
/*
 * Issue HVCALL_DISPATCH_VP for @vp with the given dispatch @flags, storing
 * the hypervisor's dispatch output in @res. Returns 0 or a negative errno
 * mapped from the hypercall status.
 */
static int
mshv_vp_dispatch(struct mshv_vp *vp, u32 flags,
		 struct hv_output_dispatch_vp *res)
{
	struct hv_input_dispatch_vp *input;
	struct hv_output_dispatch_vp *output;
	u64 status;

	/*
	 * The hypercall pages are per-CPU; keep preemption off from
	 * selecting them until the output has been copied out.
	 */
	preempt_disable();
	input = *this_cpu_ptr(root_scheduler_input);
	output = *this_cpu_ptr(root_scheduler_output);

	memset(input, 0, sizeof(*input));
	memset(output, 0, sizeof(*output));

	input->partition_id = vp->vp_partition->pt_id;
	input->vp_index = vp->vp_index;
	input->time_slice = 0; /* Run forever until something happens */
	input->spec_ctrl = 0; /* TODO: set sensible flags */
	input->flags = flags;

	/* Flag the VP as dispatched for the duration of the hypercall. */
	vp->run.flags.root_sched_dispatched = 1;
	status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output);
	vp->run.flags.root_sched_dispatched = 0;

	*res = *output;
	preempt_enable();

	if (!hv_result_success(status))
		vp_err(vp, "%s: status %s\n", __func__,
		       hv_result_to_string(status));

	return hv_result_to_errno(status);
}
442 
443 static int
mshv_vp_clear_explicit_suspend(struct mshv_vp * vp)444 mshv_vp_clear_explicit_suspend(struct mshv_vp *vp)
445 {
446 	struct hv_register_assoc explicit_suspend = {
447 		.name = HV_REGISTER_EXPLICIT_SUSPEND,
448 		.value.explicit_suspend.suspended = 0,
449 	};
450 	int ret;
451 
452 	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
453 				    1, &explicit_suspend);
454 
455 	if (ret)
456 		vp_err(vp, "Failed to unsuspend\n");
457 
458 	return ret;
459 }
460 
#if IS_ENABLED(CONFIG_X86_64)
/*
 * Non-zero iff the VP's register page advertises pending interrupt
 * vectors; 0 if the register page was never mapped.
 */
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	if (!vp->vp_register_page)
		return 0;
	return vp->vp_register_page->interrupt_vectors.as_uint64;
}
#else
/* No register-page interrupt reporting on this architecture. */
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	return 0;
}
#endif
474 
mshv_vp_dispatch_thread_blocked(struct mshv_vp * vp)475 static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp)
476 {
477 	struct hv_stats_page **stats = vp->vp_stats_pages;
478 	u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->data;
479 	u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->data;
480 
481 	return parent_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED] ||
482 	       self_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED];
483 }
484 
/*
 * Sleep until the hypervisor kicks this VP and the dispatch thread is no
 * longer blocked, or an interrupt is pending for it. Clears the blocked
 * and kicked flags on success; returns -EINTR on signal.
 */
static int
mshv_vp_wait_for_hv_kick(struct mshv_vp *vp)
{
	int ret;

	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       (vp->run.kicked_by_hv == 1 &&
					!mshv_vp_dispatch_thread_blocked(vp)) ||
				       mshv_vp_interrupt_pending(vp));
	if (ret)
		return -EINTR;

	vp->run.flags.root_sched_blocked = 0;
	vp->run.kicked_by_hv = 0;

	return 0;
}
502 
/* Must be called with interrupts enabled */
/*
 * Run @vp under the root scheduler: repeatedly dispatch it to the
 * hypervisor until an intercept needs to be delivered to the VMM, a
 * signal/work item interrupts us, or an error occurs.
 */
static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
{
	long ret;

	if (vp->run.flags.root_sched_blocked) {
		/*
		 * Dispatch state of this VP is blocked. Need to wait
		 * for the hypervisor to clear the blocked state before
		 * dispatching it.
		 */
		ret = mshv_vp_wait_for_hv_kick(vp);
		if (ret)
			return ret;
	}

	do {
		u32 flags = 0;
		struct hv_output_dispatch_vp output;

		/* Handle pending signals/resched before entering the guest. */
		if (__xfer_to_guest_mode_work_pending()) {
			ret = xfer_to_guest_mode_handle_work();
			if (ret)
				break;
		}

		if (vp->run.flags.intercept_suspend)
			flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND;

		if (mshv_vp_interrupt_pending(vp))
			flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION;

		ret = mshv_vp_dispatch(vp, flags, &output);
		if (ret)
			break;

		vp->run.flags.intercept_suspend = 0;

		if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) {
			if (output.dispatch_event ==
						HV_VP_DISPATCH_EVENT_SUSPEND) {
				/*
				 * TODO: remove the warning once VP canceling
				 *	 is supported
				 */
				WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count),
					  "%s: vp#%d: unexpected explicit suspend\n",
					  __func__, vp->vp_index);
				/*
				 * Need to clear explicit suspend before
				 * dispatching.
				 * Explicit suspend is either:
				 * - set right after the first VP dispatch or
				 * - set explicitly via hypercall
				 * Since the latter case is not yet supported,
				 * simply clear it here.
				 */
				ret = mshv_vp_clear_explicit_suspend(vp);
				if (ret)
					break;

				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			} else {
				/* Blocked for another reason; wait it out. */
				vp->run.flags.root_sched_blocked = 1;
				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			}
		} else {
			/* HV_VP_DISPATCH_STATE_READY */
			if (output.dispatch_event ==
						HV_VP_DISPATCH_EVENT_INTERCEPT)
				vp->run.flags.intercept_suspend = 1;
		}
	} while (!vp->run.flags.intercept_suspend);

	rseq_virt_userspace_exit();

	return ret;
}
585 
586 static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
587 	      "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");
588 
589 static struct mshv_mem_region *
mshv_partition_region_by_gfn(struct mshv_partition * partition,u64 gfn)590 mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
591 {
592 	struct mshv_mem_region *region;
593 
594 	hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
595 		if (gfn >= region->start_gfn &&
596 		    gfn < region->start_gfn + region->nr_pages)
597 			return region;
598 	}
599 
600 	return NULL;
601 }
602 
603 static struct mshv_mem_region *
mshv_partition_region_by_gfn_get(struct mshv_partition * p,u64 gfn)604 mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
605 {
606 	struct mshv_mem_region *region;
607 
608 	spin_lock(&p->pt_mem_regions_lock);
609 	region = mshv_partition_region_by_gfn(p, gfn);
610 	if (!region || !mshv_region_get(region)) {
611 		spin_unlock(&p->pt_mem_regions_lock);
612 		return NULL;
613 	}
614 	spin_unlock(&p->pt_mem_regions_lock);
615 
616 	return region;
617 }
618 
/**
 * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts.
 * @vp: Pointer to the virtual processor structure.
 *
 * This function processes GPA intercepts by identifying the memory region
 * corresponding to the intercepted GPA, aligning the page offset, and
 * mapping the required pages. It ensures that the region is valid and
 * handles faults efficiently by mapping multiple pages at once.
 *
 * Return: true if the intercept was handled successfully, false otherwise.
 */
static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
{
	struct mshv_partition *p = vp->vp_partition;
	struct mshv_mem_region *region;
	bool ret;
	u64 gfn;
	/* The intercept message payload layout is architecture-specific. */
#if defined(CONFIG_X86_64)
	struct hv_x64_memory_intercept_message *msg =
		(struct hv_x64_memory_intercept_message *)
		vp->vp_intercept_msg_page->u.payload;
#elif defined(CONFIG_ARM64)
	struct hv_arm64_memory_intercept_message *msg =
		(struct hv_arm64_memory_intercept_message *)
		vp->vp_intercept_msg_page->u.payload;
#endif

	gfn = HVPFN_DOWN(msg->guest_physical_address);

	/* Takes a reference on the region; dropped below. */
	region = mshv_partition_region_by_gfn_get(p, gfn);
	if (!region)
		return false;

	/* Only movable memory ranges are supported for GPA intercepts */
	if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE)
		ret = mshv_region_handle_gfn_fault(region, gfn);
	else
		ret = false;

	mshv_region_put(region);

	return ret;
}
662 
mshv_vp_handle_intercept(struct mshv_vp * vp)663 static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
664 {
665 	switch (vp->vp_intercept_msg_page->header.message_type) {
666 	case HVMSG_GPA_INTERCEPT:
667 		return mshv_handle_gpa_intercept(vp);
668 	}
669 	return false;
670 }
671 
/*
 * MSHV_RUN_VP handler: run the VP with whichever scheduler is active,
 * consuming intercepts in-kernel where possible, and copy the final
 * intercept message out to @ret_msg for the VMM.
 */
static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
{
	long err;

	for (;;) {
		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
			err = mshv_run_vp_with_root_scheduler(vp);
		else
			err = mshv_run_vp_with_hyp_scheduler(vp);

		if (err)
			return err;

		/* Exit to the VMM once an intercept can't be handled here. */
		if (!mshv_vp_handle_intercept(vp))
			break;
	}

	if (copy_to_user(ret_msg, vp->vp_intercept_msg_page,
			 sizeof(struct hv_message)))
		err = -EFAULT;

	return err;
}
692 
/*
 * Get or set VP state for state types transferred via guest page frames:
 * pin the userspace pages and hand them to the hypervisor directly.
 *
 * @user_pfn: first userspace PFN of the buffer.
 * @page_count: number of pages in the buffer.
 * @is_set: true to set state, false to get it.
 */
static int
mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp,
				struct hv_vp_state_data state_data,
				unsigned long user_pfn, size_t page_count,
				bool is_set)
{
	int completed, ret = 0;
	unsigned long check;
	struct page **pages;

	if (page_count > INT_MAX)
		return -EINVAL;
	/*
	 * Check the arithmetic for wraparound/overflow.
	 * The last page address in the buffer is:
	 * (user_pfn + (page_count - 1)) * PAGE_SIZE
	 */
	if (check_add_overflow(user_pfn, (page_count - 1), &check))
		return -EOVERFLOW;
	if (check_mul_overflow(check, PAGE_SIZE, &check))
		return -EOVERFLOW;

	/* Pin user pages so hypervisor can copy directly to them */
	pages = kzalloc_objs(struct page *, page_count);
	if (!pages)
		return -ENOMEM;

	/* May pin fewer pages per call than requested; loop until done. */
	for (completed = 0; completed < page_count; completed += ret) {
		unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE;
		int remaining = page_count - completed;

		ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE,
					  &pages[completed]);
		if (ret < 0) {
			vp_err(vp, "%s: Failed to pin user pages error %i\n",
			       __func__, ret);
			goto unpin_pages;
		}
	}

	if (is_set)
		ret = hv_call_set_vp_state(vp->vp_index,
					   vp->vp_partition->pt_id,
					   state_data, page_count, pages,
					   0, NULL);
	else
		ret = hv_call_get_vp_state(vp->vp_index,
					   vp->vp_partition->pt_id,
					   state_data, page_count, pages,
					   NULL);

unpin_pages:
	/* 'completed' counts only the pages actually pinned so far. */
	unpin_user_pages(pages, completed);
	kfree(pages);
	return ret;
}
749 
/*
 * MSHV_GET_VP_STATE / MSHV_SET_VP_STATE handler: validate the request,
 * translate the uAPI state type to the hypervisor state type, then either
 * delegate PFN-based transfers to the helper or move the state through an
 * on-stack buffer. Always reports the required buffer size back to the
 * caller via user_args->buf_sz.
 */
static long
mshv_vp_ioctl_get_set_state(struct mshv_vp *vp,
			    struct mshv_get_set_vp_state __user *user_args,
			    bool is_set)
{
	struct mshv_get_set_vp_state args;
	long ret = 0;
	union hv_output_get_vp_state vp_state;
	u32 data_sz;
	struct hv_vp_state_data state_data = {};

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) ||
	    !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) ||
	    !PAGE_ALIGNED(args.buf_ptr))
		return -EINVAL;

	if (!access_ok((void __user *)args.buf_ptr, args.buf_sz))
		return -EFAULT;

	/* Map uAPI state type to hypervisor state type and data size. */
	switch (args.type) {
	case MSHV_VP_STATE_LAPIC:
		state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_XSAVE:
	{
		u64 data_sz_64;

		/* XSAVE size/feature set is a per-partition property. */
		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_XSAVE_STATES,
						     &state_data.xsave.states.as_uint64);
		if (ret)
			return ret;

		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE,
						     &data_sz_64);
		if (ret)
			return ret;

		data_sz = (u32)data_sz_64;
		state_data.xsave.flags = 0;
		/* Always request legacy states */
		state_data.xsave.states.legacy_x87 = 1;
		state_data.xsave.states.legacy_sse = 1;
		state_data.type = HV_GET_SET_VP_STATE_XSAVE;
		break;
	}
	case MSHV_VP_STATE_SIMP:
		state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SIEFP:
		state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SYNTHETIC_TIMERS:
		state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS;
		data_sz = sizeof(vp_state.synthetic_timers_state);
		break;
	default:
		return -EINVAL;
	}

	/* Report the required size even when the supplied buffer is short. */
	if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz)))
		return -EFAULT;

	if (data_sz > args.buf_sz)
		return -EINVAL;

	/* If the data is transmitted via pfns, delegate to helper */
	if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) {
		unsigned long user_pfn = PFN_DOWN(args.buf_ptr);
		size_t page_count = PFN_DOWN(args.buf_sz);

		return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn,
						       page_count, is_set);
	}

	/* Paranoia check - this shouldn't happen! */
	if (data_sz > sizeof(vp_state)) {
		vp_err(vp, "Invalid vp state data size!\n");
		return -EINVAL;
	}

	if (is_set) {
		if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz))
			return -EFAULT;

		return hv_call_set_vp_state(vp->vp_index,
					    vp->vp_partition->pt_id,
					    state_data, 0, NULL,
					    sizeof(vp_state), (u8 *)&vp_state);
	}

	ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id,
				   state_data, 0, NULL, &vp_state);
	if (ret)
		return ret;

	if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz))
		return -EFAULT;

	return 0;
}
858 
/* Dispatch ioctls issued on a VP fd; all ops serialize on vp_mutex. */
static long
mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct mshv_vp *vp = filp->private_data;
	long r = -ENOTTY;

	/* Killable so a stuck VP fd cannot pin an unkillable task. */
	if (mutex_lock_killable(&vp->vp_mutex))
		return -EINTR;

	switch (ioctl) {
	case MSHV_RUN_VP:
		r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg);
		break;
	case MSHV_GET_VP_STATE:
		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false);
		break;
	case MSHV_SET_VP_STATE:
		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true);
		break;
	case MSHV_ROOT_HVCALL:
		/* Not partition-locked here, so async hvcalls are rejected. */
		r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false,
					       (void __user *)arg);
		break;
	default:
		vp_warn(vp, "Invalid ioctl: %#x\n", ioctl);
		break;
	}
	mutex_unlock(&vp->vp_mutex);

	return r;
}
890 
mshv_vp_fault(struct vm_fault * vmf)891 static vm_fault_t mshv_vp_fault(struct vm_fault *vmf)
892 {
893 	struct mshv_vp *vp = vmf->vma->vm_file->private_data;
894 
895 	switch (vmf->vma->vm_pgoff) {
896 	case MSHV_VP_MMAP_OFFSET_REGISTERS:
897 		vmf->page = virt_to_page(vp->vp_register_page);
898 		break;
899 	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
900 		vmf->page = virt_to_page(vp->vp_intercept_msg_page);
901 		break;
902 	case MSHV_VP_MMAP_OFFSET_GHCB:
903 		vmf->page = virt_to_page(vp->vp_ghcb_page);
904 		break;
905 	default:
906 		return VM_FAULT_SIGBUS;
907 	}
908 
909 	get_page(vmf->page);
910 
911 	return 0;
912 }
913 
mshv_vp_mmap(struct file * file,struct vm_area_struct * vma)914 static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma)
915 {
916 	struct mshv_vp *vp = file->private_data;
917 
918 	switch (vma->vm_pgoff) {
919 	case MSHV_VP_MMAP_OFFSET_REGISTERS:
920 		if (!vp->vp_register_page)
921 			return -ENODEV;
922 		break;
923 	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
924 		if (!vp->vp_intercept_msg_page)
925 			return -ENODEV;
926 		break;
927 	case MSHV_VP_MMAP_OFFSET_GHCB:
928 		if (!vp->vp_ghcb_page)
929 			return -ENODEV;
930 		break;
931 	default:
932 		return -EINVAL;
933 	}
934 
935 	vma->vm_ops = &mshv_vp_vm_ops;
936 	return 0;
937 }
938 
/* Release handler for a VP fd: drop the partition reference it holds. */
static int
mshv_vp_release(struct inode *inode, struct file *filp)
{
	struct mshv_vp *vp = filp->private_data;

	/* Rest of VP cleanup happens in destroy_partition() */
	mshv_partition_put(vp->vp_partition);
	return 0;
}
948 
/*
 * Unmap a VP's hypervisor stats pages (self area, and parent area when it
 * is a distinct mapping). Errors are logged but not propagated.
 */
void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
			 struct hv_stats_page *stats_pages[])
{
	union hv_stats_object_identity identity = {
		.vp.partition_id = partition_id,
		.vp.vp_index = vp_index,
	};
	int err;

	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
				  stats_pages[HV_STATS_AREA_SELF],
				  &identity);
	if (err)
		pr_err("%s: failed to unmap partition %llu vp %u self stats, err: %d\n",
		       __func__, partition_id, vp_index, err);

	/*
	 * The parent entry may alias the self page (see mshv_vp_stats_map());
	 * only unmap it when it is a separate mapping.
	 */
	if (stats_pages[HV_STATS_AREA_PARENT] != stats_pages[HV_STATS_AREA_SELF]) {
		identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
		err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
					  stats_pages[HV_STATS_AREA_PARENT],
					  &identity);
		if (err)
			pr_err("%s: failed to unmap partition %llu vp %u parent stats, err: %d\n",
			       __func__, partition_id, vp_index, err);
	}
}
976 
/*
 * Map a VP's hypervisor stats pages. The self area is always mapped; the
 * parent area is mapped separately where accessible, otherwise the parent
 * slot aliases the self page so callers can index both unconditionally.
 *
 * Returns 0 on success or a negative errno (self mapping is rolled back
 * if the parent mapping fails).
 */
int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
		      struct hv_stats_page *stats_pages[])
{
	union hv_stats_object_identity identity = {
		.vp.partition_id = partition_id,
		.vp.vp_index = vp_index,
	};
	int err;

	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
				&stats_pages[HV_STATS_AREA_SELF]);
	if (err) {
		pr_err("%s: failed to map partition %llu vp %u self stats, err: %d\n",
		       __func__, partition_id, vp_index, err);
		return err;
	}

	/*
	 * L1VH partition cannot access its vp stats in parent area.
	 */
	if (is_l1vh_parent(partition_id)) {
		stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
	} else {
		identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
		err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
					&stats_pages[HV_STATS_AREA_PARENT]);
		if (err) {
			pr_err("%s: failed to map partition %llu vp %u parent stats, err: %d\n",
			       __func__, partition_id, vp_index, err);
			goto unmap_self;
		}
		/* Fall back to aliasing self if no parent page was provided. */
		if (!stats_pages[HV_STATS_AREA_PARENT])
			stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
	}

	return 0;

unmap_self:
	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	hv_unmap_stats_page(HV_STATS_OBJECT_VP,
			    stats_pages[HV_STATS_AREA_SELF],
			    &identity);
	return err;
}
1022 
1023 static long
mshv_partition_ioctl_create_vp(struct mshv_partition * partition,void __user * arg)1024 mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
1025 			       void __user *arg)
1026 {
1027 	struct mshv_create_vp args;
1028 	struct mshv_vp *vp;
1029 	struct page *intercept_msg_page, *register_page, *ghcb_page;
1030 	struct hv_stats_page *stats_pages[2];
1031 	long ret;
1032 
1033 	if (copy_from_user(&args, arg, sizeof(args)))
1034 		return -EFAULT;
1035 
1036 	if (args.vp_index >= MSHV_MAX_VPS)
1037 		return -EINVAL;
1038 
1039 	if (partition->pt_vp_array[args.vp_index])
1040 		return -EEXIST;
1041 
1042 	ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index,
1043 				0 /* Only valid for root partition VPs */);
1044 	if (ret)
1045 		return ret;
1046 
1047 	ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
1048 				   HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
1049 				   input_vtl_zero, &intercept_msg_page);
1050 	if (ret)
1051 		goto destroy_vp;
1052 
1053 	if (!mshv_partition_encrypted(partition)) {
1054 		ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
1055 					   HV_VP_STATE_PAGE_REGISTERS,
1056 					   input_vtl_zero, &register_page);
1057 		if (ret)
1058 			goto unmap_intercept_message_page;
1059 	}
1060 
1061 	if (mshv_partition_encrypted(partition) &&
1062 	    is_ghcb_mapping_available()) {
1063 		ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
1064 					   HV_VP_STATE_PAGE_GHCB,
1065 					   input_vtl_normal, &ghcb_page);
1066 		if (ret)
1067 			goto unmap_register_page;
1068 	}
1069 
1070 	ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
1071 				stats_pages);
1072 	if (ret)
1073 		goto unmap_ghcb_page;
1074 
1075 	vp = kzalloc_obj(*vp);
1076 	if (!vp)
1077 		goto unmap_stats_pages;
1078 
1079 	vp->vp_partition = mshv_partition_get(partition);
1080 	if (!vp->vp_partition) {
1081 		ret = -EBADF;
1082 		goto free_vp;
1083 	}
1084 
1085 	mutex_init(&vp->vp_mutex);
1086 	init_waitqueue_head(&vp->run.vp_suspend_queue);
1087 	atomic64_set(&vp->run.vp_signaled_count, 0);
1088 
1089 	vp->vp_index = args.vp_index;
1090 	vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page);
1091 	if (!mshv_partition_encrypted(partition))
1092 		vp->vp_register_page = page_to_virt(register_page);
1093 
1094 	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
1095 		vp->vp_ghcb_page = page_to_virt(ghcb_page);
1096 
1097 	memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));
1098 
1099 	ret = mshv_debugfs_vp_create(vp);
1100 	if (ret)
1101 		goto put_partition;
1102 
1103 	/*
1104 	 * Keep anon_inode_getfd last: it installs fd in the file struct and
1105 	 * thus makes the state accessible in user space.
1106 	 */
1107 	ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp,
1108 			       O_RDWR | O_CLOEXEC);
1109 	if (ret < 0)
1110 		goto remove_debugfs_vp;
1111 
1112 	/* already exclusive with the partition mutex for all ioctls */
1113 	partition->pt_vp_count++;
1114 	partition->pt_vp_array[args.vp_index] = vp;
1115 
1116 	return ret;
1117 
1118 remove_debugfs_vp:
1119 	mshv_debugfs_vp_remove(vp);
1120 put_partition:
1121 	mshv_partition_put(partition);
1122 free_vp:
1123 	kfree(vp);
1124 unmap_stats_pages:
1125 	mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages);
1126 unmap_ghcb_page:
1127 	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
1128 		hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
1129 				       HV_VP_STATE_PAGE_GHCB, ghcb_page,
1130 				       input_vtl_normal);
1131 unmap_register_page:
1132 	if (!mshv_partition_encrypted(partition))
1133 		hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
1134 				       HV_VP_STATE_PAGE_REGISTERS,
1135 				       register_page, input_vtl_zero);
1136 unmap_intercept_message_page:
1137 	hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
1138 			       HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
1139 			       intercept_msg_page, input_vtl_zero);
1140 destroy_vp:
1141 	hv_call_delete_vp(partition->pt_id, args.vp_index);
1142 	return ret;
1143 }
1144 
mshv_init_async_handler(struct mshv_partition * partition)1145 static int mshv_init_async_handler(struct mshv_partition *partition)
1146 {
1147 	if (completion_done(&partition->async_hypercall)) {
1148 		pt_err(partition,
1149 		       "Cannot issue async hypercall while another one in progress!\n");
1150 		return -EPERM;
1151 	}
1152 
1153 	reinit_completion(&partition->async_hypercall);
1154 	return 0;
1155 }
1156 
/*
 * Block until the partition's outstanding async hypercall signals
 * completion, then hand its final status back through *status.
 * @data is the struct mshv_partition the hypercall was issued for.
 */
static void mshv_async_hvcall_handler(void *data, u64 *status)
{
	struct mshv_partition *partition = data;

	wait_for_completion(&partition->async_hypercall);
	pt_dbg(partition, "Async hypercall completed!\n");

	/* Status is published by the completion signaler before complete(). */
	*status = partition->async_hypercall_status;
}
1166 
1167 /*
1168  * NB: caller checks and makes sure mem->size is page aligned
1169  * Returns: 0 with regionpp updated on success, or -errno
1170  */
static int mshv_partition_create_region(struct mshv_partition *partition,
					struct mshv_user_mem_region *mem,
					struct mshv_mem_region **regionpp,
					bool is_mmio)
{
	u64 nr_pages = HVPFN_DOWN(mem->size);
	struct mshv_mem_region *rg;
	bool overlaps = false;

	/* Reject a request that intersects any existing region. */
	spin_lock(&partition->pt_mem_regions_lock);
	hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) {
		if (mem->guest_pfn < rg->start_gfn + rg->nr_pages &&
		    rg->start_gfn < mem->guest_pfn + nr_pages) {
			overlaps = true;
			break;
		}
	}
	spin_unlock(&partition->pt_mem_regions_lock);

	if (overlaps)
		return -EEXIST;

	rg = mshv_region_create(mem->guest_pfn, nr_pages,
				mem->userspace_addr, mem->flags);
	if (IS_ERR(rg))
		return PTR_ERR(rg);

	/*
	 * Classify the region: mmio is explicit; RAM is movable only for
	 * non-encrypted partitions whose movable init succeeds, otherwise
	 * it must stay pinned.
	 */
	if (is_mmio)
		rg->mreg_type = MSHV_REGION_TYPE_MMIO;
	else if (!mshv_partition_encrypted(partition) &&
		 mshv_region_movable_init(rg))
		rg->mreg_type = MSHV_REGION_TYPE_MEM_MOVABLE;
	else
		rg->mreg_type = MSHV_REGION_TYPE_MEM_PINNED;

	rg->partition = partition;

	*regionpp = rg;

	return 0;
}
1209 
/**
 * mshv_prepare_pinned_region - Pin and map memory regions
 * @region: Pointer to the memory region structure
 *
 * This function processes memory regions that are explicitly marked as pinned.
 * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based
 * population. The function ensures the region is properly populated, handles
 * encryption requirements for SNP partitions if applicable, maps the region,
 * and performs necessary sharing or eviction operations based on the mapping
 * result.
 *
 * Return: 0 on success, negative error code on failure.
 */
static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
{
	struct mshv_partition *partition = region->partition;
	int ret;

	/* Populate and pin the pages up front; nothing is fault-driven here. */
	ret = mshv_region_pin(region);
	if (ret) {
		pt_err(partition, "Failed to pin memory region: %d\n",
		       ret);
		goto err_out;
	}

	/*
	 * For an SNP partition it is a requirement that for every memory region
	 * that we are going to map for this partition we should make sure that
	 * host access to that region is released. This is ensured by doing an
	 * additional hypercall which will update the SLAT to release host
	 * access to guest memory regions.
	 */
	if (mshv_partition_encrypted(partition)) {
		ret = mshv_region_unshare(region);
		if (ret) {
			pt_err(partition,
			       "Failed to unshare memory region (guest_pfn: %llu): %d\n",
			       region->start_gfn, ret);
			goto invalidate_region;
		}
	}

	/* Install the region in the guest's address space. */
	ret = mshv_region_map(region);
	if (ret && mshv_partition_encrypted(partition)) {
		int shrc;

		/* Map failed after unshare: try to restore host access. */
		shrc = mshv_region_share(region);
		if (!shrc)
			goto invalidate_region;

		pt_err(partition,
		       "Failed to share memory region (guest_pfn: %llu): %d\n",
		       region->start_gfn, shrc);
		/*
		 * Don't unpin if marking shared failed because pages are no
		 * longer mapped in the host, ie root, anymore.
		 */
		goto err_out;
	}

	return 0;

invalidate_region:
	mshv_region_invalidate(region);
err_out:
	return ret;
}
1277 
/*
 * This maps two things: guest RAM and for pci passthru mmio space.
 *
 * mmio:
 *  - vfio overloads vm_pgoff to store the mmio start pfn/spa.
 *  - Two things need to happen for mapping mmio range:
 *	1. mapped in the uaddr so VMM can access it.
 *	2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it.
 *
 *   This function takes care of the second. The first one is managed by vfio,
 *   and hence is taken care of via vfio_pci_mmap_fault().
 */
static long
mshv_map_user_memory(struct mshv_partition *partition,
		     struct mshv_user_mem_region mem)
{
	struct mshv_mem_region *region;
	struct vm_area_struct *vma;
	bool is_mmio;
	ulong mmio_pfn;
	long ret;

	/* Unmap requests are handled by mshv_unmap_user_memory() instead. */
	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
	    !access_ok((const void __user *)mem.userspace_addr, mem.size))
		return -EINVAL;

	/*
	 * Snapshot the VMA properties under mmap_read_lock; only the
	 * values copied here are used after the lock is dropped.
	 */
	mmap_read_lock(current->mm);
	vma = vma_lookup(current->mm, mem.userspace_addr);
	is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
	mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
	mmap_read_unlock(current->mm);

	if (!vma)
		return -EINVAL;

	ret = mshv_partition_create_region(partition, &mem, &region,
					   is_mmio);
	if (ret)
		return ret;

	/* Region type was decided by mshv_partition_create_region(). */
	switch (region->mreg_type) {
	case MSHV_REGION_TYPE_MEM_PINNED:
		ret = mshv_prepare_pinned_region(region);
		break;
	case MSHV_REGION_TYPE_MEM_MOVABLE:
		/*
		 * For movable memory regions, remap with no access to let
		 * the hypervisor track dirty pages, enabling pre-copy live
		 * migration.
		 */
		ret = hv_call_map_gpa_pages(partition->pt_id,
					    region->start_gfn,
					    region->nr_pages,
					    HV_MAP_GPA_NO_ACCESS, NULL);
		break;
	case MSHV_REGION_TYPE_MMIO:
		ret = hv_call_map_mmio_pages(partition->pt_id,
					     region->start_gfn,
					     mmio_pfn,
					     region->nr_pages);
		break;
	}

	if (ret)
		goto errout;

	/* Publish the region only after it is fully set up. */
	spin_lock(&partition->pt_mem_regions_lock);
	hlist_add_head(&region->hnode, &partition->pt_mem_regions);
	spin_unlock(&partition->pt_mem_regions_lock);

	return 0;

errout:
	/*
	 * NOTE(review): plain vfree() here presumably matches how
	 * mshv_region_create() allocates an unpublished region — confirm it
	 * needs no mshv_region_put()-style teardown on this path.
	 */
	vfree(region);
	return ret;
}
1354 
1355 /* Called for unmapping both the guest ram and the mmio space */
1356 static long
mshv_unmap_user_memory(struct mshv_partition * partition,struct mshv_user_mem_region mem)1357 mshv_unmap_user_memory(struct mshv_partition *partition,
1358 		       struct mshv_user_mem_region mem)
1359 {
1360 	struct mshv_mem_region *region;
1361 
1362 	if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
1363 		return -EINVAL;
1364 
1365 	spin_lock(&partition->pt_mem_regions_lock);
1366 
1367 	region = mshv_partition_region_by_gfn(partition, mem.guest_pfn);
1368 	if (!region) {
1369 		spin_unlock(&partition->pt_mem_regions_lock);
1370 		return -ENOENT;
1371 	}
1372 
1373 	/* Paranoia check */
1374 	if (region->start_uaddr != mem.userspace_addr ||
1375 	    region->start_gfn != mem.guest_pfn ||
1376 	    region->nr_pages != HVPFN_DOWN(mem.size)) {
1377 		spin_unlock(&partition->pt_mem_regions_lock);
1378 		return -EINVAL;
1379 	}
1380 
1381 	hlist_del(&region->hnode);
1382 
1383 	spin_unlock(&partition->pt_mem_regions_lock);
1384 
1385 	mshv_region_put(region);
1386 
1387 	return 0;
1388 }
1389 
1390 static long
mshv_partition_ioctl_set_memory(struct mshv_partition * partition,struct mshv_user_mem_region __user * user_mem)1391 mshv_partition_ioctl_set_memory(struct mshv_partition *partition,
1392 				struct mshv_user_mem_region __user *user_mem)
1393 {
1394 	struct mshv_user_mem_region mem;
1395 
1396 	if (copy_from_user(&mem, user_mem, sizeof(mem)))
1397 		return -EFAULT;
1398 
1399 	if (!mem.size ||
1400 	    !PAGE_ALIGNED(mem.size) ||
1401 	    !PAGE_ALIGNED(mem.userspace_addr) ||
1402 	    (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) ||
1403 	    mshv_field_nonzero(mem, rsvd))
1404 		return -EINVAL;
1405 
1406 	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))
1407 		return mshv_unmap_user_memory(partition, mem);
1408 
1409 	return mshv_map_user_memory(partition, mem);
1410 }
1411 
1412 static long
mshv_partition_ioctl_ioeventfd(struct mshv_partition * partition,void __user * user_args)1413 mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition,
1414 			       void __user *user_args)
1415 {
1416 	struct mshv_user_ioeventfd args;
1417 
1418 	if (copy_from_user(&args, user_args, sizeof(args)))
1419 		return -EFAULT;
1420 
1421 	return mshv_set_unset_ioeventfd(partition, &args);
1422 }
1423 
1424 static long
mshv_partition_ioctl_irqfd(struct mshv_partition * partition,void __user * user_args)1425 mshv_partition_ioctl_irqfd(struct mshv_partition *partition,
1426 			   void __user *user_args)
1427 {
1428 	struct mshv_user_irqfd args;
1429 
1430 	if (copy_from_user(&args, user_args, sizeof(args)))
1431 		return -EFAULT;
1432 
1433 	return mshv_set_unset_irqfd(partition, &args);
1434 }
1435 
/*
 * Handle MSHV_GET_GPAP_ACCESS_BITMAP: query (and optionally clear/set)
 * per-page accessed/dirty state for a range of guest physical pages and
 * return it to user space as a bitmap, one bit per page.
 */
static long
mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition,
					    void __user *user_args)
{
	struct mshv_gpap_access_bitmap args;
	union hv_gpa_page_access_state *states;
	long ret, i;
	union hv_gpa_page_access_state_flags hv_flags = {};
	u8 hv_type_mask;
	ulong bitmap_buf_sz, states_buf_sz;
	int written = 0;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT ||
	    args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT ||
	    mshv_field_nonzero(args, rsvd) || !args.page_count ||
	    !args.bitmap_ptr)
		return -EINVAL;

	/* One hv_gpa_page_access_state entry per page; guard the multiply. */
	if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz))
		return -E2BIG;

	/* Num bytes needed to store bitmap; one bit per page rounded up */
	bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8);

	/* Sanity check */
	if (bitmap_buf_sz > states_buf_sz)
		return -EBADFD;

	/* Translate the uapi access type/op into hypervisor flags. */
	switch (args.access_type) {
	case MSHV_GPAP_ACCESS_TYPE_ACCESSED:
		hv_type_mask = 1;
		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
			hv_flags.clear_accessed = 1;
			/* not accessed implies not dirty */
			hv_flags.clear_dirty = 1;
		} else { /* MSHV_GPAP_ACCESS_OP_SET */
			hv_flags.set_accessed = 1;
		}
		break;
	case MSHV_GPAP_ACCESS_TYPE_DIRTY:
		hv_type_mask = 2;
		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
			hv_flags.clear_dirty = 1;
		} else { /* MSHV_GPAP_ACCESS_OP_SET */
			hv_flags.set_dirty = 1;
			/* dirty implies accessed */
			hv_flags.set_accessed = 1;
		}
		break;
	}

	states = vzalloc(states_buf_sz);
	if (!states)
		return -ENOMEM;

	ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count,
					    args.gpap_base, hv_flags, &written,
					    states);
	if (ret)
		goto free_return;

	/*
	 * Overwrite states buffer with bitmap - the bits in hv_type_mask
	 * correspond to bitfields in hv_gpa_page_access_state
	 */
	/* Safe in place: bit i is written strictly after byte i is read. */
	for (i = 0; i < written; ++i)
		__assign_bit(i, (ulong *)states,
			     states[i].as_uint8 & hv_type_mask);

	/* zero the unused bits in the last byte(s) of the returned bitmap */
	for (i = written; i < bitmap_buf_sz * 8; ++i)
		__clear_bit(i, (ulong *)states);

	if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz))
		ret = -EFAULT;

free_return:
	vfree(states);
	return ret;
}
1519 
1520 static long
mshv_partition_ioctl_set_msi_routing(struct mshv_partition * partition,void __user * user_args)1521 mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition,
1522 				     void __user *user_args)
1523 {
1524 	struct mshv_user_irq_entry *entries = NULL;
1525 	struct mshv_user_irq_table args;
1526 	long ret;
1527 
1528 	if (copy_from_user(&args, user_args, sizeof(args)))
1529 		return -EFAULT;
1530 
1531 	if (args.nr > MSHV_MAX_GUEST_IRQS ||
1532 	    mshv_field_nonzero(args, rsvd))
1533 		return -EINVAL;
1534 
1535 	if (args.nr) {
1536 		struct mshv_user_irq_table __user *urouting = user_args;
1537 
1538 		entries = vmemdup_user(urouting->entries,
1539 				       array_size(sizeof(*entries),
1540 						  args.nr));
1541 		if (IS_ERR(entries))
1542 			return PTR_ERR(entries);
1543 	}
1544 	ret = mshv_update_routing_table(partition, entries, args.nr);
1545 	kvfree(entries);
1546 
1547 	return ret;
1548 }
1549 
/*
 * Handle MSHV_INITIALIZE_PARTITION. Idempotent: returns 0 immediately if
 * the partition is already initialized. On failure the hypervisor-side
 * initialization is rolled back and deposited memory is withdrawn.
 */
static long
mshv_partition_ioctl_initialize(struct mshv_partition *partition)
{
	long ret;

	if (partition->pt_initialized)
		return 0;

	ret = hv_call_initialize_partition(partition->pt_id);
	if (ret)
		goto withdraw_mem;

	ret = mshv_debugfs_partition_create(partition);
	if (ret)
		goto finalize_partition;

	partition->pt_initialized = true;

	return 0;

finalize_partition:
	/* Undo hv_call_initialize_partition() before withdrawing memory. */
	hv_call_finalize_partition(partition->pt_id);
withdraw_mem:
	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);

	return ret;
}
1577 
/*
 * Top-level ioctl dispatcher for a partition fd. All partition ioctls are
 * serialized by pt_mutex, so the individual handlers never run concurrently
 * with each other for the same partition.
 */
static long
mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct mshv_partition *partition = filp->private_data;
	long ret;
	void __user *uarg = (void __user *)arg;

	/* Killable so a stuck ioctl does not make the task unkillable. */
	if (mutex_lock_killable(&partition->pt_mutex))
		return -EINTR;

	switch (ioctl) {
	case MSHV_INITIALIZE_PARTITION:
		ret = mshv_partition_ioctl_initialize(partition);
		break;
	case MSHV_SET_GUEST_MEMORY:
		ret = mshv_partition_ioctl_set_memory(partition, uarg);
		break;
	case MSHV_CREATE_VP:
		ret = mshv_partition_ioctl_create_vp(partition, uarg);
		break;
	case MSHV_IRQFD:
		ret = mshv_partition_ioctl_irqfd(partition, uarg);
		break;
	case MSHV_IOEVENTFD:
		ret = mshv_partition_ioctl_ioeventfd(partition, uarg);
		break;
	case MSHV_SET_MSI_ROUTING:
		ret = mshv_partition_ioctl_set_msi_routing(partition, uarg);
		break;
	case MSHV_GET_GPAP_ACCESS_BITMAP:
		ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition,
								  uarg);
		break;
	case MSHV_ROOT_HVCALL:
		ret = mshv_ioctl_passthru_hvcall(partition, true, uarg);
		break;
	default:
		ret = -ENOTTY;
	}

	mutex_unlock(&partition->pt_mutex);
	return ret;
}
1621 
1622 static int
disable_vp_dispatch(struct mshv_vp * vp)1623 disable_vp_dispatch(struct mshv_vp *vp)
1624 {
1625 	int ret;
1626 	struct hv_register_assoc dispatch_suspend = {
1627 		.name = HV_REGISTER_DISPATCH_SUSPEND,
1628 		.value.dispatch_suspend.suspended = 1,
1629 	};
1630 
1631 	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
1632 				    1, &dispatch_suspend);
1633 	if (ret)
1634 		vp_err(vp, "failed to suspend\n");
1635 
1636 	return ret;
1637 }
1638 
1639 static int
get_vp_signaled_count(struct mshv_vp * vp,u64 * count)1640 get_vp_signaled_count(struct mshv_vp *vp, u64 *count)
1641 {
1642 	int ret;
1643 	struct hv_register_assoc root_signal_count = {
1644 		.name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT,
1645 	};
1646 
1647 	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
1648 				    1, &root_signal_count);
1649 
1650 	if (ret) {
1651 		vp_err(vp, "Failed to get root signal count");
1652 		*count = 0;
1653 		return ret;
1654 	}
1655 
1656 	*count = root_signal_count.value.reg64;
1657 
1658 	return ret;
1659 }
1660 
/*
 * Wait until this task has consumed every signal the hypervisor generated
 * for the VP, i.e. until the local consumed count catches up with the
 * hypervisor's counter. Caller must have already disabled VP dispatch so
 * the hypervisor-side count can no longer grow.
 */
static void
drain_vp_signals(struct mshv_vp *vp)
{
	u64 hv_signal_count;
	u64 vp_signal_count;

	get_vp_signaled_count(vp, &hv_signal_count);

	vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);

	/*
	 * There should be at most 1 outstanding notification, but be extra
	 * careful anyway.
	 */
	while (hv_signal_count != vp_signal_count) {
		WARN_ON(hv_signal_count - vp_signal_count != 1);

		/* Bail out on a fatal signal rather than wait forever. */
		if (wait_event_interruptible(vp->run.vp_suspend_queue,
					     vp->run.kicked_by_hv == 1))
			break;
		vp->run.kicked_by_hv = 0;
		vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
	}
}
1685 
/* Suspend dispatch for every VP of the partition and drain their signals. */
static void drain_all_vps(const struct mshv_partition *partition)
{
	int i;
	struct mshv_vp *vp;

	/*
	 * VPs are reachable from ISR. It is safe to not take the partition
	 * lock because nobody else can enter this function and drop the
	 * partition from the list.
	 */
	for (i = 0; i < MSHV_MAX_VPS; i++) {
		vp = partition->pt_vp_array[i];
		if (!vp)
			continue;
		/*
		 * Disable dispatching of the VP in the hypervisor. After this
		 * the hypervisor guarantees it won't generate any signals for
		 * the VP and the hypervisor's VP signal count won't change.
		 */
		disable_vp_dispatch(vp);
		drain_vp_signals(vp);
	}
}
1709 
/*
 * Unlink the partition from the global hash table and wait for an RCU
 * grace period so that concurrent RCU readers (mshv_partition_find())
 * can no longer observe it.
 */
static void
remove_partition(struct mshv_partition *partition)
{
	spin_lock(&mshv_root.pt_ht_lock);
	hlist_del_rcu(&partition->pt_hnode);
	spin_unlock(&mshv_root.pt_ht_lock);

	/* Ensure no RCU reader still holds a reference to the node. */
	synchronize_rcu();
}
1719 
/*
 * Tear down a partition and remove it from the list.
 * Partition's refcount must be 0
 */
static void destroy_partition(struct mshv_partition *partition)
{
	struct mshv_vp *vp;
	struct mshv_mem_region *region;
	struct hlist_node *n;
	int i;

	/* Defensive: callers reach here only via the last mshv_partition_put(). */
	if (refcount_read(&partition->pt_ref_count)) {
		pt_err(partition,
		       "Attempt to destroy partition but refcount > 0\n");
		return;
	}

	if (partition->pt_initialized) {
		/*
		 * We only need to drain signals for root scheduler. This should be
		 * done before removing the partition from the partition list.
		 */
		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
			drain_all_vps(partition);

		/* Remove vps */
		for (i = 0; i < MSHV_MAX_VPS; ++i) {
			vp = partition->pt_vp_array[i];
			if (!vp)
				continue;

			mshv_debugfs_vp_remove(vp);
			mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
					    vp->vp_stats_pages);

			/* Register page exists only for non-encrypted partitions. */
			if (vp->vp_register_page) {
				(void)hv_unmap_vp_state_page(partition->pt_id,
							     vp->vp_index,
							     HV_VP_STATE_PAGE_REGISTERS,
							     virt_to_page(vp->vp_register_page),
							     input_vtl_zero);
				vp->vp_register_page = NULL;
			}

			(void)hv_unmap_vp_state_page(partition->pt_id,
						     vp->vp_index,
						     HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
						     virt_to_page(vp->vp_intercept_msg_page),
						     input_vtl_zero);
			vp->vp_intercept_msg_page = NULL;

			/* GHCB page exists only for encrypted partitions. */
			if (vp->vp_ghcb_page) {
				(void)hv_unmap_vp_state_page(partition->pt_id,
							     vp->vp_index,
							     HV_VP_STATE_PAGE_GHCB,
							     virt_to_page(vp->vp_ghcb_page),
							     input_vtl_normal);
				vp->vp_ghcb_page = NULL;
			}

			kfree(vp);

			partition->pt_vp_array[i] = NULL;
		}

		mshv_debugfs_partition_remove(partition);

		/* Deallocates and unmaps everything including vcpus, GPA mappings etc */
		hv_call_finalize_partition(partition->pt_id);

		partition->pt_initialized = false;
	}

	/* After this no lookup can find the partition; safe to free state. */
	remove_partition(partition);

	hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
				  hnode) {
		hlist_del(&region->hnode);
		mshv_region_put(region);
	}

	/* Withdraw and free all pages we deposited */
	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
	hv_call_delete_partition(partition->pt_id);

	mshv_free_routing_table(partition);
	kfree(partition);
}
1808 
1809 struct
mshv_partition_get(struct mshv_partition * partition)1810 mshv_partition *mshv_partition_get(struct mshv_partition *partition)
1811 {
1812 	if (refcount_inc_not_zero(&partition->pt_ref_count))
1813 		return partition;
1814 	return NULL;
1815 }
1816 
/*
 * Look up a partition by hypervisor partition id in the global hash table.
 * Caller must be in an RCU read-side critical section, and must take its
 * own reference (mshv_partition_get()) before leaving it.
 */
struct
mshv_partition *mshv_partition_find(u64 partition_id)
	__must_hold(RCU)
{
	struct mshv_partition *p;

	hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode,
				   partition_id)
		if (p->pt_id == partition_id)
			return p;

	return NULL;
}
1830 
/* Drop a partition reference; the last put tears the partition down. */
void
mshv_partition_put(struct mshv_partition *partition)
{
	if (refcount_dec_and_test(&partition->pt_ref_count))
		destroy_partition(partition);
}
1837 
/*
 * Release handler for the partition fd: tear down eventfd state and the
 * irq SRCU domain before dropping the fd's partition reference (which may
 * trigger destroy_partition()).
 */
static int
mshv_partition_release(struct inode *inode, struct file *filp)
{
	struct mshv_partition *partition = filp->private_data;

	mshv_eventfd_release(partition);

	cleanup_srcu_struct(&partition->pt_irq_srcu);

	mshv_partition_put(partition);

	return 0;
}
1851 
/* Insert the partition into the global hash table, keyed by pt_id. */
static int
add_partition(struct mshv_partition *partition)
{
	spin_lock(&mshv_root.pt_ht_lock);

	hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode,
		     partition->pt_id);

	spin_unlock(&mshv_root.pt_ht_lock);

	return 0;
}
1864 
1865 static_assert(MSHV_NUM_CPU_FEATURES_BANKS ==
1866 	      HV_PARTITION_PROCESSOR_FEATURES_BANKS);
1867 
mshv_ioctl_process_pt_flags(void __user * user_arg,u64 * pt_flags,struct hv_partition_creation_properties * cr_props,union hv_partition_isolation_properties * isol_props)1868 static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags,
1869 					struct hv_partition_creation_properties *cr_props,
1870 					union hv_partition_isolation_properties *isol_props)
1871 {
1872 	int i;
1873 	struct mshv_create_partition_v2 args;
1874 	union hv_partition_processor_features *disabled_procs;
1875 	union hv_partition_processor_xsave_features *disabled_xsave;
1876 
1877 	/* First, copy v1 struct in case user is on previous versions */
1878 	if (copy_from_user(&args, user_arg,
1879 			   sizeof(struct mshv_create_partition)))
1880 		return -EFAULT;
1881 
1882 	if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
1883 	    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
1884 		return -EINVAL;
1885 
1886 	disabled_procs = &cr_props->disabled_processor_features;
1887 	disabled_xsave = &cr_props->disabled_processor_xsave_features;
1888 
1889 	/* Check if user provided newer struct with feature fields */
1890 	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) {
1891 		if (copy_from_user(&args, user_arg, sizeof(args)))
1892 			return -EFAULT;
1893 
1894 		/* Re-validate v1 fields after second copy_from_user() */
1895 		if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
1896 		    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
1897 			return -EINVAL;
1898 
1899 		if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS ||
1900 		    mshv_field_nonzero(args, pt_rsvd) ||
1901 		    mshv_field_nonzero(args, pt_rsvd1))
1902 			return -EINVAL;
1903 
1904 		/*
1905 		 * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never
1906 		 * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS
1907 		 * (i.e. 2).
1908 		 *
1909 		 * Further banks (index >= 2) will be modifiable as 'early'
1910 		 * properties via the set partition property hypercall.
1911 		 */
1912 		for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
1913 			disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i];
1914 
1915 #if IS_ENABLED(CONFIG_X86_64)
1916 		disabled_xsave->as_uint64 = args.pt_disabled_xsave;
1917 #else
1918 		/*
1919 		 * In practice this field is ignored on arm64, but safer to
1920 		 * zero it in case it is ever used.
1921 		 */
1922 		disabled_xsave->as_uint64 = 0;
1923 
1924 		if (mshv_field_nonzero(args, pt_rsvd2))
1925 			return -EINVAL;
1926 #endif
1927 	} else {
1928 		/*
1929 		 * v1 behavior: try to enable everything. The hypervisor will
1930 		 * disable features that are not supported. The banks can be
1931 		 * queried via the get partition property hypercall.
1932 		 */
1933 		for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
1934 			disabled_procs->as_uint64[i] = 0;
1935 
1936 		disabled_xsave->as_uint64 = 0;
1937 	}
1938 
1939 	/* Only support EXO partitions */
1940 	*pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
1941 		    HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;
1942 
1943 	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC))
1944 		*pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
1945 	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC))
1946 		*pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
1947 	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES))
1948 		*pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
1949 	if (args.pt_flags & BIT(MSHV_PT_BIT_NESTED_VIRTUALIZATION))
1950 		*pt_flags |= HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE;
1951 	if (args.pt_flags & BIT(MSHV_PT_BIT_SMT_ENABLED_GUEST))
1952 		*pt_flags |= HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST;
1953 
1954 	isol_props->as_uint64 = 0;
1955 
1956 	switch (args.pt_isolation) {
1957 	case MSHV_PT_ISOLATION_NONE:
1958 		isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE;
1959 		break;
1960 	}
1961 
1962 	return 0;
1963 }
1964 
/*
 * Create a child partition: translate userspace arguments, issue the
 * create-partition hypercall, and return an anonymous-inode fd that
 * represents the new partition.
 *
 * Returns the new fd (>= 0) on success, negative errno on failure.
 */
static long
mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
{
	u64 creation_flags;
	struct hv_partition_creation_properties creation_properties;
	union hv_partition_isolation_properties isolation_properties;
	struct mshv_partition *partition;
	long ret;

	/* Translate MSHV_PT_* uapi flags into hypervisor creation properties */
	ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags,
					  &creation_properties,
					  &isolation_properties);
	if (ret)
		return ret;

	partition = kzalloc_obj(*partition);
	if (!partition)
		return -ENOMEM;

	partition->pt_module_dev = module_dev;
	partition->isolation_type = isolation_properties.isolation_type;

	/* Initial reference; released when the partition fd is closed */
	refcount_set(&partition->pt_ref_count, 1);

	mutex_init(&partition->pt_mutex);

	mutex_init(&partition->pt_irq_lock);

	init_completion(&partition->async_hypercall);

	INIT_HLIST_HEAD(&partition->irq_ack_notifier_list);

	INIT_HLIST_HEAD(&partition->pt_devices);

	spin_lock_init(&partition->pt_mem_regions_lock);
	INIT_HLIST_HEAD(&partition->pt_mem_regions);

	mshv_eventfd_init(partition);

	ret = init_srcu_struct(&partition->pt_irq_srcu);
	if (ret)
		goto free_partition;

	ret = hv_call_create_partition(creation_flags,
				       creation_properties,
				       isolation_properties,
				       &partition->pt_id);
	if (ret)
		goto cleanup_irq_srcu;

	/* Publish the partition in the global table so it can be looked up */
	ret = add_partition(partition);
	if (ret)
		goto delete_partition;

	ret = mshv_init_async_handler(partition);
	if (!ret) {
		/* On success the returned fd owns the reference */
		ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition",
							   &mshv_partition_fops,
							   partition, O_RDWR));
		if (ret >= 0)
			return ret;
	}
	/* Unwind in strict reverse order of setup */
	remove_partition(partition);
delete_partition:
	hv_call_delete_partition(partition->pt_id);
cleanup_irq_srcu:
	cleanup_srcu_struct(&partition->pt_irq_srcu);
free_partition:
	kfree(partition);

	return ret;
}
2037 
mshv_dev_ioctl(struct file * filp,unsigned int ioctl,unsigned long arg)2038 static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
2039 			   unsigned long arg)
2040 {
2041 	struct miscdevice *misc = filp->private_data;
2042 
2043 	switch (ioctl) {
2044 	case MSHV_CREATE_PARTITION:
2045 		return mshv_ioctl_create_partition((void __user *)arg,
2046 						misc->this_device);
2047 	case MSHV_ROOT_HVCALL:
2048 		return mshv_ioctl_passthru_hvcall(NULL, false,
2049 					(void __user *)arg);
2050 	}
2051 
2052 	return -ENOTTY;
2053 }
2054 
/* /dev/mshv open: no per-fd state is needed, so this is a no-op. */
static int
mshv_dev_open(struct inode *inode, struct file *filp)
{
	return 0;
}
2060 
/* /dev/mshv release: nothing to tear down (see mshv_dev_open). */
static int
mshv_dev_release(struct inode *inode, struct file *filp)
{
	return 0;
}
2066 
/* Dynamic cpuhp state id for the per-cpu synic setup (CPUHP_AP_ONLINE_DYN) */
static int mshv_cpuhp_online;
/* Dynamic cpuhp state id for the root scheduler per-cpu buffers */
static int mshv_root_sched_online;
2069 
scheduler_type_to_string(enum hv_scheduler_type type)2070 static const char *scheduler_type_to_string(enum hv_scheduler_type type)
2071 {
2072 	switch (type) {
2073 	case HV_SCHEDULER_TYPE_LP:
2074 		return "classic scheduler without SMT";
2075 	case HV_SCHEDULER_TYPE_LP_SMT:
2076 		return "classic scheduler with SMT";
2077 	case HV_SCHEDULER_TYPE_CORE_SMT:
2078 		return "core scheduler";
2079 	case HV_SCHEDULER_TYPE_ROOT:
2080 		return "root scheduler";
2081 	default:
2082 		return "unknown scheduler";
2083 	};
2084 }
2085 
l1vh_retrieve_scheduler_type(enum hv_scheduler_type * out)2086 static int __init l1vh_retrieve_scheduler_type(enum hv_scheduler_type *out)
2087 {
2088 	u64 integrated_sched_enabled;
2089 	int ret;
2090 
2091 	*out = HV_SCHEDULER_TYPE_CORE_SMT;
2092 
2093 	if (!mshv_root.vmm_caps.vmm_enable_integrated_scheduler)
2094 		return 0;
2095 
2096 	ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
2097 						HV_PARTITION_PROPERTY_INTEGRATED_SCHEDULER_ENABLED,
2098 						0, &integrated_sched_enabled,
2099 						sizeof(integrated_sched_enabled));
2100 	if (ret)
2101 		return ret;
2102 
2103 	if (integrated_sched_enabled)
2104 		*out = HV_SCHEDULER_TYPE_ROOT;
2105 
2106 	return 0;
2107 }
2108 
/* TODO move this to hv_common.c when needed outside */
static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out)
{
	struct hv_input_get_system_property *input;
	struct hv_output_get_system_property *output;
	unsigned long flags;
	u64 status;

	/*
	 * The per-cpu hypercall argument pages may only be used with
	 * interrupts disabled, to keep the task pinned to this CPU and
	 * the pages from being reused underneath us.
	 */
	local_irq_save(flags);
	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	output = *this_cpu_ptr(hyperv_pcpu_output_arg);

	memset(input, 0, sizeof(*input));
	memset(output, 0, sizeof(*output));
	input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE;

	status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
	if (!hv_result_success(status)) {
		local_irq_restore(flags);
		pr_err("%s: %s\n", __func__, hv_result_to_string(status));
		return hv_result_to_errno(status);
	}

	/* Read the output page before re-enabling interrupts */
	*out = output->scheduler_type;
	local_irq_restore(flags);

	return 0;
}
2137 
2138 /* Retrieve and stash the supported scheduler type */
mshv_retrieve_scheduler_type(struct device * dev)2139 static int __init mshv_retrieve_scheduler_type(struct device *dev)
2140 {
2141 	int ret;
2142 
2143 	if (hv_l1vh_partition())
2144 		ret = l1vh_retrieve_scheduler_type(&hv_scheduler_type);
2145 	else
2146 		ret = hv_retrieve_scheduler_type(&hv_scheduler_type);
2147 	if (ret)
2148 		return ret;
2149 
2150 	dev_info(dev, "Hypervisor using %s\n",
2151 		 scheduler_type_to_string(hv_scheduler_type));
2152 
2153 	switch (hv_scheduler_type) {
2154 	case HV_SCHEDULER_TYPE_CORE_SMT:
2155 	case HV_SCHEDULER_TYPE_LP_SMT:
2156 	case HV_SCHEDULER_TYPE_ROOT:
2157 	case HV_SCHEDULER_TYPE_LP:
2158 		/* Supported scheduler, nothing to do */
2159 		break;
2160 	default:
2161 		dev_err(dev, "unsupported scheduler 0x%x, bailing.\n",
2162 			hv_scheduler_type);
2163 		return -EOPNOTSUPP;
2164 	}
2165 
2166 	return 0;
2167 }
2168 
mshv_root_scheduler_init(unsigned int cpu)2169 static int mshv_root_scheduler_init(unsigned int cpu)
2170 {
2171 	void **inputarg, **outputarg, *p;
2172 
2173 	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
2174 	outputarg = (void **)this_cpu_ptr(root_scheduler_output);
2175 
2176 	/* Allocate two consecutive pages. One for input, one for output. */
2177 	p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL);
2178 	if (!p)
2179 		return -ENOMEM;
2180 
2181 	*inputarg = p;
2182 	*outputarg = (char *)p + HV_HYP_PAGE_SIZE;
2183 
2184 	return 0;
2185 }
2186 
mshv_root_scheduler_cleanup(unsigned int cpu)2187 static int mshv_root_scheduler_cleanup(unsigned int cpu)
2188 {
2189 	void *p, **inputarg, **outputarg;
2190 
2191 	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
2192 	outputarg = (void **)this_cpu_ptr(root_scheduler_output);
2193 
2194 	p = *inputarg;
2195 
2196 	*inputarg = NULL;
2197 	*outputarg = NULL;
2198 
2199 	kfree(p);
2200 
2201 	return 0;
2202 }
2203 
2204 /* Must be called after retrieving the scheduler type */
2205 static int
root_scheduler_init(struct device * dev)2206 root_scheduler_init(struct device *dev)
2207 {
2208 	int ret;
2209 
2210 	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
2211 		return 0;
2212 
2213 	root_scheduler_input = alloc_percpu(void *);
2214 	root_scheduler_output = alloc_percpu(void *);
2215 
2216 	if (!root_scheduler_input || !root_scheduler_output) {
2217 		dev_err(dev, "Failed to allocate root scheduler buffers\n");
2218 		ret = -ENOMEM;
2219 		goto out;
2220 	}
2221 
2222 	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched",
2223 				mshv_root_scheduler_init,
2224 				mshv_root_scheduler_cleanup);
2225 
2226 	if (ret < 0) {
2227 		dev_err(dev, "Failed to setup root scheduler state: %i\n", ret);
2228 		goto out;
2229 	}
2230 
2231 	mshv_root_sched_online = ret;
2232 
2233 	return 0;
2234 
2235 out:
2236 	free_percpu(root_scheduler_input);
2237 	free_percpu(root_scheduler_output);
2238 	return ret;
2239 }
2240 
2241 static void
root_scheduler_deinit(void)2242 root_scheduler_deinit(void)
2243 {
2244 	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
2245 		return;
2246 
2247 	cpuhp_remove_state(mshv_root_sched_online);
2248 	free_percpu(root_scheduler_input);
2249 	free_percpu(root_scheduler_output);
2250 }
2251 
/*
 * Reboot notifier: tear down the synic cpuhp state (disabling the synic
 * on each CPU) before the machine reboots.
 */
static int mshv_reboot_notify(struct notifier_block *nb,
			      unsigned long code, void *unused)
{
	cpuhp_remove_state(mshv_cpuhp_online);
	return 0;
}
2258 
/*
 * NOTE(review): not declared static — presumably only used in this file;
 * if no header declares it extern, making it static would silence sparse.
 */
struct notifier_block mshv_reboot_nb = {
	.notifier_call = mshv_reboot_notify,
};
2262 
/* Undo mshv_root_partition_init(): drop the reboot notifier. */
static void mshv_root_partition_exit(void)
{
	unregister_reboot_notifier(&mshv_reboot_nb);
}
2267 
/*
 * Root-partition-only setup: register the reboot notifier so the synic
 * state is removed on reboot. The dev argument is currently unused.
 */
static int __init mshv_root_partition_init(struct device *dev)
{
	return register_reboot_notifier(&mshv_reboot_nb);
}
2272 
/*
 * Query the partition's VMM capabilities and cache them in mshv_root.
 *
 * The failure is only fatal on L1VH partitions; on other partition types
 * an error is ignored and the zero-initialized caps are kept — presumably
 * because the property may not exist there (TODO confirm).
 */
static int __init mshv_init_vmm_caps(struct device *dev)
{
	int ret;

	ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
						HV_PARTITION_PROPERTY_VMM_CAPABILITIES,
						0, &mshv_root.vmm_caps,
						sizeof(mshv_root.vmm_caps));
	if (ret && hv_l1vh_partition()) {
		dev_err(dev, "Failed to get VMM capabilities: %d\n", ret);
		return ret;
	}

	dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]);

	return 0;
}
2290 
/*
 * Module init: register /dev/mshv and set up the global state shared by
 * all partitions (per-cpu synic pages, scheduler buffers, debugfs, the
 * irqfd workqueue), then install the mshv interrupt handler.
 *
 * Returns 0 on success, negative errno on failure (everything set up so
 * far is unwound via the goto chain at the bottom).
 */
static int __init mshv_parent_partition_init(void)
{
	int ret;
	struct device *dev;
	union hv_hypervisor_version_info version_info;

	/* Only meaningful on a parent partition; never in a kdump kernel */
	if (!hv_parent_partition() || is_kdump_kernel())
		return -ENODEV;

	if (hv_get_hypervisor_version(&version_info))
		return -ENODEV;

	ret = misc_register(&mshv_dev);
	if (ret)
		return ret;

	dev = mshv_dev.this_device;

	/* An out-of-range hypervisor version only warns; init continues */
	if (version_info.build_number < MSHV_HV_MIN_VERSION ||
	    version_info.build_number > MSHV_HV_MAX_VERSION) {
		dev_err(dev, "Running on unvalidated Hyper-V version\n");
		dev_err(dev, "Versions: current: %u  min: %u  max: %u\n",
			version_info.build_number, MSHV_HV_MIN_VERSION,
			MSHV_HV_MAX_VERSION);
	}

	mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages);
	if (!mshv_root.synic_pages) {
		dev_err(dev, "Failed to allocate percpu synic page\n");
		ret = -ENOMEM;
		goto device_deregister;
	}

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
				mshv_synic_init,
				mshv_synic_cleanup);
	if (ret < 0) {
		dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret);
		goto free_synic_pages;
	}

	/* Dynamic cpuhp id, needed later for cpuhp_remove_state() */
	mshv_cpuhp_online = ret;

	ret = mshv_init_vmm_caps(dev);
	if (ret)
		goto remove_cpu_state;

	ret = mshv_retrieve_scheduler_type(dev);
	if (ret)
		goto remove_cpu_state;

	/* ret is known to be 0 here when this is not a root partition */
	if (hv_root_partition())
		ret = mshv_root_partition_init(dev);
	if (ret)
		goto remove_cpu_state;

	ret = root_scheduler_init(dev);
	if (ret)
		goto exit_partition;

	ret = mshv_debugfs_init();
	if (ret)
		goto deinit_root_scheduler;

	ret = mshv_irqfd_wq_init();
	if (ret)
		goto exit_debugfs;

	spin_lock_init(&mshv_root.pt_ht_lock);
	hash_init(mshv_root.pt_htable);

	/* Last step: start receiving hypervisor messages via mshv_isr */
	hv_setup_mshv_handler(mshv_isr);

	return 0;

exit_debugfs:
	mshv_debugfs_exit();
deinit_root_scheduler:
	root_scheduler_deinit();
exit_partition:
	if (hv_root_partition())
		mshv_root_partition_exit();
remove_cpu_state:
	cpuhp_remove_state(mshv_cpuhp_online);
free_synic_pages:
	free_percpu(mshv_root.synic_pages);
device_deregister:
	misc_deregister(&mshv_dev);
	return ret;
}
2381 
/*
 * Module exit: detach the interrupt handler first so no new hypervisor
 * messages arrive, then tear down the state built in
 * mshv_parent_partition_init().
 */
static void __exit mshv_parent_partition_exit(void)
{
	hv_setup_mshv_handler(NULL);
	mshv_port_table_fini();
	mshv_debugfs_exit();
	misc_deregister(&mshv_dev);
	mshv_irqfd_wq_cleanup();
	root_scheduler_deinit();
	if (hv_root_partition())
		mshv_root_partition_exit();
	cpuhp_remove_state(mshv_cpuhp_online);
	free_percpu(mshv_root.synic_pages);
}
2395 
/* Module entry/exit; init bails with -ENODEV on non-parent partitions */
module_init(mshv_parent_partition_init);
module_exit(mshv_parent_partition_exit);
2398