// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024, Microsoft Corporation.
 *
 * The main part of the mshv_root module, providing APIs to create
 * and manage guest partitions.
 *
 * Authors: Microsoft Linux virtualization team
 */

#include <linux/entry-virt.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/cpuhotplug.h>
#include <linux/random.h>
#include <asm/mshyperv.h>
#include <linux/hyperv.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/kexec.h>
#include <linux/page-flags.h>
#include <linux/crash_dump.h>
#include <linux/panic_notifier.h>
#include <linux/vmalloc.h>
#include <linux/rseq.h>

#include "mshv_eventfd.h"
#include "mshv.h"
#include "mshv_root.h"

MODULE_AUTHOR("Microsoft");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");

/* HV_THREAD_COUNTER */
#if defined(CONFIG_X86_64)
#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 202
#elif defined(CONFIG_ARM64)
#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 95
#endif

struct mshv_root mshv_root;

enum hv_scheduler_type hv_scheduler_type;

/* Once the fast extended hypercall ABI is implemented, these can go away. */
static void * __percpu *root_scheduler_input;
static void * __percpu *root_scheduler_output;

static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_dev_open(struct inode *inode, struct file *filp);
static int mshv_dev_release(struct inode *inode, struct file *filp);
static int mshv_vp_release(struct inode *inode, struct file *filp);
static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_partition_release(struct inode *inode, struct file *filp);
static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma);
static vm_fault_t mshv_vp_fault(struct vm_fault *vmf);
static int mshv_init_async_handler(struct mshv_partition *partition);
static void mshv_async_hvcall_handler(void *data, u64 *status);

static const union hv_input_vtl input_vtl_zero;
static const union hv_input_vtl input_vtl_normal = {
	.target_vtl = HV_NORMAL_VTL,
	.use_target_vtl = 1,
};

static const struct vm_operations_struct mshv_vp_vm_ops = {
	.fault = mshv_vp_fault,
};

static const struct file_operations mshv_vp_fops = {
	.owner = THIS_MODULE,
	.release = mshv_vp_release,
	.unlocked_ioctl = mshv_vp_ioctl,
	.llseek = noop_llseek,
	.mmap = mshv_vp_mmap,
};

static const struct file_operations mshv_partition_fops = {
	.owner = THIS_MODULE,
	.release = mshv_partition_release,
	.unlocked_ioctl = mshv_partition_ioctl,
	.llseek = noop_llseek,
};

static const struct file_operations mshv_dev_fops = {
	.owner = THIS_MODULE,
	.open = mshv_dev_open,
	.release = mshv_dev_release,
	.unlocked_ioctl = mshv_dev_ioctl,
	.llseek = noop_llseek,
};

static struct miscdevice mshv_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "mshv",
	.fops = &mshv_dev_fops,
	.mode = 0600,
};

/*
 * Only allow hypercalls that have a u64 partition id as the first member of
 * the input structure.
 * These are sorted by value.
 */
static u16 mshv_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_GET_PARTITION_PROPERTY_EX,
	HVCALL_SET_PARTITION_PROPERTY,
	HVCALL_INSTALL_INTERCEPT,
	HVCALL_GET_VP_REGISTERS,
	HVCALL_SET_VP_REGISTERS,
	HVCALL_TRANSLATE_VIRTUAL_ADDRESS,
	HVCALL_CLEAR_VIRTUAL_INTERRUPT,
	HVCALL_REGISTER_INTERCEPT_RESULT,
	HVCALL_ASSERT_VIRTUAL_INTERRUPT,
	HVCALL_GET_GPA_PAGES_ACCESS_STATES,
	HVCALL_SIGNAL_EVENT_DIRECT,
	HVCALL_POST_MESSAGE_DIRECT,
	HVCALL_GET_VP_CPUID_VALUES,
};

/*
 * Only allow hypercalls that are safe to be called by the VMM with the host
 * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a
 * hypercall cannot be misused by the VMM before adding it to this list.
 */
static u16 mshv_self_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_GET_PARTITION_PROPERTY_EX,
};

static bool mshv_hvcall_is_async(u16 code)
{
	switch (code) {
	case HVCALL_SET_PARTITION_PROPERTY:
		return true;
	default:
		break;
	}
	return false;
}

static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id)
{
	int i;
	int n = ARRAY_SIZE(mshv_passthru_hvcalls);
	u16 *allowed_hvcalls = mshv_passthru_hvcalls;

	if (pt_id == HV_PARTITION_ID_SELF) {
		n = ARRAY_SIZE(mshv_self_passthru_hvcalls);
		allowed_hvcalls = mshv_self_passthru_hvcalls;
	}

	for (i = 0; i < n; ++i)
		if (allowed_hvcalls[i] == code)
			return true;

	return false;
}

static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
				      bool partition_locked,
				      void __user *user_args)
{
	u64 status;
	int ret = 0;
	bool is_async;
	struct mshv_root_hvcall args;
	struct page *page;
	unsigned int pages_order;
	void *input_pg = NULL;
	void *output_pg = NULL;
	u16 reps_completed;
	u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) ||
	    mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE)
		return -EINVAL;

	if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
		return -EINVAL;

	if (!mshv_passthru_hvcall_allowed(args.code, pt_id))
		return -EINVAL;

	is_async = mshv_hvcall_is_async(args.code);
	if (is_async) {
		/* async hypercalls can only be called from partition fd */
		if (!partition || !partition_locked)
			return -EINVAL;
		ret = mshv_init_async_handler(partition);
		if (ret)
			return ret;
	}

	pages_order = args.out_ptr ? 1 : 0;
	page = alloc_pages(GFP_KERNEL, pages_order);
	if (!page)
		return -ENOMEM;
	input_pg = page_address(page);

	if (args.out_ptr)
		output_pg = (char *)input_pg + PAGE_SIZE;
	else
		output_pg = NULL;

	if (copy_from_user(input_pg, (void __user *)args.in_ptr,
			   args.in_sz)) {
		ret = -EFAULT;
		goto free_pages_out;
	}

	/*
	 * NOTE: This only works because all the allowed hypercalls' input
	 * structs begin with a u64 partition_id field.
	 */
	*(u64 *)input_pg = pt_id;

	reps_completed = 0;
	do {
		if (args.reps) {
			status = hv_do_rep_hypercall_ex(args.code, args.reps,
							0, reps_completed,
							input_pg, output_pg);
			reps_completed = hv_repcomp(status);
		} else {
			status = hv_do_hypercall(args.code, input_pg, output_pg);
		}

		if (hv_result(status) == HV_STATUS_CALL_PENDING) {
			if (is_async) {
				mshv_async_hvcall_handler(partition, &status);
			} else { /* Paranoia check. This shouldn't happen! */
				ret = -EBADFD;
				goto free_pages_out;
			}
		}

		if (hv_result_success(status))
			break;

		if (!hv_result_needs_memory(status))
			ret = hv_result_to_errno(status);
		else
			ret = hv_deposit_memory(pt_id, status);
	} while (!ret);

	args.status = hv_result(status);
	args.reps = reps_completed;
	if (copy_to_user(user_args, &args, sizeof(args)))
		ret = -EFAULT;

	if (!ret && output_pg &&
	    copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
		ret = -EFAULT;

free_pages_out:
	free_pages((unsigned long)input_pg, pages_order);

	return ret;
}
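
/*
 * Example: a minimal userspace sketch of driving the passthru path above
 * via MSHV_ROOT_HVCALL. The header path, fd plumbing and the exact input
 * layout of the hypercall are assumptions for illustration; the struct
 * fields mirror struct mshv_root_hvcall as used in this function.
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/mshv.h>		// hypothetical UAPI header path
 *
 *	__u64 in[2] = { 0, property_code };	// first u64 (partition id)
 *						// is overwritten above
 *	__u64 out;
 *	struct mshv_root_hvcall args = {
 *		.code = HVCALL_GET_PARTITION_PROPERTY,
 *		.in_ptr = (__u64)in,
 *		.in_sz = sizeof(in),
 *		.out_ptr = (__u64)&out,
 *		.out_sz = sizeof(out),
 *	};
 *
 *	ret = ioctl(partition_fd, MSHV_ROOT_HVCALL, &args);
 *	// On return, args.status holds the raw HV status; for rep
 *	// hypercalls, args.reps holds the completed rep count.
 */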

static inline bool is_ghcb_mapping_available(void)
{
#if IS_ENABLED(CONFIG_X86_64)
	return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE;
#else
	return 0;
#endif
}

static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_get_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}

static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_set_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}

/*
 * Explicit guest vCPU suspend is asynchronous by nature (it is requested by
 * a dom0 vCPU on behalf of a guest vCPU) and thus it can race with
 * "intercept" suspend, done by the hypervisor.
 * "Intercept" suspend leads to an asynchronous message delivery to dom0,
 * which should be awaited to keep the VP loop consistent (i.e. no message
 * pending upon VP resume).
 * VP intercept suspend can't happen when the VP is already explicitly
 * suspended, so there are only two possible race scenarios:
 *   1. intercept suspend bit set -> explicit suspend bit set -> message sent
 *   2. intercept suspend bit set -> message sent -> explicit suspend bit set
 * Checking for the intercept suspend bit after the explicit suspend request
 * has succeeded covers either case, and lets us reliably identify whether
 * there is a message to receive and deliver to the VMM.
 */
static int
mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
{
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND
	};
	struct hv_register_assoc intercept_suspend = {
		.name = HV_REGISTER_INTERCEPT_SUSPEND
	};
	union hv_explicit_suspend_register *es =
		&explicit_suspend.value.explicit_suspend;
	union hv_intercept_suspend_register *is =
		&intercept_suspend.value.intercept_suspend;
	int ret;

	es->suspended = 1;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &explicit_suspend);
	if (ret) {
		vp_err(vp, "Failed to explicitly suspend vCPU\n");
		return ret;
	}

	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &intercept_suspend);
	if (ret) {
		vp_err(vp, "Failed to get intercept suspend state\n");
		return ret;
	}

	*message_in_flight = is->suspended;

	return 0;
}
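
/*
 * Illustration of the two orderings from the comment above (time flows
 * left to right; "hv" is the hypervisor, "dom0" is mshv_suspend_vp()):
 *
 *   1) hv: set intercept suspend -> dom0: set explicit suspend -> hv: send msg
 *   2) hv: set intercept suspend -> hv: send msg -> dom0: set explicit suspend
 *
 * In both orderings the read of HV_REGISTER_INTERCEPT_SUSPEND that follows
 * the successful explicit suspend observes the bit set, so the caller
 * learns that an intercept message is (or will be) in flight.
 */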

/*
 * This function is used when VPs are scheduled by the hypervisor's
 * scheduler.
 *
 * Caller has to make sure the registers contain cleared
 * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers
 * exactly in this order (the hypervisor clears them sequentially), to avoid
 * invalidly clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND after the
 * VP has been released from HV_REGISTER_EXPLICIT_SUSPEND, which the opposite
 * order would allow.
 */
static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
{
	long ret;
	struct hv_register_assoc suspend_regs[2] = {
			{ .name = HV_REGISTER_INTERCEPT_SUSPEND },
			{ .name = HV_REGISTER_EXPLICIT_SUSPEND }
	};
	size_t count = ARRAY_SIZE(suspend_regs);

	/* Resume VP execution */
	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    count, suspend_regs);
	if (ret) {
		vp_err(vp, "Failed to resume vp execution. %lx\n", ret);
		return ret;
	}

	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       vp->run.kicked_by_hv == 1);
	if (ret) {
		bool message_in_flight;

		/*
		 * The wait was interrupted by a signal: suspend the vCPU
		 * explicitly and pick up the message in flight (if any).
		 */
		ret = mshv_suspend_vp(vp, &message_in_flight);
		if (ret)
			return ret;

		/* Return if no message in flight */
		if (!message_in_flight)
			return -EINTR;

		/* Wait for the message in flight. */
		wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1);
	}

	/*
	 * Reset the flag to make the wait_event call above work
	 * next time.
	 */
	vp->run.kicked_by_hv = 0;

	return 0;
}

static int
mshv_vp_dispatch(struct mshv_vp *vp, u32 flags,
		 struct hv_output_dispatch_vp *res)
{
	struct hv_input_dispatch_vp *input;
	struct hv_output_dispatch_vp *output;
	u64 status;

	preempt_disable();
	input = *this_cpu_ptr(root_scheduler_input);
	output = *this_cpu_ptr(root_scheduler_output);

	memset(input, 0, sizeof(*input));
	memset(output, 0, sizeof(*output));

	input->partition_id = vp->vp_partition->pt_id;
	input->vp_index = vp->vp_index;
	input->time_slice = 0; /* Run forever until something happens */
	input->spec_ctrl = 0; /* TODO: set sensible flags */
	input->flags = flags;

	vp->run.flags.root_sched_dispatched = 1;
	status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output);
	vp->run.flags.root_sched_dispatched = 0;

	*res = *output;
	preempt_enable();

	if (!hv_result_success(status))
		vp_err(vp, "%s: status %s\n", __func__,
		       hv_result_to_string(status));

	return hv_result_to_errno(status);
}

static int
mshv_vp_clear_explicit_suspend(struct mshv_vp *vp)
{
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND,
		.value.explicit_suspend.suspended = 0,
	};
	int ret;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &explicit_suspend);

	if (ret)
		vp_err(vp, "Failed to unsuspend\n");

	return ret;
}

#if IS_ENABLED(CONFIG_X86_64)
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	if (!vp->vp_register_page)
		return 0;
	return vp->vp_register_page->interrupt_vectors.as_uint64;
}
#else
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	return 0;
}
#endif

static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp)
{
	struct hv_stats_page **stats = vp->vp_stats_pages;
	u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->data;
	u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->data;

	return parent_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED] ||
	       self_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED];
}

static int
mshv_vp_wait_for_hv_kick(struct mshv_vp *vp)
{
	int ret;

	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       (vp->run.kicked_by_hv == 1 &&
					!mshv_vp_dispatch_thread_blocked(vp)) ||
				       mshv_vp_interrupt_pending(vp));
	if (ret)
		return -EINTR;

	vp->run.flags.root_sched_blocked = 0;
	vp->run.kicked_by_hv = 0;

	return 0;
}

/* Must be called with interrupts enabled */
static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
{
	long ret;

	if (vp->run.flags.root_sched_blocked) {
		/*
		 * Dispatch state of this VP is blocked. Need to wait
		 * for the hypervisor to clear the blocked state before
		 * dispatching it.
		 */
		ret = mshv_vp_wait_for_hv_kick(vp);
		if (ret)
			return ret;
	}

	do {
		u32 flags = 0;
		struct hv_output_dispatch_vp output;

		if (__xfer_to_guest_mode_work_pending()) {
			ret = xfer_to_guest_mode_handle_work();
			if (ret)
				break;
		}

		if (vp->run.flags.intercept_suspend)
			flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND;

		if (mshv_vp_interrupt_pending(vp))
			flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION;

		ret = mshv_vp_dispatch(vp, flags, &output);
		if (ret)
			break;

		vp->run.flags.intercept_suspend = 0;

		if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) {
			if (output.dispatch_event ==
						HV_VP_DISPATCH_EVENT_SUSPEND) {
				/*
				 * TODO: remove the warning once VP canceling
				 *	 is supported
				 */
				WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count),
					  "%s: vp#%d: unexpected explicit suspend\n",
					  __func__, vp->vp_index);
				/*
				 * Need to clear explicit suspend before
				 * dispatching.
				 * Explicit suspend is either:
				 * - set right after the first VP dispatch or
				 * - set explicitly via hypercall
				 * Since the latter case is not yet supported,
				 * simply clear it here.
				 */
				ret = mshv_vp_clear_explicit_suspend(vp);
				if (ret)
					break;

				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			} else {
				vp->run.flags.root_sched_blocked = 1;
				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			}
		} else {
			/* HV_VP_DISPATCH_STATE_READY */
			if (output.dispatch_event ==
						HV_VP_DISPATCH_EVENT_INTERCEPT)
				vp->run.flags.intercept_suspend = 1;
		}
	} while (!vp->run.flags.intercept_suspend);

	rseq_virt_userspace_exit();

	return ret;
}
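
/*
 * Informal summary of the dispatch loop above, derived from the code:
 *
 *	loop until an intercept must be delivered to userspace:
 *		handle pending work (signals, resched, ...)
 *		HVCALL_DISPATCH_VP (clearing intercept suspend and/or
 *				    scanning interrupts as requested)
 *		if the VP came back BLOCKED:
 *			clear a stray explicit suspend if one was signaled,
 *			then sleep until the hypervisor kicks this thread
 *		else (READY):
 *			an INTERCEPT dispatch event sets intercept_suspend,
 *			terminating the loop so the message can be returned
 */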

static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
	      "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");

static struct mshv_mem_region *
mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
{
	struct mshv_mem_region *region;

	hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
		if (gfn >= region->start_gfn &&
		    gfn < region->start_gfn + region->nr_pages)
			return region;
	}

	return NULL;
}

static struct mshv_mem_region *
mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
{
	struct mshv_mem_region *region;

	spin_lock(&p->pt_mem_regions_lock);
	region = mshv_partition_region_by_gfn(p, gfn);
	if (!region || !mshv_region_get(region)) {
		spin_unlock(&p->pt_mem_regions_lock);
		return NULL;
	}
	spin_unlock(&p->pt_mem_regions_lock);

	return region;
}

/**
 * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts.
 * @vp: Pointer to the virtual processor structure.
 *
 * This function processes GPA intercepts by identifying the memory region
 * corresponding to the intercepted GPA, aligning the page offset, and
 * mapping the required pages. It ensures that the region is valid and
 * handles faults efficiently by mapping multiple pages at once.
 *
 * Return: true if the intercept was handled successfully, false otherwise.
 */
static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
{
	struct mshv_partition *p = vp->vp_partition;
	struct mshv_mem_region *region;
	bool ret;
	u64 gfn;
#if defined(CONFIG_X86_64)
	struct hv_x64_memory_intercept_message *msg =
		(struct hv_x64_memory_intercept_message *)
		vp->vp_intercept_msg_page->u.payload;
#elif defined(CONFIG_ARM64)
	struct hv_arm64_memory_intercept_message *msg =
		(struct hv_arm64_memory_intercept_message *)
		vp->vp_intercept_msg_page->u.payload;
#endif

	gfn = HVPFN_DOWN(msg->guest_physical_address);

	region = mshv_partition_region_by_gfn_get(p, gfn);
	if (!region)
		return false;

	/* Only movable memory ranges are supported for GPA intercepts */
	if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE)
		ret = mshv_region_handle_gfn_fault(region, gfn);
	else
		ret = false;

	mshv_region_put(region);

	return ret;
}

static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
{
	switch (vp->vp_intercept_msg_page->header.message_type) {
	case HVMSG_GPA_INTERCEPT:
		return mshv_handle_gpa_intercept(vp);
	}
	return false;
}

static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
{
	long rc;

	do {
		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
			rc = mshv_run_vp_with_root_scheduler(vp);
		else
			rc = mshv_run_vp_with_hyp_scheduler(vp);
	} while (rc == 0 && mshv_vp_handle_intercept(vp));

	if (rc)
		return rc;

	if (copy_to_user(ret_msg, vp->vp_intercept_msg_page,
			 sizeof(struct hv_message)))
		rc = -EFAULT;

	return rc;
}
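
/*
 * Example: the userspace run loop this ioctl implies, as a hedged sketch
 * (header path and error handling are assumptions for illustration):
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/mshv.h>		// hypothetical UAPI header path
 *
 *	struct hv_message msg = {};
 *
 *	for (;;) {
 *		if (ioctl(vp_fd, MSHV_RUN_VP, &msg) < 0)
 *			break;		// e.g. interrupted by a signal
 *		// msg holds the intercept that suspended the VP; decode
 *		// msg.header.message_type, emulate, then loop to resume.
 *	}
 */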

static int
mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp,
				struct hv_vp_state_data state_data,
				unsigned long user_pfn, size_t page_count,
				bool is_set)
{
	int completed, ret = 0;
	unsigned long check;
	struct page **pages;

	if (page_count > INT_MAX)
		return -EINVAL;
	/*
	 * Check the arithmetic for wraparound/overflow.
	 * The last page address in the buffer is:
	 * (user_pfn + (page_count - 1)) * PAGE_SIZE
	 */
	if (check_add_overflow(user_pfn, (page_count - 1), &check))
		return -EOVERFLOW;
	if (check_mul_overflow(check, PAGE_SIZE, &check))
		return -EOVERFLOW;
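
	/*
	 * As a concrete (hypothetical) instance of what these checks catch:
	 * with 4K pages, user_pfn = 0xffffffffffffff and page_count = 2
	 * passes the addition above (yielding 2^56) but overflows the
	 * 64-bit multiplication by PAGE_SIZE (2^56 * 2^12 = 2^68), so such
	 * a buffer could never be addressed and we bail out.
	 */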

	/* Pin user pages so the hypervisor can copy directly to them */
	pages = kzalloc_objs(struct page *, page_count);
	if (!pages)
		return -ENOMEM;

	for (completed = 0; completed < page_count; completed += ret) {
		unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE;
		int remaining = page_count - completed;

		ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE,
					  &pages[completed]);
		if (ret < 0) {
			vp_err(vp, "%s: Failed to pin user pages error %i\n",
			       __func__, ret);
			goto unpin_pages;
		}
	}

	if (is_set)
		ret = hv_call_set_vp_state(vp->vp_index,
					   vp->vp_partition->pt_id,
					   state_data, page_count, pages,
					   0, NULL);
	else
		ret = hv_call_get_vp_state(vp->vp_index,
					   vp->vp_partition->pt_id,
					   state_data, page_count, pages,
					   NULL);

unpin_pages:
	unpin_user_pages(pages, completed);
	kfree(pages);
	return ret;
}

static long
mshv_vp_ioctl_get_set_state(struct mshv_vp *vp,
			    struct mshv_get_set_vp_state __user *user_args,
			    bool is_set)
{
	struct mshv_get_set_vp_state args;
	long ret = 0;
	union hv_output_get_vp_state vp_state;
	u32 data_sz;
	struct hv_vp_state_data state_data = {};

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) ||
	    !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) ||
	    !PAGE_ALIGNED(args.buf_ptr))
		return -EINVAL;

	if (!access_ok((void __user *)args.buf_ptr, args.buf_sz))
		return -EFAULT;

	switch (args.type) {
	case MSHV_VP_STATE_LAPIC:
		state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_XSAVE:
	{
		u64 data_sz_64;

		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_XSAVE_STATES,
						     &state_data.xsave.states.as_uint64);
		if (ret)
			return ret;

		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE,
						     &data_sz_64);
		if (ret)
			return ret;

		data_sz = (u32)data_sz_64;
		state_data.xsave.flags = 0;
		/* Always request legacy states */
		state_data.xsave.states.legacy_x87 = 1;
		state_data.xsave.states.legacy_sse = 1;
		state_data.type = HV_GET_SET_VP_STATE_XSAVE;
		break;
	}
	case MSHV_VP_STATE_SIMP:
		state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SIEFP:
		state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SYNTHETIC_TIMERS:
		state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS;
		data_sz = sizeof(vp_state.synthetic_timers_state);
		break;
	default:
		return -EINVAL;
	}

	if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz)))
		return -EFAULT;

	if (data_sz > args.buf_sz)
		return -EINVAL;

	/* If the data is transmitted via pfns, delegate to the helper */
	if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) {
		unsigned long user_pfn = PFN_DOWN(args.buf_ptr);
		size_t page_count = PFN_DOWN(args.buf_sz);

		return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn,
						       page_count, is_set);
	}

	/* Paranoia check - this shouldn't happen! */
	if (data_sz > sizeof(vp_state)) {
		vp_err(vp, "Invalid vp state data size!\n");
		return -EINVAL;
	}

	if (is_set) {
		if (copy_from_user(&vp_state, (void __user *)args.buf_ptr, data_sz))
			return -EFAULT;

		return hv_call_set_vp_state(vp->vp_index,
					    vp->vp_partition->pt_id,
					    state_data, 0, NULL,
					    sizeof(vp_state), (u8 *)&vp_state);
	}

	ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id,
				   state_data, 0, NULL, &vp_state);
	if (ret)
		return ret;

	if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz))
		return -EFAULT;

	return 0;
}

static long
mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct mshv_vp *vp = filp->private_data;
	long r = -ENOTTY;

	if (mutex_lock_killable(&vp->vp_mutex))
		return -EINTR;

	switch (ioctl) {
	case MSHV_RUN_VP:
		r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg);
		break;
	case MSHV_GET_VP_STATE:
		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false);
		break;
	case MSHV_SET_VP_STATE:
		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true);
		break;
	case MSHV_ROOT_HVCALL:
		r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false,
					       (void __user *)arg);
		break;
	default:
		vp_warn(vp, "Invalid ioctl: %#x\n", ioctl);
		break;
	}
	mutex_unlock(&vp->vp_mutex);

	return r;
}

static vm_fault_t mshv_vp_fault(struct vm_fault *vmf)
{
	struct mshv_vp *vp = vmf->vma->vm_file->private_data;

	switch (vmf->vma->vm_pgoff) {
	case MSHV_VP_MMAP_OFFSET_REGISTERS:
		vmf->page = virt_to_page(vp->vp_register_page);
		break;
	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
		vmf->page = virt_to_page(vp->vp_intercept_msg_page);
		break;
	case MSHV_VP_MMAP_OFFSET_GHCB:
		vmf->page = virt_to_page(vp->vp_ghcb_page);
		break;
	default:
		return VM_FAULT_SIGBUS;
	}

	get_page(vmf->page);

	return 0;
}

static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct mshv_vp *vp = file->private_data;

	switch (vma->vm_pgoff) {
	case MSHV_VP_MMAP_OFFSET_REGISTERS:
		if (!vp->vp_register_page)
			return -ENODEV;
		break;
	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
		if (!vp->vp_intercept_msg_page)
			return -ENODEV;
		break;
	case MSHV_VP_MMAP_OFFSET_GHCB:
		if (!vp->vp_ghcb_page)
			return -ENODEV;
		break;
	default:
		return -EINVAL;
	}

	vma->vm_ops = &mshv_vp_vm_ops;
	return 0;
}

static int
mshv_vp_release(struct inode *inode, struct file *filp)
{
	struct mshv_vp *vp = filp->private_data;

	/* Rest of VP cleanup happens in destroy_partition() */
	mshv_partition_put(vp->vp_partition);
	return 0;
}

void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
			 struct hv_stats_page *stats_pages[])
{
	union hv_stats_object_identity identity = {
		.vp.partition_id = partition_id,
		.vp.vp_index = vp_index,
	};
	int err;

	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
				  stats_pages[HV_STATS_AREA_SELF],
				  &identity);
	if (err)
		pr_err("%s: failed to unmap partition %llu vp %u self stats, err: %d\n",
		       __func__, partition_id, vp_index, err);

	if (stats_pages[HV_STATS_AREA_PARENT] != stats_pages[HV_STATS_AREA_SELF]) {
		identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
		err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
					  stats_pages[HV_STATS_AREA_PARENT],
					  &identity);
		if (err)
			pr_err("%s: failed to unmap partition %llu vp %u parent stats, err: %d\n",
			       __func__, partition_id, vp_index, err);
	}
}

int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
		      struct hv_stats_page *stats_pages[])
{
	union hv_stats_object_identity identity = {
		.vp.partition_id = partition_id,
		.vp.vp_index = vp_index,
	};
	int err;

	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
				&stats_pages[HV_STATS_AREA_SELF]);
	if (err) {
		pr_err("%s: failed to map partition %llu vp %u self stats, err: %d\n",
		       __func__, partition_id, vp_index, err);
		return err;
	}
	/*
	 * An L1VH partition cannot access its VP stats in the parent area.
	 */
	if (is_l1vh_parent(partition_id)) {
		stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
	} else {
		identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
		err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
					&stats_pages[HV_STATS_AREA_PARENT]);
		if (err) {
			pr_err("%s: failed to map partition %llu vp %u parent stats, err: %d\n",
			       __func__, partition_id, vp_index, err);
			goto unmap_self;
		}
		if (!stats_pages[HV_STATS_AREA_PARENT])
			stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
	}

	return 0;

unmap_self:
	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	hv_unmap_stats_page(HV_STATS_OBJECT_VP,
			    stats_pages[HV_STATS_AREA_SELF],
			    &identity);
	return err;
}

static long
mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
			       void __user *arg)
{
	struct mshv_create_vp args;
	struct mshv_vp *vp;
	struct page *intercept_msg_page, *register_page, *ghcb_page;
	struct hv_stats_page *stats_pages[2];
	long ret;

	if (copy_from_user(&args, arg, sizeof(args)))
		return -EFAULT;

	if (args.vp_index >= MSHV_MAX_VPS)
		return -EINVAL;

	if (partition->pt_vp_array[args.vp_index])
		return -EEXIST;

	ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index,
				0 /* Only valid for root partition VPs */);
	if (ret)
		return ret;

	ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
				   HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
				   input_vtl_zero, &intercept_msg_page);
	if (ret)
		goto destroy_vp;

	if (!mshv_partition_encrypted(partition)) {
		ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
					   HV_VP_STATE_PAGE_REGISTERS,
					   input_vtl_zero, &register_page);
		if (ret)
			goto unmap_intercept_message_page;
	}

	if (mshv_partition_encrypted(partition) &&
	    is_ghcb_mapping_available()) {
		ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
					   HV_VP_STATE_PAGE_GHCB,
					   input_vtl_normal, &ghcb_page);
		if (ret)
			goto unmap_register_page;
	}

	ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
				stats_pages);
	if (ret)
		goto unmap_ghcb_page;

	vp = kzalloc_obj(*vp);
	if (!vp)
		goto unmap_stats_pages;

	vp->vp_partition = mshv_partition_get(partition);
	if (!vp->vp_partition) {
		ret = -EBADF;
		goto free_vp;
	}

	mutex_init(&vp->vp_mutex);
	init_waitqueue_head(&vp->run.vp_suspend_queue);
	atomic64_set(&vp->run.vp_signaled_count, 0);

	vp->vp_index = args.vp_index;
	vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page);
	if (!mshv_partition_encrypted(partition))
		vp->vp_register_page = page_to_virt(register_page);

	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
		vp->vp_ghcb_page = page_to_virt(ghcb_page);

	memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));

	ret = mshv_debugfs_vp_create(vp);
	if (ret)
		goto put_partition;

	/*
	 * Keep anon_inode_getfd last: it installs the fd in the file struct
	 * and thus makes the state accessible from user space.
	 */
	ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp,
			       O_RDWR | O_CLOEXEC);
	if (ret < 0)
		goto remove_debugfs_vp;

	/* No extra locking needed: the partition mutex is held for all ioctls */
	partition->pt_vp_count++;
	partition->pt_vp_array[args.vp_index] = vp;

	return ret;

remove_debugfs_vp:
	mshv_debugfs_vp_remove(vp);
put_partition:
	mshv_partition_put(partition);
free_vp:
	kfree(vp);
unmap_stats_pages:
	mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages);
unmap_ghcb_page:
	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
		hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
				       HV_VP_STATE_PAGE_GHCB, ghcb_page,
				       input_vtl_normal);
unmap_register_page:
	if (!mshv_partition_encrypted(partition))
		hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
				       HV_VP_STATE_PAGE_REGISTERS,
				       register_page, input_vtl_zero);
unmap_intercept_message_page:
	hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
			       HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
			       intercept_msg_page, input_vtl_zero);
destroy_vp:
	hv_call_delete_vp(partition->pt_id, args.vp_index);
	return ret;
}
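
/*
 * Example: minimal userspace VP creation against a partition fd, assuming
 * the usual ioctl() plumbing (a sketch, not the full VMM flow):
 *
 *	struct mshv_create_vp args = { .vp_index = 0 };
 *	int vp_fd = ioctl(partition_fd, MSHV_CREATE_VP, &args);
 *
 * On success the returned fd accepts MSHV_RUN_VP, MSHV_GET_VP_STATE,
 * MSHV_SET_VP_STATE and MSHV_ROOT_HVCALL, plus the mmap() offsets
 * handled by mshv_vp_mmap().
 */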

static int mshv_init_async_handler(struct mshv_partition *partition)
{
	if (completion_done(&partition->async_hypercall)) {
		pt_err(partition,
		       "Cannot issue async hypercall while another one in progress!\n");
		return -EPERM;
	}

	reinit_completion(&partition->async_hypercall);
	return 0;
}

static void mshv_async_hvcall_handler(void *data, u64 *status)
{
	struct mshv_partition *partition = data;

	wait_for_completion(&partition->async_hypercall);
	pt_dbg(partition, "Async hypercall completed!\n");

	*status = partition->async_hypercall_status;
}

/*
 * NB: caller checks and makes sure mem->size is page aligned
 * Returns: 0 with regionpp updated on success, or -errno
 */
static int mshv_partition_create_region(struct mshv_partition *partition,
					struct mshv_user_mem_region *mem,
					struct mshv_mem_region **regionpp,
					bool is_mmio)
{
	struct mshv_mem_region *rg;
	u64 nr_pages = HVPFN_DOWN(mem->size);

	/* Reject overlapping regions */
	spin_lock(&partition->pt_mem_regions_lock);
	hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) {
		if (mem->guest_pfn + nr_pages <= rg->start_gfn ||
		    rg->start_gfn + rg->nr_pages <= mem->guest_pfn)
			continue;
		spin_unlock(&partition->pt_mem_regions_lock);
		return -EEXIST;
	}
	spin_unlock(&partition->pt_mem_regions_lock);

	rg = mshv_region_create(mem->guest_pfn, nr_pages,
				mem->userspace_addr, mem->flags);
	if (IS_ERR(rg))
		return PTR_ERR(rg);

	if (is_mmio)
		rg->mreg_type = MSHV_REGION_TYPE_MMIO;
	else if (mshv_partition_encrypted(partition) ||
		 !mshv_region_movable_init(rg))
		rg->mreg_type = MSHV_REGION_TYPE_MEM_PINNED;
	else
		rg->mreg_type = MSHV_REGION_TYPE_MEM_MOVABLE;

	rg->partition = partition;

	*regionpp = rg;

	return 0;
}

/**
 * mshv_prepare_pinned_region - Pin and map memory regions
 * @region: Pointer to the memory region structure
 *
 * This function processes memory regions that are explicitly marked as pinned.
 * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based
 * population. The function ensures the region is properly populated, handles
 * encryption requirements for SNP partitions if applicable, maps the region,
 * and performs necessary sharing or eviction operations based on the mapping
 * result.
 *
 * Return: 0 on success, negative error code on failure.
 */
static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
{
	struct mshv_partition *partition = region->partition;
	int ret;

	ret = mshv_region_pin(region);
	if (ret) {
		pt_err(partition, "Failed to pin memory region: %d\n",
		       ret);
		goto err_out;
	}

	/*
	 * For an SNP partition, host access must be released for every
	 * memory region we are going to map into the partition. This is
	 * ensured by an additional hypercall that updates the SLAT to
	 * release host access to those guest memory regions.
	 */
	if (mshv_partition_encrypted(partition)) {
		ret = mshv_region_unshare(region);
		if (ret) {
			pt_err(partition,
			       "Failed to unshare memory region (guest_pfn: %llu): %d\n",
			       region->start_gfn, ret);
			goto invalidate_region;
		}
	}

	ret = mshv_region_map(region);
	if (ret && mshv_partition_encrypted(partition)) {
		int shrc;

		shrc = mshv_region_share(region);
		if (!shrc)
			goto invalidate_region;

		pt_err(partition,
		       "Failed to share memory region (guest_pfn: %llu): %d\n",
		       region->start_gfn, shrc);
		/*
		 * Don't unpin if marking the region shared failed: the pages
		 * are no longer mapped into the host (i.e. root) partition.
		 */
		goto err_out;
	}

	return 0;

invalidate_region:
	mshv_region_invalidate(region);
err_out:
	return ret;
}

/*
 * This maps two things: guest RAM, and MMIO space for PCI passthrough.
 *
 * mmio:
 *  - vfio overloads vm_pgoff to store the mmio start pfn/spa.
 *  - Two things need to happen to map an mmio range:
 *	1. mapped in the uaddr so the VMM can access it.
 *	2. mapped in the hwpt (gfn <-> mmio phys addr) so the guest can access it.
 *
 *   This function takes care of the second. The first is managed by vfio,
 *   and hence is taken care of via vfio_pci_mmap_fault().
 */
static long
mshv_map_user_memory(struct mshv_partition *partition,
		     struct mshv_user_mem_region *mem)
{
	struct mshv_mem_region *region;
	struct vm_area_struct *vma;
	bool is_mmio;
	ulong mmio_pfn;
	long ret;

	if (mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
	    !access_ok((const void __user *)mem->userspace_addr, mem->size))
		return -EINVAL;

	mmap_read_lock(current->mm);
	vma = vma_lookup(current->mm, mem->userspace_addr);
	is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
	mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
	mmap_read_unlock(current->mm);

	if (!vma)
		return -EINVAL;

	ret = mshv_partition_create_region(partition, mem, &region,
					   is_mmio);
	if (ret)
		return ret;

	switch (region->mreg_type) {
	case MSHV_REGION_TYPE_MEM_PINNED:
		ret = mshv_prepare_pinned_region(region);
		break;
	case MSHV_REGION_TYPE_MEM_MOVABLE:
		/*
		 * For movable memory regions, remap with no access to let
		 * the hypervisor track dirty pages, enabling pre-copy live
		 * migration.
		 */
		ret = hv_call_map_gpa_pages(partition->pt_id,
					    region->start_gfn,
					    region->nr_pages,
					    HV_MAP_GPA_NO_ACCESS, NULL);
		break;
	case MSHV_REGION_TYPE_MMIO:
		ret = hv_call_map_mmio_pages(partition->pt_id,
					     region->start_gfn,
					     mmio_pfn,
					     region->nr_pages);
		break;
	}

	if (ret)
		goto errout;

	spin_lock(&partition->pt_mem_regions_lock);
	hlist_add_head(&region->hnode, &partition->pt_mem_regions);
	spin_unlock(&partition->pt_mem_regions_lock);

	return 0;

errout:
	mshv_region_put(region);
	return ret;
}

/* Called for unmapping both the guest ram and the mmio space */
static long
mshv_unmap_user_memory(struct mshv_partition *partition,
		       struct mshv_user_mem_region *mem)
{
	struct mshv_mem_region *region;

	if (!(mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
		return -EINVAL;

	spin_lock(&partition->pt_mem_regions_lock);

	region = mshv_partition_region_by_gfn(partition, mem->guest_pfn);
	if (!region) {
		spin_unlock(&partition->pt_mem_regions_lock);
		return -ENOENT;
	}

	/* Paranoia check */
	if (region->start_uaddr != mem->userspace_addr ||
	    region->start_gfn != mem->guest_pfn ||
	    region->nr_pages != HVPFN_DOWN(mem->size)) {
		spin_unlock(&partition->pt_mem_regions_lock);
		return -EINVAL;
	}

	hlist_del(&region->hnode);

	spin_unlock(&partition->pt_mem_regions_lock);

	mshv_region_put(region);

	return 0;
}

static long
mshv_partition_ioctl_set_memory(struct mshv_partition *partition,
				struct mshv_user_mem_region __user *user_mem)
{
	struct mshv_user_mem_region mem;

	if (copy_from_user(&mem, user_mem, sizeof(mem)))
		return -EFAULT;

	if (!mem.size ||
	    !PAGE_ALIGNED(mem.size) ||
	    !PAGE_ALIGNED(mem.userspace_addr) ||
	    (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) ||
	    mshv_field_nonzero(mem, rsvd))
		return -EINVAL;

	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))
		return mshv_unmap_user_memory(partition, &mem);

	return mshv_map_user_memory(partition, &mem);
}
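
/*
 * Example: userspace mapping and unmapping of guest RAM, as a sketch
 * (the backing must be a page-aligned VMA, e.g. an anonymous mmap; any
 * flag bit other than MSHV_SET_MEM_BIT_UNMAP is left out here):
 *
 *	struct mshv_user_mem_region mem = {
 *		.size = sz,				// page aligned
 *		.guest_pfn = gpa >> 12,			// 4K guest frames
 *		.userspace_addr = (__u64)uaddr,		// page aligned
 *	};
 *	ioctl(partition_fd, MSHV_SET_GUEST_MEMORY, &mem);	// map
 *
 *	mem.flags = 1 << MSHV_SET_MEM_BIT_UNMAP;
 *	ioctl(partition_fd, MSHV_SET_GUEST_MEMORY, &mem);	// unmap
 */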

static long
mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition,
			       void __user *user_args)
{
	struct mshv_user_ioeventfd args;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	return mshv_set_unset_ioeventfd(partition, &args);
}

static long
mshv_partition_ioctl_irqfd(struct mshv_partition *partition,
			   void __user *user_args)
{
	struct mshv_user_irqfd args;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	return mshv_set_unset_irqfd(partition, &args);
}

static long
mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition,
					    void __user *user_args)
{
	struct mshv_gpap_access_bitmap args;
	union hv_gpa_page_access_state *states;
	long ret, i;
	union hv_gpa_page_access_state_flags hv_flags = {};
	u8 hv_type_mask;
	ulong bitmap_buf_sz, states_buf_sz;
	int written = 0;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT ||
	    args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT ||
	    mshv_field_nonzero(args, rsvd) || !args.page_count ||
	    !args.bitmap_ptr)
		return -EINVAL;

	if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz))
		return -E2BIG;

	/* Num bytes needed to store bitmap; one bit per page rounded up */
	bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8);

	/* Sanity check */
	if (bitmap_buf_sz > states_buf_sz)
		return -EBADFD;

	switch (args.access_type) {
	case MSHV_GPAP_ACCESS_TYPE_ACCESSED:
		hv_type_mask = 1;
		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
			hv_flags.clear_accessed = 1;
			/* not accessed implies not dirty */
			hv_flags.clear_dirty = 1;
		} else { /* MSHV_GPAP_ACCESS_OP_SET */
			hv_flags.set_accessed = 1;
		}
		break;
	case MSHV_GPAP_ACCESS_TYPE_DIRTY:
		hv_type_mask = 2;
		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
			hv_flags.clear_dirty = 1;
		} else { /* MSHV_GPAP_ACCESS_OP_SET */
			hv_flags.set_dirty = 1;
			/* dirty implies accessed */
			hv_flags.set_accessed = 1;
		}
		break;
	}

	states = vzalloc(states_buf_sz);
	if (!states)
		return -ENOMEM;

	ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count,
					    args.gpap_base, hv_flags, &written,
					    states);
	if (ret)
		goto free_return;

	/*
	 * Overwrite states buffer with bitmap - the bits in hv_type_mask
	 * correspond to bitfields in hv_gpa_page_access_state
	 */
	for (i = 0; i < written; ++i)
		__assign_bit(i, (ulong *)states,
			     states[i].as_uint8 & hv_type_mask);

	/* zero the unused bits in the last byte(s) of the returned bitmap */
	for (i = written; i < bitmap_buf_sz * 8; ++i)
		__clear_bit(i, (ulong *)states);

	if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz))
		ret = -EFAULT;

free_return:
	vfree(states);
	return ret;
}
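
/*
 * Example: how a VMM might read-and-clear the dirty bitmap for a range of
 * guest pages, sketched with hypothetical variables:
 *
 *	struct mshv_gpap_access_bitmap args = {
 *		.access_type = MSHV_GPAP_ACCESS_TYPE_DIRTY,
 *		.access_op = MSHV_GPAP_ACCESS_OP_CLEAR,
 *		.gpap_base = first_gfn,
 *		.page_count = nr_pages,
 *		.bitmap_ptr = (__u64)bitmap,	// nr_pages bits, rounded
 *						// up to whole bytes
 *	};
 *
 *	ioctl(partition_fd, MSHV_GET_GPAP_ACCESS_BITMAP, &args);
 *	// On success, bit i of bitmap is set iff page (first_gfn + i)
 *	// was dirty; the dirty state has now been cleared.
 */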

static long
mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition,
				     void __user *user_args)
{
	struct mshv_user_irq_entry *entries = NULL;
	struct mshv_user_irq_table args;
	long ret;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.nr > MSHV_MAX_GUEST_IRQS ||
	    mshv_field_nonzero(args, rsvd))
		return -EINVAL;

	if (args.nr) {
		struct mshv_user_irq_table __user *urouting = user_args;

		entries = vmemdup_user(urouting->entries,
				       array_size(sizeof(*entries),
						  args.nr));
		if (IS_ERR(entries))
			return PTR_ERR(entries);
	}
	ret = mshv_update_routing_table(partition, entries, args.nr);
	kvfree(entries);

	return ret;
}

static long
mshv_partition_ioctl_initialize(struct mshv_partition *partition)
{
	long ret;

	if (partition->pt_initialized)
		return 0;

	ret = hv_call_initialize_partition(partition->pt_id);
	if (ret)
		goto withdraw_mem;

	ret = mshv_debugfs_partition_create(partition);
	if (ret)
		goto finalize_partition;

	partition->pt_initialized = true;

	return 0;

finalize_partition:
	hv_call_finalize_partition(partition->pt_id);
withdraw_mem:
	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);

	return ret;
}

static long
mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct mshv_partition *partition = filp->private_data;
	long ret;
	void __user *uarg = (void __user *)arg;

	if (mutex_lock_killable(&partition->pt_mutex))
		return -EINTR;

	switch (ioctl) {
	case MSHV_INITIALIZE_PARTITION:
		ret = mshv_partition_ioctl_initialize(partition);
		break;
	case MSHV_SET_GUEST_MEMORY:
		ret = mshv_partition_ioctl_set_memory(partition, uarg);
		break;
	case MSHV_CREATE_VP:
		ret = mshv_partition_ioctl_create_vp(partition, uarg);
		break;
	case MSHV_IRQFD:
		ret = mshv_partition_ioctl_irqfd(partition, uarg);
		break;
	case MSHV_IOEVENTFD:
		ret = mshv_partition_ioctl_ioeventfd(partition, uarg);
		break;
	case MSHV_SET_MSI_ROUTING:
		ret = mshv_partition_ioctl_set_msi_routing(partition, uarg);
		break;
	case MSHV_GET_GPAP_ACCESS_BITMAP:
		ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition,
								  uarg);
		break;
	case MSHV_ROOT_HVCALL:
		ret = mshv_ioctl_passthru_hvcall(partition, true, uarg);
		break;
	default:
		ret = -ENOTTY;
	}

	mutex_unlock(&partition->pt_mutex);
	return ret;
}

static int
disable_vp_dispatch(struct mshv_vp *vp)
{
	int ret;
	struct hv_register_assoc dispatch_suspend = {
		.name = HV_REGISTER_DISPATCH_SUSPEND,
		.value.dispatch_suspend.suspended = 1,
	};

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &dispatch_suspend);
	if (ret)
		vp_err(vp, "failed to suspend\n");

	return ret;
}

static int
get_vp_signaled_count(struct mshv_vp *vp, u64 *count)
{
	int ret;
	struct hv_register_assoc root_signal_count = {
		.name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT,
	};

	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &root_signal_count);

	if (ret) {
		vp_err(vp, "Failed to get root signal count");
		*count = 0;
		return ret;
	}

	*count = root_signal_count.value.reg64;

	return ret;
}

static void
drain_vp_signals(struct mshv_vp *vp)
{
	u64 hv_signal_count;
	u64 vp_signal_count;

	get_vp_signaled_count(vp, &hv_signal_count);

	vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);

	/*
	 * There should be at most 1 outstanding notification, but be extra
	 * careful anyway.
	 */
	while (hv_signal_count != vp_signal_count) {
		WARN_ON(hv_signal_count - vp_signal_count != 1);

		if (wait_event_interruptible(vp->run.vp_suspend_queue,
					     vp->run.kicked_by_hv == 1))
			break;
		vp->run.kicked_by_hv = 0;
		vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
	}
}

static void drain_all_vps(const struct mshv_partition *partition)
{
	int i;
	struct mshv_vp *vp;

	/*
	 * VPs are reachable from ISR. It is safe to not take the partition
	 * lock because nobody else can enter this function and drop the
	 * partition from the list.
	 */
	for (i = 0; i < MSHV_MAX_VPS; i++) {
		vp = partition->pt_vp_array[i];
		if (!vp)
			continue;
		/*
		 * Disable dispatching of the VP in the hypervisor. After this
		 * the hypervisor guarantees it won't generate any signals for
		 * the VP and the hypervisor's VP signal count won't change.
		 */
		disable_vp_dispatch(vp);
		drain_vp_signals(vp);
	}
}

static void
remove_partition(struct mshv_partition *partition)
{
	spin_lock(&mshv_root.pt_ht_lock);
	hlist_del_rcu(&partition->pt_hnode);
	spin_unlock(&mshv_root.pt_ht_lock);

	synchronize_rcu();
}

/*
 * Tear down a partition and remove it from the list.
 * Partition's refcount must be 0
 */
static void destroy_partition(struct mshv_partition *partition)
{
	struct mshv_vp *vp;
	struct mshv_mem_region *region;
	struct hlist_node *n;
	int i;

	if (refcount_read(&partition->pt_ref_count)) {
		pt_err(partition,
		       "Attempt to destroy partition but refcount > 0\n");
		return;
	}

	if (partition->pt_initialized) {
		/*
		 * Signals only need to be drained for the root scheduler,
		 * and this must happen before removing the partition from
		 * the partition list.
		 */
1741 		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
1742 			drain_all_vps(partition);
1743 
1744 		/* Remove vps */
1745 		for (i = 0; i < MSHV_MAX_VPS; ++i) {
1746 			vp = partition->pt_vp_array[i];
1747 			if (!vp)
1748 				continue;
1749 
1750 			mshv_debugfs_vp_remove(vp);
1751 			mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
1752 					    vp->vp_stats_pages);
1753 
1754 			if (vp->vp_register_page) {
1755 				(void)hv_unmap_vp_state_page(partition->pt_id,
1756 							     vp->vp_index,
1757 							     HV_VP_STATE_PAGE_REGISTERS,
1758 							     virt_to_page(vp->vp_register_page),
1759 							     input_vtl_zero);
1760 				vp->vp_register_page = NULL;
1761 			}
1762 
1763 			(void)hv_unmap_vp_state_page(partition->pt_id,
1764 						     vp->vp_index,
1765 						     HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
1766 						     virt_to_page(vp->vp_intercept_msg_page),
1767 						     input_vtl_zero);
1768 			vp->vp_intercept_msg_page = NULL;
1769 
1770 			if (vp->vp_ghcb_page) {
1771 				(void)hv_unmap_vp_state_page(partition->pt_id,
1772 							     vp->vp_index,
1773 							     HV_VP_STATE_PAGE_GHCB,
1774 							     virt_to_page(vp->vp_ghcb_page),
1775 							     input_vtl_normal);
1776 				vp->vp_ghcb_page = NULL;
1777 			}
1778 
1779 			kfree(vp);
1780 
1781 			partition->pt_vp_array[i] = NULL;
1782 		}
1783 
1784 		mshv_debugfs_partition_remove(partition);
1785 
1786 		/* Deallocates and unmaps everything including VCPUs, GPA mappings, etc. */
1787 		hv_call_finalize_partition(partition->pt_id);
1788 
1789 		partition->pt_initialized = false;
1790 	}
1791 
1792 	remove_partition(partition);
1793 
1794 	hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
1795 				  hnode) {
1796 		hlist_del(&region->hnode);
1797 		mshv_region_put(region);
1798 	}
1799 
1800 	/* Withdraw and free all pages we deposited */
1801 	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
1802 	hv_call_delete_partition(partition->pt_id);
1803 
1804 	mshv_free_routing_table(partition);
1805 	kfree(partition);
1806 }
1807 
1808 struct
1809 mshv_partition *mshv_partition_get(struct mshv_partition *partition)
1810 {
1811 	if (refcount_inc_not_zero(&partition->pt_ref_count))
1812 		return partition;
1813 	return NULL;
1814 }
1815 
1816 struct
1817 mshv_partition *mshv_partition_find(u64 partition_id)
1818 	__must_hold(RCU)
1819 {
1820 	struct mshv_partition *p;
1821 
1822 	hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode,
1823 				   partition_id)
1824 		if (p->pt_id == partition_id)
1825 			return p;
1826 
1827 	return NULL;
1828 }
1829 
1830 void
1831 mshv_partition_put(struct mshv_partition *partition)
1832 {
1833 	if (refcount_dec_and_test(&partition->pt_ref_count))
1834 		destroy_partition(partition);
1835 }
1836 
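/*
 * Illustrative lookup pattern (a sketch, not code from this file): resolving
 * a partition id to a reference that outlives the RCU read side. The
 * reference is valid only if mshv_partition_get() saw a non-zero refcount.
 *
 *	rcu_read_lock();
 *	partition = mshv_partition_find(pt_id);
 *	if (partition)
 *		partition = mshv_partition_get(partition);
 *	rcu_read_unlock();
 *	if (partition) {
 *		...
 *		mshv_partition_put(partition);
 *	}
 */
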
1837 static int
1838 mshv_partition_release(struct inode *inode, struct file *filp)
1839 {
1840 	struct mshv_partition *partition = filp->private_data;
1841 
1842 	mshv_eventfd_release(partition);
1843 
1844 	cleanup_srcu_struct(&partition->pt_irq_srcu);
1845 
1846 	mshv_partition_put(partition);
1847 
1848 	return 0;
1849 }
1850 
1851 static int
1852 add_partition(struct mshv_partition *partition)
1853 {
1854 	spin_lock(&mshv_root.pt_ht_lock);
1855 
1856 	hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode,
1857 		     partition->pt_id);
1858 
1859 	spin_unlock(&mshv_root.pt_ht_lock);
1860 
1861 	return 0;
1862 }
1863 
1864 static_assert(MSHV_NUM_CPU_FEATURES_BANKS ==
1865 	      HV_PARTITION_PROCESSOR_FEATURES_BANKS);
1866 
1867 static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags,
1868 					struct hv_partition_creation_properties *cr_props,
1869 					union hv_partition_isolation_properties *isol_props)
1870 {
1871 	int i;
1872 	struct mshv_create_partition_v2 args;
1873 	union hv_partition_processor_features *disabled_procs;
1874 	union hv_partition_processor_xsave_features *disabled_xsave;
1875 
1876 	/* First, copy the v1 struct in case the user is on a previous version */
1877 	if (copy_from_user(&args, user_arg,
1878 			   sizeof(struct mshv_create_partition)))
1879 		return -EFAULT;
1880 
1881 	if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
1882 	    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
1883 		return -EINVAL;
1884 
1885 	disabled_procs = &cr_props->disabled_processor_features;
1886 	disabled_xsave = &cr_props->disabled_processor_xsave_features;
1887 
1888 	/* Check if user provided newer struct with feature fields */
1889 	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) {
1890 		if (copy_from_user(&args, user_arg, sizeof(args)))
1891 			return -EFAULT;
1892 
1893 		/* Re-validate v1 fields after second copy_from_user() */
1894 		if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
1895 		    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
1896 			return -EINVAL;
1897 
1898 		if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS ||
1899 		    mshv_field_nonzero(args, pt_rsvd) ||
1900 		    mshv_field_nonzero(args, pt_rsvd1))
1901 			return -EINVAL;
1902 
1903 		/*
1904 		 * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never
1905 		 * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS
1906 		 * (i.e. 2).
1907 		 *
1908 		 * Further banks (index >= 2) will be modifiable as 'early'
1909 		 * properties via the set partition property hypercall.
1910 		 */
1911 		for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
1912 			disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i];
1913 
1914 #if IS_ENABLED(CONFIG_X86_64)
1915 		disabled_xsave->as_uint64 = args.pt_disabled_xsave;
1916 #else
1917 		/*
1918 		 * In practice this field is ignored on arm64, but it is safer
1919 		 * to zero it in case it is ever used.
1920 		 */
1921 		disabled_xsave->as_uint64 = 0;
1922 
1923 		if (mshv_field_nonzero(args, pt_rsvd2))
1924 			return -EINVAL;
1925 #endif
1926 	} else {
1927 		/*
1928 		 * v1 behavior: try to enable everything. The hypervisor will
1929 		 * disable features that are not supported. The banks can be
1930 		 * queried via the get partition property hypercall.
1931 		 */
1932 		for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
1933 			disabled_procs->as_uint64[i] = 0;
1934 
1935 		disabled_xsave->as_uint64 = 0;
1936 	}
1937 
1938 	/* Only support EXO partitions */
1939 	*pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
1940 		    HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;
1941 
1942 	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC))
1943 		*pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
1944 	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC))
1945 		*pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
1946 	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES))
1947 		*pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
1948 	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_NESTED_VIRTUALIZATION))
1949 		*pt_flags |= HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE;
1950 	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_SMT_ENABLED_GUEST))
1951 		*pt_flags |= HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST;
1952 
1953 	isol_props->as_uint64 = 0;
1954 
1955 	switch (args.pt_isolation) {
1956 	case MSHV_PT_ISOLATION_NONE:
1957 		isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE;
1958 		break;
1959 	}
1960 
1961 	return 0;
1962 }
1963 
1964 static long
1965 mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
1966 {
1967 	u64 creation_flags;
1968 	struct hv_partition_creation_properties creation_properties;
1969 	union hv_partition_isolation_properties isolation_properties;
1970 	struct mshv_partition *partition;
1971 	long ret;
1972 
1973 	ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags,
1974 					  &creation_properties,
1975 					  &isolation_properties);
1976 	if (ret)
1977 		return ret;
1978 
1979 	partition = kzalloc_obj(*partition);
1980 	if (!partition)
1981 		return -ENOMEM;
1982 
1983 	partition->pt_module_dev = module_dev;
1984 	partition->isolation_type = isolation_properties.isolation_type;
1985 
1986 	refcount_set(&partition->pt_ref_count, 1);
1987 
1988 	mutex_init(&partition->pt_mutex);
1989 
1990 	mutex_init(&partition->pt_irq_lock);
1991 
1992 	init_completion(&partition->async_hypercall);
1993 
1994 	INIT_HLIST_HEAD(&partition->irq_ack_notifier_list);
1995 
1996 	INIT_HLIST_HEAD(&partition->pt_devices);
1997 
1998 	spin_lock_init(&partition->pt_mem_regions_lock);
1999 	INIT_HLIST_HEAD(&partition->pt_mem_regions);
2000 
2001 	mshv_eventfd_init(partition);
2002 
2003 	ret = init_srcu_struct(&partition->pt_irq_srcu);
2004 	if (ret)
2005 		goto free_partition;
2006 
2007 	ret = hv_call_create_partition(creation_flags,
2008 				       creation_properties,
2009 				       isolation_properties,
2010 				       &partition->pt_id);
2011 	if (ret)
2012 		goto cleanup_irq_srcu;
2013 
2014 	ret = add_partition(partition);
2015 	if (ret)
2016 		goto delete_partition;
2017 
2018 	ret = mshv_init_async_handler(partition);
2019 	if (!ret) {
2020 		ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition",
2021 							   &mshv_partition_fops,
2022 							   partition, O_RDWR));
2023 		if (ret >= 0)
2024 			return ret;
2025 	}
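	/*
	 * Error paths unwind in reverse order of setup; remove_partition()
	 * also waits out concurrent RCU readers via synchronize_rcu().
	 */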
2026 	remove_partition(partition);
2027 delete_partition:
2028 	hv_call_delete_partition(partition->pt_id);
2029 cleanup_irq_srcu:
2030 	cleanup_srcu_struct(&partition->pt_irq_srcu);
2031 free_partition:
2032 	kfree(partition);
2033 
2034 	return ret;
2035 }
2036 
2037 static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
2038 			   unsigned long arg)
2039 {
2040 	struct miscdevice *misc = filp->private_data;
2041 
2042 	switch (ioctl) {
2043 	case MSHV_CREATE_PARTITION:
2044 		return mshv_ioctl_create_partition((void __user *)arg,
2045 						misc->this_device);
2046 	case MSHV_ROOT_HVCALL:
2047 		return mshv_ioctl_passthru_hvcall(NULL, false,
2048 					(void __user *)arg);
2049 	}
2050 
2051 	return -ENOTTY;
2052 }
2053 
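/*
 * Illustrative userspace usage (a sketch, not part of this module; error
 * handling omitted): the ioctls above are reached through the /dev/mshv
 * misc device registered in mshv_parent_partition_init() below.
 *
 *	int mshv = open("/dev/mshv", O_RDWR | O_CLOEXEC);
 *	struct mshv_create_partition args = { 0 };
 *	int pt_fd = ioctl(mshv, MSHV_CREATE_PARTITION, &args);
 *
 * pt_fd is then an anon-inode fd served by mshv_partition_fops.
 */
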
2054 static int
2055 mshv_dev_open(struct inode *inode, struct file *filp)
2056 {
2057 	return 0;
2058 }
2059 
2060 static int
2061 mshv_dev_release(struct inode *inode, struct file *filp)
2062 {
2063 	return 0;
2064 }
2065 
2066 static int mshv_root_sched_online;
2067 
2068 static const char *scheduler_type_to_string(enum hv_scheduler_type type)
2069 {
2070 	switch (type) {
2071 	case HV_SCHEDULER_TYPE_LP:
2072 		return "classic scheduler without SMT";
2073 	case HV_SCHEDULER_TYPE_LP_SMT:
2074 		return "classic scheduler with SMT";
2075 	case HV_SCHEDULER_TYPE_CORE_SMT:
2076 		return "core scheduler";
2077 	case HV_SCHEDULER_TYPE_ROOT:
2078 		return "root scheduler";
2079 	default:
2080 		return "unknown scheduler";
2081 	}
2082 }
2083 
2084 static int __init l1vh_retrieve_scheduler_type(enum hv_scheduler_type *out)
2085 {
2086 	u64 integrated_sched_enabled;
2087 	int ret;
2088 
2089 	*out = HV_SCHEDULER_TYPE_CORE_SMT;
2090 
2091 	if (!mshv_root.vmm_caps.vmm_enable_integrated_scheduler)
2092 		return 0;
2093 
2094 	ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
2095 						HV_PARTITION_PROPERTY_INTEGRATED_SCHEDULER_ENABLED,
2096 						0, &integrated_sched_enabled,
2097 						sizeof(integrated_sched_enabled));
2098 	if (ret)
2099 		return ret;
2100 
2101 	if (integrated_sched_enabled)
2102 		*out = HV_SCHEDULER_TYPE_ROOT;
2103 
2104 	return 0;
2105 }
2106 
2107 /* TODO move this to hv_common.c when needed outside */
2108 static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out)
2109 {
2110 	struct hv_input_get_system_property *input;
2111 	struct hv_output_get_system_property *output;
2112 	unsigned long flags;
2113 	u64 status;
2114 
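	/*
	 * Keep interrupts off while the shared per-cpu hypercall input and
	 * output pages are in use, so nothing else on this CPU can reuse
	 * them mid-hypercall.
	 */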
2115 	local_irq_save(flags);
2116 	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
2117 	output = *this_cpu_ptr(hyperv_pcpu_output_arg);
2118 
2119 	memset(input, 0, sizeof(*input));
2120 	memset(output, 0, sizeof(*output));
2121 	input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE;
2122 
2123 	status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
2124 	if (!hv_result_success(status)) {
2125 		local_irq_restore(flags);
2126 		pr_err("%s: %s\n", __func__, hv_result_to_string(status));
2127 		return hv_result_to_errno(status);
2128 	}
2129 
2130 	*out = output->scheduler_type;
2131 	local_irq_restore(flags);
2132 
2133 	return 0;
2134 }
2135 
2136 /* Retrieve and stash the supported scheduler type */
2137 static int __init mshv_retrieve_scheduler_type(struct device *dev)
2138 {
2139 	int ret;
2140 
2141 	if (hv_l1vh_partition())
2142 		ret = l1vh_retrieve_scheduler_type(&hv_scheduler_type);
2143 	else
2144 		ret = hv_retrieve_scheduler_type(&hv_scheduler_type);
2145 	if (ret)
2146 		return ret;
2147 
2148 	dev_info(dev, "Hypervisor using %s\n",
2149 		 scheduler_type_to_string(hv_scheduler_type));
2150 
2151 	switch (hv_scheduler_type) {
2152 	case HV_SCHEDULER_TYPE_CORE_SMT:
2153 	case HV_SCHEDULER_TYPE_LP_SMT:
2154 	case HV_SCHEDULER_TYPE_ROOT:
2155 	case HV_SCHEDULER_TYPE_LP:
2156 		/* Supported scheduler, nothing to do */
2157 		break;
2158 	default:
2159 		dev_err(dev, "unsupported scheduler 0x%x, bailing.\n",
2160 			hv_scheduler_type);
2161 		return -EOPNOTSUPP;
2162 	}
2163 
2164 	return 0;
2165 }
2166 
2167 static int mshv_root_scheduler_init(unsigned int cpu)
2168 {
2169 	void **inputarg, **outputarg, *p;
2170 
2171 	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
2172 	outputarg = (void **)this_cpu_ptr(root_scheduler_output);
2173 
2174 	/* Allocate two consecutive pages. One for input, one for output. */
2175 	p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL);
2176 	if (!p)
2177 		return -ENOMEM;
2178 
2179 	*inputarg = p;
2180 	*outputarg = (char *)p + HV_HYP_PAGE_SIZE;
2181 
2182 	return 0;
2183 }
2184 
2185 static int mshv_root_scheduler_cleanup(unsigned int cpu)
2186 {
2187 	void *p, **inputarg, **outputarg;
2188 
2189 	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
2190 	outputarg = (void **)this_cpu_ptr(root_scheduler_output);
2191 
2192 	p = *inputarg;
2193 
2194 	*inputarg = NULL;
2195 	*outputarg = NULL;
2196 
2197 	kfree(p);
2198 
2199 	return 0;
2200 }
2201 
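/*
 * The two helpers above are registered as CPU hotplug callbacks in
 * root_scheduler_init() below: the init callback runs as each CPU comes
 * online and the cleanup callback as it goes offline, so every online CPU
 * owns a private input/output page pair for root scheduler hypercalls.
 */
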
2202 /* Must be called after retrieving the scheduler type */
2203 static int
2204 root_scheduler_init(struct device *dev)
2205 {
2206 	int ret;
2207 
2208 	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
2209 		return 0;
2210 
2211 	root_scheduler_input = alloc_percpu(void *);
2212 	root_scheduler_output = alloc_percpu(void *);
2213 
2214 	if (!root_scheduler_input || !root_scheduler_output) {
2215 		dev_err(dev, "Failed to allocate root scheduler buffers\n");
2216 		ret = -ENOMEM;
2217 		goto out;
2218 	}
2219 
2220 	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched",
2221 				mshv_root_scheduler_init,
2222 				mshv_root_scheduler_cleanup);
2223 
2224 	if (ret < 0) {
2225 		dev_err(dev, "Failed to set up root scheduler state: %i\n", ret);
2226 		goto out;
2227 	}
2228 
2229 	mshv_root_sched_online = ret;
2230 
2231 	return 0;
2232 
2233 out:
2234 	free_percpu(root_scheduler_input);
2235 	free_percpu(root_scheduler_output);
2236 	return ret;
2237 }
2238 
2239 static void
2240 root_scheduler_deinit(void)
2241 {
2242 	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
2243 		return;
2244 
2245 	cpuhp_remove_state(mshv_root_sched_online);
2246 	free_percpu(root_scheduler_input);
2247 	free_percpu(root_scheduler_output);
2248 }
2249 
2250 static int __init mshv_init_vmm_caps(struct device *dev)
2251 {
2252 	int ret;
2253 
2254 	ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
2255 						HV_PARTITION_PROPERTY_VMM_CAPABILITIES,
2256 						0, &mshv_root.vmm_caps,
2257 						sizeof(mshv_root.vmm_caps));
2258 	if (ret && hv_l1vh_partition()) {
2259 		dev_err(dev, "Failed to get VMM capabilities: %d\n", ret);
2260 		return ret;
2261 	}
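	/*
	 * For a non-L1VH parent a failure here is tolerated (an assumption
	 * about intent: the property may simply be absent, leaving
	 * mshv_root.vmm_caps zeroed and the optional capabilities unused).
	 */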
2262 
2263 	dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]);
2264 
2265 	return 0;
2266 }
2267 
2268 static int __init mshv_parent_partition_init(void)
2269 {
2270 	int ret;
2271 	struct device *dev;
2272 	union hv_hypervisor_version_info version_info;
2273 
2274 	if (!hv_parent_partition() || is_kdump_kernel())
2275 		return -ENODEV;
2276 
2277 	if (hv_get_hypervisor_version(&version_info))
2278 		return -ENODEV;
2279 
2280 	ret = misc_register(&mshv_dev);
2281 	if (ret)
2282 		return ret;
2283 
2284 	dev = mshv_dev.this_device;
2285 
2286 	if (version_info.build_number < MSHV_HV_MIN_VERSION ||
2287 	    version_info.build_number > MSHV_HV_MAX_VERSION) {
2288 		dev_err(dev, "Running on unvalidated Hyper-V version\n");
2289 		dev_err(dev, "Versions: current: %u  min: %u  max: %u\n",
2290 			version_info.build_number, MSHV_HV_MIN_VERSION,
2291 			MSHV_HV_MAX_VERSION);
2292 	}
2293 
2294 	ret = mshv_synic_init(dev);
2295 	if (ret)
2296 		goto device_deregister;
2297 
2298 	ret = mshv_init_vmm_caps(dev);
2299 	if (ret)
2300 		goto synic_cleanup;
2301 
2302 	ret = mshv_retrieve_scheduler_type(dev);
2303 	if (ret)
2304 		goto synic_cleanup;
2305 
2306 	ret = root_scheduler_init(dev);
2307 	if (ret)
2308 		goto synic_cleanup;
2309 
2310 	ret = mshv_debugfs_init();
2311 	if (ret)
2312 		goto deinit_root_scheduler;
2313 
2314 	ret = mshv_irqfd_wq_init();
2315 	if (ret)
2316 		goto exit_debugfs;
2317 
2318 	spin_lock_init(&mshv_root.pt_ht_lock);
2319 	hash_init(mshv_root.pt_htable);
2320 
2321 	hv_setup_mshv_handler(mshv_isr);
2322 
2323 	return 0;
2324 
2325 exit_debugfs:
2326 	mshv_debugfs_exit();
2327 deinit_root_scheduler:
2328 	root_scheduler_deinit();
2329 synic_cleanup:
2330 	mshv_synic_exit();
2331 device_deregister:
2332 	misc_deregister(&mshv_dev);
2333 	return ret;
2334 }
2335 
2336 static void __exit mshv_parent_partition_exit(void)
2337 {
2338 	hv_setup_mshv_handler(NULL);
2339 	mshv_port_table_fini();
2340 	mshv_debugfs_exit();
2341 	misc_deregister(&mshv_dev);
2342 	mshv_irqfd_wq_cleanup();
2343 	root_scheduler_deinit();
2344 	mshv_synic_exit();
2345 }
2346 
2347 module_init(mshv_parent_partition_init);
2348 module_exit(mshv_parent_partition_exit);
2349