xref: /linux/drivers/hv/mshv_root_main.c (revision 221533629550e920580ab428f13ffebf54063b95)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2024, Microsoft Corporation.
4  *
5  * The main part of the mshv_root module, providing APIs to create
6  * and manage guest partitions.
7  *
8  * Authors: Microsoft Linux virtualization team
9  */
10 
11 #include <linux/entry-virt.h>
12 #include <linux/kernel.h>
13 #include <linux/module.h>
14 #include <linux/fs.h>
15 #include <linux/miscdevice.h>
16 #include <linux/slab.h>
17 #include <linux/file.h>
18 #include <linux/anon_inodes.h>
19 #include <linux/mm.h>
20 #include <linux/io.h>
21 #include <linux/cpuhotplug.h>
22 #include <linux/random.h>
23 #include <asm/mshyperv.h>
24 #include <linux/hyperv.h>
25 #include <linux/notifier.h>
26 #include <linux/reboot.h>
27 #include <linux/kexec.h>
28 #include <linux/page-flags.h>
29 #include <linux/crash_dump.h>
30 #include <linux/panic_notifier.h>
31 #include <linux/vmalloc.h>
32 
33 #include "mshv_eventfd.h"
34 #include "mshv.h"
35 #include "mshv_root.h"
36 
37 MODULE_AUTHOR("Microsoft");
38 MODULE_LICENSE("GPL");
39 MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");
40 
41 /* TODO move this to another file when debugfs code is added */
42 enum hv_stats_vp_counters {			/* HV_THREAD_COUNTER */
43 #if defined(CONFIG_X86)
44 	VpRootDispatchThreadBlocked			= 201,
45 #elif defined(CONFIG_ARM64)
46 	VpRootDispatchThreadBlocked			= 94,
47 #endif
48 	VpStatsMaxCounter
49 };
50 
51 struct hv_stats_page {
52 	union {
53 		u64 vp_cntrs[VpStatsMaxCounter];		/* VP counters */
54 		u8 data[HV_HYP_PAGE_SIZE];
55 	};
56 } __packed;
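/*
 * Each VP has two such stats pages mapped by mshv_vp_stats_map(): one for the
 * HV_STATS_AREA_SELF area and one for HV_STATS_AREA_PARENT. The root scheduler
 * path (mshv_vp_dispatch_thread_blocked()) reads VpRootDispatchThreadBlocked
 * from them to decide whether the VP's dispatch thread is currently blocked.
 */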
57 
58 struct mshv_root mshv_root;
59 
60 enum hv_scheduler_type hv_scheduler_type;
61 
62 /* Once we implement the fast extended hypercall ABI they can go away. */
63 static void * __percpu *root_scheduler_input;
64 static void * __percpu *root_scheduler_output;
65 
66 static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
67 static int mshv_dev_open(struct inode *inode, struct file *filp);
68 static int mshv_dev_release(struct inode *inode, struct file *filp);
69 static int mshv_vp_release(struct inode *inode, struct file *filp);
70 static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
71 static int mshv_partition_release(struct inode *inode, struct file *filp);
72 static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
73 static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma);
74 static vm_fault_t mshv_vp_fault(struct vm_fault *vmf);
75 static int mshv_init_async_handler(struct mshv_partition *partition);
76 static void mshv_async_hvcall_handler(void *data, u64 *status);
77 
78 static const union hv_input_vtl input_vtl_zero;
79 static const union hv_input_vtl input_vtl_normal = {
80 	.target_vtl = HV_NORMAL_VTL,
81 	.use_target_vtl = 1,
82 };
83 
84 static const struct vm_operations_struct mshv_vp_vm_ops = {
85 	.fault = mshv_vp_fault,
86 };
87 
88 static const struct file_operations mshv_vp_fops = {
89 	.owner = THIS_MODULE,
90 	.release = mshv_vp_release,
91 	.unlocked_ioctl = mshv_vp_ioctl,
92 	.llseek = noop_llseek,
93 	.mmap = mshv_vp_mmap,
94 };
95 
96 static const struct file_operations mshv_partition_fops = {
97 	.owner = THIS_MODULE,
98 	.release = mshv_partition_release,
99 	.unlocked_ioctl = mshv_partition_ioctl,
100 	.llseek = noop_llseek,
101 };
102 
103 static const struct file_operations mshv_dev_fops = {
104 	.owner = THIS_MODULE,
105 	.open = mshv_dev_open,
106 	.release = mshv_dev_release,
107 	.unlocked_ioctl = mshv_dev_ioctl,
108 	.llseek = noop_llseek,
109 };
110 
111 static struct miscdevice mshv_dev = {
112 	.minor = MISC_DYNAMIC_MINOR,
113 	.name = "mshv",
114 	.fops = &mshv_dev_fops,
115 	.mode = 0600,
116 };
117 
118 /*
119  * Only allow hypercalls that have a u64 partition id as the first member of
120  * the input structure.
121  * These are sorted by value.
122  */
123 static u16 mshv_passthru_hvcalls[] = {
124 	HVCALL_GET_PARTITION_PROPERTY,
125 	HVCALL_SET_PARTITION_PROPERTY,
126 	HVCALL_INSTALL_INTERCEPT,
127 	HVCALL_GET_VP_REGISTERS,
128 	HVCALL_SET_VP_REGISTERS,
129 	HVCALL_TRANSLATE_VIRTUAL_ADDRESS,
130 	HVCALL_CLEAR_VIRTUAL_INTERRUPT,
131 	HVCALL_REGISTER_INTERCEPT_RESULT,
132 	HVCALL_ASSERT_VIRTUAL_INTERRUPT,
133 	HVCALL_GET_GPA_PAGES_ACCESS_STATES,
134 	HVCALL_SIGNAL_EVENT_DIRECT,
135 	HVCALL_POST_MESSAGE_DIRECT,
136 	HVCALL_GET_VP_CPUID_VALUES,
137 };
138 
139 static bool mshv_hvcall_is_async(u16 code)
140 {
141 	switch (code) {
142 	case HVCALL_SET_PARTITION_PROPERTY:
143 		return true;
144 	default:
145 		break;
146 	}
147 	return false;
148 }
149 
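/*
 * Illustrative only: a minimal sketch of how a VMM might drive the passthru
 * path below via the MSHV_ROOT_HVCALL ioctl. The field names match struct
 * mshv_root_hvcall as used in this function; the fd name, buffers and the
 * hypercall code chosen here are hypothetical.
 *
 *	struct mshv_root_hvcall args = {
 *		.code	 = HVCALL_GET_VP_REGISTERS,
 *		.in_sz	 = sizeof(input_buf),
 *		.in_ptr	 = (__u64)&input_buf,
 *		.out_sz	 = sizeof(output_buf),
 *		.out_ptr = (__u64)&output_buf,
 *	};
 *	ret = ioctl(partition_fd, MSHV_ROOT_HVCALL, &args);
 *
 * The input buffer must begin with a u64 partition id, which this function
 * overwrites with the partition's actual id. On return, args.status holds the
 * hypervisor status and, for rep hypercalls, args.reps the completed reps.
 */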
150 static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
151 				      bool partition_locked,
152 				      void __user *user_args)
153 {
154 	u64 status;
155 	int ret = 0, i;
156 	bool is_async;
157 	struct mshv_root_hvcall args;
158 	struct page *page;
159 	unsigned int pages_order;
160 	void *input_pg = NULL;
161 	void *output_pg = NULL;
162 
163 	if (copy_from_user(&args, user_args, sizeof(args)))
164 		return -EFAULT;
165 
166 	if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) ||
167 	    mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE)
168 		return -EINVAL;
169 
170 	if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
171 		return -EINVAL;
172 
173 	for (i = 0; i < ARRAY_SIZE(mshv_passthru_hvcalls); ++i)
174 		if (args.code == mshv_passthru_hvcalls[i])
175 			break;
176 
177 	if (i >= ARRAY_SIZE(mshv_passthru_hvcalls))
178 		return -EINVAL;
179 
180 	is_async = mshv_hvcall_is_async(args.code);
181 	if (is_async) {
182 		/* async hypercalls can only be called from partition fd */
183 		if (!partition_locked)
184 			return -EINVAL;
185 		ret = mshv_init_async_handler(partition);
186 		if (ret)
187 			return ret;
188 	}
189 
190 	pages_order = args.out_ptr ? 1 : 0;
191 	page = alloc_pages(GFP_KERNEL, pages_order);
192 	if (!page)
193 		return -ENOMEM;
194 	input_pg = page_address(page);
195 
196 	if (args.out_ptr)
197 		output_pg = (char *)input_pg + PAGE_SIZE;
198 	else
199 		output_pg = NULL;
200 
201 	if (copy_from_user(input_pg, (void __user *)args.in_ptr,
202 			   args.in_sz)) {
203 		ret = -EFAULT;
204 		goto free_pages_out;
205 	}
206 
207 	/*
208 	 * NOTE: This only works because all the allowed hypercalls' input
209 	 * structs begin with a u64 partition_id field.
210 	 */
211 	*(u64 *)input_pg = partition->pt_id;
212 
213 	if (args.reps)
214 		status = hv_do_rep_hypercall(args.code, args.reps, 0,
215 					     input_pg, output_pg);
216 	else
217 		status = hv_do_hypercall(args.code, input_pg, output_pg);
218 
219 	if (hv_result(status) == HV_STATUS_CALL_PENDING) {
220 		if (is_async) {
221 			mshv_async_hvcall_handler(partition, &status);
222 		} else { /* Paranoia check. This shouldn't happen! */
223 			ret = -EBADFD;
224 			goto free_pages_out;
225 		}
226 	}
227 
228 	if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
229 		ret = hv_call_deposit_pages(NUMA_NO_NODE, partition->pt_id, 1);
230 		if (!ret)
231 			ret = -EAGAIN;
232 	} else if (!hv_result_success(status)) {
233 		ret = hv_result_to_errno(status);
234 	}
235 
236 	/*
237 	 * Always return the status and output data regardless of result.
238 	 * The VMM may need it to determine how to proceed. E.g. the status may
239 	 * contain the number of reps completed if a rep hypercall partially
240 	 * succeeded.
241 	 */
242 	args.status = hv_result(status);
243 	args.reps = args.reps ? hv_repcomp(status) : 0;
244 	if (copy_to_user(user_args, &args, sizeof(args)))
245 		ret = -EFAULT;
246 
247 	if (output_pg &&
248 	    copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
249 		ret = -EFAULT;
250 
251 free_pages_out:
252 	free_pages((unsigned long)input_pg, pages_order);
253 
254 	return ret;
255 }
256 
257 static inline bool is_ghcb_mapping_available(void)
258 {
259 #if IS_ENABLED(CONFIG_X86_64)
260 	return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE;
261 #else
262 	return 0;
263 #endif
264 }
265 
266 static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
267 				 struct hv_register_assoc *registers)
268 {
269 	return hv_call_get_vp_registers(vp_index, partition_id,
270 					count, input_vtl_zero, registers);
271 }
272 
273 static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
274 				 struct hv_register_assoc *registers)
275 {
276 	return hv_call_set_vp_registers(vp_index, partition_id,
277 					count, input_vtl_zero, registers);
278 }
279 
280 /*
281  * An explicit guest vCPU suspend is asynchronous by nature (a dom0 vCPU
282  * requests it on behalf of a guest vCPU), so it can race with an "intercept"
283  * suspend done by the hypervisor.
284  * An "intercept" suspend leads to asynchronous message delivery to dom0,
285  * which must be awaited to keep the VP loop consistent (i.e. no message
286  * pending upon VP resume).
287  * A VP intercept suspend can't be done while the VP is already explicitly
288  * suspended, so there are only two possible race scenarios:
289  *   1. implicit suspend bit set -> explicit suspend bit set -> message sent
290  *   2. implicit suspend bit set -> message sent -> explicit suspend bit set
291  * Checking for the implicit suspend bit after the explicit suspend request
292  * has succeeded allows us, in either case, to reliably identify whether
293  * there is a message to receive and deliver to the VMM.
294  */
295 static int
296 mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
297 {
298 	struct hv_register_assoc explicit_suspend = {
299 		.name = HV_REGISTER_EXPLICIT_SUSPEND
300 	};
301 	struct hv_register_assoc intercept_suspend = {
302 		.name = HV_REGISTER_INTERCEPT_SUSPEND
303 	};
304 	union hv_explicit_suspend_register *es =
305 		&explicit_suspend.value.explicit_suspend;
306 	union hv_intercept_suspend_register *is =
307 		&intercept_suspend.value.intercept_suspend;
308 	int ret;
309 
310 	es->suspended = 1;
311 
312 	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
313 				    1, &explicit_suspend);
314 	if (ret) {
315 		vp_err(vp, "Failed to explicitly suspend vCPU\n");
316 		return ret;
317 	}
318 
319 	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
320 				    1, &intercept_suspend);
321 	if (ret) {
322 		vp_err(vp, "Failed to get intercept suspend state\n");
323 		return ret;
324 	}
325 
326 	*message_in_flight = is->suspended;
327 
328 	return 0;
329 }
330 
331 /*
332  * This function is used when VPs are scheduled by the hypervisor's
333  * scheduler.
334  *
335  * The caller must pass registers that clear HV_REGISTER_INTERCEPT_SUSPEND
336  * and HV_REGISTER_EXPLICIT_SUSPEND in exactly this order (the hypervisor
337  * clears them sequentially), to avoid invalidly clearing a newly arrived
338  * HV_REGISTER_INTERCEPT_SUSPEND after the VP has already been released
339  * from HV_REGISTER_EXPLICIT_SUSPEND, which could happen with the opposite
340  * order.
341  */
342 static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
343 {
344 	long ret;
345 	struct hv_register_assoc suspend_regs[2] = {
346 			{ .name = HV_REGISTER_INTERCEPT_SUSPEND },
347 			{ .name = HV_REGISTER_EXPLICIT_SUSPEND }
348 	};
349 	size_t count = ARRAY_SIZE(suspend_regs);
350 
351 	/* Resume VP execution */
352 	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
353 				    count, suspend_regs);
354 	if (ret) {
355 		vp_err(vp, "Failed to resume vp execution. %lx\n", ret);
356 		return ret;
357 	}
358 
359 	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
360 				       vp->run.kicked_by_hv == 1);
361 	if (ret) {
362 		bool message_in_flight;
363 
364 		/*
365 		 * The wait was interrupted by a signal: suspend the vCPU
366 		 * explicitly and copy the in-flight message (if any).
367 		 */
368 		ret = mshv_suspend_vp(vp, &message_in_flight);
369 		if (ret)
370 			return ret;
371 
372 		/* Return if no message in flight */
373 		if (!message_in_flight)
374 			return -EINTR;
375 
376 		/* Wait for the message in flight. */
377 		wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1);
378 	}
379 
380 	/*
381 	 * Reset the flag to make the wait_event call above work
382 	 * next time.
383 	 */
384 	vp->run.kicked_by_hv = 0;
385 
386 	return 0;
387 }
388 
389 static int
390 mshv_vp_dispatch(struct mshv_vp *vp, u32 flags,
391 		 struct hv_output_dispatch_vp *res)
392 {
393 	struct hv_input_dispatch_vp *input;
394 	struct hv_output_dispatch_vp *output;
395 	u64 status;
396 
397 	preempt_disable();
398 	input = *this_cpu_ptr(root_scheduler_input);
399 	output = *this_cpu_ptr(root_scheduler_output);
400 
401 	memset(input, 0, sizeof(*input));
402 	memset(output, 0, sizeof(*output));
403 
404 	input->partition_id = vp->vp_partition->pt_id;
405 	input->vp_index = vp->vp_index;
406 	input->time_slice = 0; /* Run forever until something happens */
407 	input->spec_ctrl = 0; /* TODO: set sensible flags */
408 	input->flags = flags;
409 
410 	vp->run.flags.root_sched_dispatched = 1;
411 	status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output);
412 	vp->run.flags.root_sched_dispatched = 0;
413 
414 	*res = *output;
415 	preempt_enable();
416 
417 	if (!hv_result_success(status))
418 		vp_err(vp, "%s: status %s\n", __func__,
419 		       hv_result_to_string(status));
420 
421 	return hv_result_to_errno(status);
422 }
423 
424 static int
425 mshv_vp_clear_explicit_suspend(struct mshv_vp *vp)
426 {
427 	struct hv_register_assoc explicit_suspend = {
428 		.name = HV_REGISTER_EXPLICIT_SUSPEND,
429 		.value.explicit_suspend.suspended = 0,
430 	};
431 	int ret;
432 
433 	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
434 				    1, &explicit_suspend);
435 
436 	if (ret)
437 		vp_err(vp, "Failed to unsuspend\n");
438 
439 	return ret;
440 }
441 
442 #if IS_ENABLED(CONFIG_X86_64)
443 static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
444 {
445 	if (!vp->vp_register_page)
446 		return 0;
447 	return vp->vp_register_page->interrupt_vectors.as_uint64;
448 }
449 #else
450 static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
451 {
452 	return 0;
453 }
454 #endif
455 
456 static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp)
457 {
458 	struct hv_stats_page **stats = vp->vp_stats_pages;
459 	u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->vp_cntrs;
460 	u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->vp_cntrs;
461 
462 	if (self_vp_cntrs[VpRootDispatchThreadBlocked])
463 		return self_vp_cntrs[VpRootDispatchThreadBlocked];
464 	return parent_vp_cntrs[VpRootDispatchThreadBlocked];
465 }
466 
467 static int
468 mshv_vp_wait_for_hv_kick(struct mshv_vp *vp)
469 {
470 	int ret;
471 
472 	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
473 				       (vp->run.kicked_by_hv == 1 &&
474 					!mshv_vp_dispatch_thread_blocked(vp)) ||
475 				       mshv_vp_interrupt_pending(vp));
476 	if (ret)
477 		return -EINTR;
478 
479 	vp->run.flags.root_sched_blocked = 0;
480 	vp->run.kicked_by_hv = 0;
481 
482 	return 0;
483 }
484 
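/*
 * Root scheduler run loop: dispatch the VP via HVCALL_DISPATCH_VP until it
 * suspends on an intercept. A BLOCKED dispatch state parks the thread in
 * mshv_vp_wait_for_hv_kick() until the hypervisor signals the VP again; an
 * explicit-suspend event additionally clears HV_REGISTER_EXPLICIT_SUSPEND
 * before re-dispatching.
 */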
485 /* Must be called with interrupts enabled */
486 static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
487 {
488 	long ret;
489 
490 	if (vp->run.flags.root_sched_blocked) {
491 		/*
492 		 * Dispatch state of this VP is blocked. Need to wait
493 		 * for the hypervisor to clear the blocked state before
494 		 * dispatching it.
495 		 */
496 		ret = mshv_vp_wait_for_hv_kick(vp);
497 		if (ret)
498 			return ret;
499 	}
500 
501 	do {
502 		u32 flags = 0;
503 		struct hv_output_dispatch_vp output;
504 
505 		if (__xfer_to_guest_mode_work_pending()) {
506 			ret = xfer_to_guest_mode_handle_work();
507 			if (ret)
508 				break;
509 		}
510 
511 		if (vp->run.flags.intercept_suspend)
512 			flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND;
513 
514 		if (mshv_vp_interrupt_pending(vp))
515 			flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION;
516 
517 		ret = mshv_vp_dispatch(vp, flags, &output);
518 		if (ret)
519 			break;
520 
521 		vp->run.flags.intercept_suspend = 0;
522 
523 		if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) {
524 			if (output.dispatch_event ==
525 						HV_VP_DISPATCH_EVENT_SUSPEND) {
526 				/*
527 				 * TODO: remove the warning once VP canceling
528 				 *	 is supported
529 				 */
530 				WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count),
531 					  "%s: vp#%d: unexpected explicit suspend\n",
532 					  __func__, vp->vp_index);
533 				/*
534 				 * Need to clear explicit suspend before
535 				 * dispatching.
536 				 * Explicit suspend is either:
537 				 * - set right after the first VP dispatch or
538 				 * - set explicitly via hypercall
539 				 * Since the latter case is not yet supported,
540 				 * simply clear it here.
541 				 */
542 				ret = mshv_vp_clear_explicit_suspend(vp);
543 				if (ret)
544 					break;
545 
546 				ret = mshv_vp_wait_for_hv_kick(vp);
547 				if (ret)
548 					break;
549 			} else {
550 				vp->run.flags.root_sched_blocked = 1;
551 				ret = mshv_vp_wait_for_hv_kick(vp);
552 				if (ret)
553 					break;
554 			}
555 		} else {
556 			/* HV_VP_DISPATCH_STATE_READY */
557 			if (output.dispatch_event ==
558 						HV_VP_DISPATCH_EVENT_INTERCEPT)
559 				vp->run.flags.intercept_suspend = 1;
560 		}
561 	} while (!vp->run.flags.intercept_suspend);
562 
563 	return ret;
564 }
565 
566 static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
567 	      "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");
568 
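
/*
 * Illustrative only: the expected VMM usage of the run path below is a simple
 * loop on the VP fd (the fd name and intercept handling are hypothetical):
 *
 *	__u8 msg_buf[MSHV_RUN_VP_BUF_SZ];
 *
 *	for (;;) {
 *		if (ioctl(vp_fd, MSHV_RUN_VP, msg_buf) < 0)
 *			break;	// e.g. interrupted by a signal
 *		// msg_buf now holds the struct hv_message copied from the
 *		// VP's intercept message page; handle it and run again.
 *	}
 */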
569 static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
570 {
571 	long rc;
572 
573 	if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
574 		rc = mshv_run_vp_with_root_scheduler(vp);
575 	else
576 		rc = mshv_run_vp_with_hyp_scheduler(vp);
577 
578 	if (rc)
579 		return rc;
580 
581 	if (copy_to_user(ret_msg, vp->vp_intercept_msg_page,
582 			 sizeof(struct hv_message)))
583 		rc = -EFAULT;
584 
585 	return rc;
586 }
587 
588 static int
589 mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp,
590 				struct hv_vp_state_data state_data,
591 				unsigned long user_pfn, size_t page_count,
592 				bool is_set)
593 {
594 	int completed, ret = 0;
595 	unsigned long check;
596 	struct page **pages;
597 
598 	if (page_count > INT_MAX)
599 		return -EINVAL;
600 	/*
601 	 * Check the arithmetic for wraparound/overflow.
602 	 * The last page address in the buffer is:
603 	 * (user_pfn + (page_count - 1)) * PAGE_SIZE
604 	 */
605 	if (check_add_overflow(user_pfn, (page_count - 1), &check))
606 		return -EOVERFLOW;
607 	if (check_mul_overflow(check, PAGE_SIZE, &check))
608 		return -EOVERFLOW;
609 
610 	/* Pin user pages so hypervisor can copy directly to them */
611 	pages = kcalloc(page_count, sizeof(struct page *), GFP_KERNEL);
612 	if (!pages)
613 		return -ENOMEM;
614 
615 	for (completed = 0; completed < page_count; completed += ret) {
616 		unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE;
617 		int remaining = page_count - completed;
618 
619 		ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE,
620 					  &pages[completed]);
621 		if (ret < 0) {
622 			vp_err(vp, "%s: Failed to pin user pages error %i\n",
623 			       __func__, ret);
624 			goto unpin_pages;
625 		}
626 	}
627 
628 	if (is_set)
629 		ret = hv_call_set_vp_state(vp->vp_index,
630 					   vp->vp_partition->pt_id,
631 					   state_data, page_count, pages,
632 					   0, NULL);
633 	else
634 		ret = hv_call_get_vp_state(vp->vp_index,
635 					   vp->vp_partition->pt_id,
636 					   state_data, page_count, pages,
637 					   NULL);
638 
639 unpin_pages:
640 	unpin_user_pages(pages, completed);
641 	kfree(pages);
642 	return ret;
643 }
644 
645 static long
646 mshv_vp_ioctl_get_set_state(struct mshv_vp *vp,
647 			    struct mshv_get_set_vp_state __user *user_args,
648 			    bool is_set)
649 {
650 	struct mshv_get_set_vp_state args;
651 	long ret = 0;
652 	union hv_output_get_vp_state vp_state;
653 	u32 data_sz;
654 	struct hv_vp_state_data state_data = {};
655 
656 	if (copy_from_user(&args, user_args, sizeof(args)))
657 		return -EFAULT;
658 
659 	if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) ||
660 	    !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) ||
661 	    !PAGE_ALIGNED(args.buf_ptr))
662 		return -EINVAL;
663 
664 	if (!access_ok((void __user *)args.buf_ptr, args.buf_sz))
665 		return -EFAULT;
666 
667 	switch (args.type) {
668 	case MSHV_VP_STATE_LAPIC:
669 		state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE;
670 		data_sz = HV_HYP_PAGE_SIZE;
671 		break;
672 	case MSHV_VP_STATE_XSAVE:
673 	{
674 		u64 data_sz_64;
675 
676 		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
677 						     HV_PARTITION_PROPERTY_XSAVE_STATES,
678 						     &state_data.xsave.states.as_uint64);
679 		if (ret)
680 			return ret;
681 
682 		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
683 						     HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE,
684 						     &data_sz_64);
685 		if (ret)
686 			return ret;
687 
688 		data_sz = (u32)data_sz_64;
689 		state_data.xsave.flags = 0;
690 		/* Always request legacy states */
691 		state_data.xsave.states.legacy_x87 = 1;
692 		state_data.xsave.states.legacy_sse = 1;
693 		state_data.type = HV_GET_SET_VP_STATE_XSAVE;
694 		break;
695 	}
696 	case MSHV_VP_STATE_SIMP:
697 		state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE;
698 		data_sz = HV_HYP_PAGE_SIZE;
699 		break;
700 	case MSHV_VP_STATE_SIEFP:
701 		state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE;
702 		data_sz = HV_HYP_PAGE_SIZE;
703 		break;
704 	case MSHV_VP_STATE_SYNTHETIC_TIMERS:
705 		state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS;
706 		data_sz = sizeof(vp_state.synthetic_timers_state);
707 		break;
708 	default:
709 		return -EINVAL;
710 	}
711 
712 	if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz)))
713 		return -EFAULT;
714 
715 	if (data_sz > args.buf_sz)
716 		return -EINVAL;
717 
718 	/* If the data is transmitted via pfns, delegate to helper */
719 	if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) {
720 		unsigned long user_pfn = PFN_DOWN(args.buf_ptr);
721 		size_t page_count = PFN_DOWN(args.buf_sz);
722 
723 		return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn,
724 						       page_count, is_set);
725 	}
726 
727 	/* Paranoia check - this shouldn't happen! */
728 	if (data_sz > sizeof(vp_state)) {
729 		vp_err(vp, "Invalid vp state data size!\n");
730 		return -EINVAL;
731 	}
732 
733 	if (is_set) {
734 		if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz))
735 			return -EFAULT;
736 
737 		return hv_call_set_vp_state(vp->vp_index,
738 					    vp->vp_partition->pt_id,
739 					    state_data, 0, NULL,
740 					    sizeof(vp_state), (u8 *)&vp_state);
741 	}
742 
743 	ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id,
744 				   state_data, 0, NULL, &vp_state);
745 	if (ret)
746 		return ret;
747 
748 	if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz))
749 		return -EFAULT;
750 
751 	return 0;
752 }
753 
754 static long
755 mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
756 {
757 	struct mshv_vp *vp = filp->private_data;
758 	long r = -ENOTTY;
759 
760 	if (mutex_lock_killable(&vp->vp_mutex))
761 		return -EINTR;
762 
763 	switch (ioctl) {
764 	case MSHV_RUN_VP:
765 		r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg);
766 		break;
767 	case MSHV_GET_VP_STATE:
768 		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false);
769 		break;
770 	case MSHV_SET_VP_STATE:
771 		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true);
772 		break;
773 	case MSHV_ROOT_HVCALL:
774 		r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false,
775 					       (void __user *)arg);
776 		break;
777 	default:
778 		vp_warn(vp, "Invalid ioctl: %#x\n", ioctl);
779 		break;
780 	}
781 	mutex_unlock(&vp->vp_mutex);
782 
783 	return r;
784 }
785 
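/*
 * VP state pages (register page, intercept message page, GHCB page) are
 * exposed to the VMM by mmap()ing the VP fd at the fixed page offsets
 * MSHV_VP_MMAP_OFFSET_*. mshv_vp_mmap() below validates the offset and this
 * fault handler backs the mapping with the corresponding kernel page.
 */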
786 static vm_fault_t mshv_vp_fault(struct vm_fault *vmf)
787 {
788 	struct mshv_vp *vp = vmf->vma->vm_file->private_data;
789 
790 	switch (vmf->vma->vm_pgoff) {
791 	case MSHV_VP_MMAP_OFFSET_REGISTERS:
792 		vmf->page = virt_to_page(vp->vp_register_page);
793 		break;
794 	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
795 		vmf->page = virt_to_page(vp->vp_intercept_msg_page);
796 		break;
797 	case MSHV_VP_MMAP_OFFSET_GHCB:
798 		vmf->page = virt_to_page(vp->vp_ghcb_page);
799 		break;
800 	default:
801 		return VM_FAULT_SIGBUS;
802 	}
803 
804 	get_page(vmf->page);
805 
806 	return 0;
807 }
808 
809 static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma)
810 {
811 	struct mshv_vp *vp = file->private_data;
812 
813 	switch (vma->vm_pgoff) {
814 	case MSHV_VP_MMAP_OFFSET_REGISTERS:
815 		if (!vp->vp_register_page)
816 			return -ENODEV;
817 		break;
818 	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
819 		if (!vp->vp_intercept_msg_page)
820 			return -ENODEV;
821 		break;
822 	case MSHV_VP_MMAP_OFFSET_GHCB:
823 		if (!vp->vp_ghcb_page)
824 			return -ENODEV;
825 		break;
826 	default:
827 		return -EINVAL;
828 	}
829 
830 	vma->vm_ops = &mshv_vp_vm_ops;
831 	return 0;
832 }
833 
834 static int
835 mshv_vp_release(struct inode *inode, struct file *filp)
836 {
837 	struct mshv_vp *vp = filp->private_data;
838 
839 	/* Rest of VP cleanup happens in destroy_partition() */
840 	mshv_partition_put(vp->vp_partition);
841 	return 0;
842 }
843 
844 static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index)
845 {
846 	union hv_stats_object_identity identity = {
847 		.vp.partition_id = partition_id,
848 		.vp.vp_index = vp_index,
849 	};
850 
851 	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
852 	hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
853 
854 	identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
855 	hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
856 }
857 
858 static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
859 			     void *stats_pages[])
860 {
861 	union hv_stats_object_identity identity = {
862 		.vp.partition_id = partition_id,
863 		.vp.vp_index = vp_index,
864 	};
865 	int err;
866 
867 	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
868 	err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity,
869 				    &stats_pages[HV_STATS_AREA_SELF]);
870 	if (err)
871 		return err;
872 
873 	identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
874 	err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity,
875 				    &stats_pages[HV_STATS_AREA_PARENT]);
876 	if (err)
877 		goto unmap_self;
878 
879 	return 0;
880 
881 unmap_self:
882 	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
883 	hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
884 	return err;
885 }
886 
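/*
 * Illustrative only: creating a VP from user space is a single ioctl on the
 * partition fd (the fd name is hypothetical); the return value is a new VP fd
 * backed by mshv_vp_fops:
 *
 *	struct mshv_create_vp vp_args = { .vp_index = 0 };
 *	int vp_fd = ioctl(partition_fd, MSHV_CREATE_VP, &vp_args);
 */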
887 static long
888 mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
889 			       void __user *arg)
890 {
891 	struct mshv_create_vp args;
892 	struct mshv_vp *vp;
893 	struct page *intercept_message_page, *register_page, *ghcb_page;
894 	void *stats_pages[2];
895 	long ret;
896 
897 	if (copy_from_user(&args, arg, sizeof(args)))
898 		return -EFAULT;
899 
900 	if (args.vp_index >= MSHV_MAX_VPS)
901 		return -EINVAL;
902 
903 	if (partition->pt_vp_array[args.vp_index])
904 		return -EEXIST;
905 
906 	ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index,
907 				0 /* Only valid for root partition VPs */);
908 	if (ret)
909 		return ret;
910 
911 	ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
912 					HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
913 					input_vtl_zero,
914 					&intercept_message_page);
915 	if (ret)
916 		goto destroy_vp;
917 
918 	if (!mshv_partition_encrypted(partition)) {
919 		ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
920 						HV_VP_STATE_PAGE_REGISTERS,
921 						input_vtl_zero,
922 						&register_page);
923 		if (ret)
924 			goto unmap_intercept_message_page;
925 	}
926 
927 	if (mshv_partition_encrypted(partition) &&
928 	    is_ghcb_mapping_available()) {
929 		ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
930 						HV_VP_STATE_PAGE_GHCB,
931 						input_vtl_normal,
932 						&ghcb_page);
933 		if (ret)
934 			goto unmap_register_page;
935 	}
936 
937 	if (hv_parent_partition()) {
938 		ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
939 					stats_pages);
940 		if (ret)
941 			goto unmap_ghcb_page;
942 	}
943 
944 	vp = kzalloc(sizeof(*vp), GFP_KERNEL);
945 	if (!vp) {
946 		ret = -ENOMEM;
		goto unmap_stats_pages;
	}
947 
948 	vp->vp_partition = mshv_partition_get(partition);
949 	if (!vp->vp_partition) {
950 		ret = -EBADF;
951 		goto free_vp;
952 	}
953 
954 	mutex_init(&vp->vp_mutex);
955 	init_waitqueue_head(&vp->run.vp_suspend_queue);
956 	atomic64_set(&vp->run.vp_signaled_count, 0);
957 
958 	vp->vp_index = args.vp_index;
959 	vp->vp_intercept_msg_page = page_to_virt(intercept_message_page);
960 	if (!mshv_partition_encrypted(partition))
961 		vp->vp_register_page = page_to_virt(register_page);
962 
963 	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
964 		vp->vp_ghcb_page = page_to_virt(ghcb_page);
965 
966 	if (hv_parent_partition())
967 		memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));
968 
969 	/*
970 	 * Keep anon_inode_getfd last: it installs fd in the file struct and
971 	 * thus makes the state accessible in user space.
972 	 */
973 	ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp,
974 			       O_RDWR | O_CLOEXEC);
975 	if (ret < 0)
976 		goto put_partition;
977 
978 	/* already exclusive with the partition mutex for all ioctls */
979 	partition->pt_vp_count++;
980 	partition->pt_vp_array[args.vp_index] = vp;
981 
982 	return ret;
983 
984 put_partition:
985 	mshv_partition_put(partition);
986 free_vp:
987 	kfree(vp);
988 unmap_stats_pages:
989 	if (hv_parent_partition())
990 		mshv_vp_stats_unmap(partition->pt_id, args.vp_index);
991 unmap_ghcb_page:
992 	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) {
993 		hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index,
994 					    HV_VP_STATE_PAGE_GHCB,
995 					    input_vtl_normal);
996 	}
997 unmap_register_page:
998 	if (!mshv_partition_encrypted(partition)) {
999 		hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index,
1000 					    HV_VP_STATE_PAGE_REGISTERS,
1001 					    input_vtl_zero);
1002 	}
1003 unmap_intercept_message_page:
1004 	hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index,
1005 				    HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
1006 				    input_vtl_zero);
1007 destroy_vp:
1008 	hv_call_delete_vp(partition->pt_id, args.vp_index);
1009 	return ret;
1010 }
1011 
1012 static int mshv_init_async_handler(struct mshv_partition *partition)
1013 {
1014 	if (completion_done(&partition->async_hypercall)) {
1015 		pt_err(partition,
1016 		       "Cannot issue async hypercall while another one in progress!\n");
1017 		return -EPERM;
1018 	}
1019 
1020 	reinit_completion(&partition->async_hypercall);
1021 	return 0;
1022 }
1023 
1024 static void mshv_async_hvcall_handler(void *data, u64 *status)
1025 {
1026 	struct mshv_partition *partition = data;
1027 
1028 	wait_for_completion(&partition->async_hypercall);
1029 	pt_dbg(partition, "Async hypercall completed!\n");
1030 
1031 	*status = partition->async_hypercall_status;
1032 }
1033 
1034 static int
1035 mshv_partition_region_share(struct mshv_mem_region *region)
1036 {
1037 	u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED;
1038 
1039 	if (region->flags.large_pages)
1040 		flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
1041 
1042 	return hv_call_modify_spa_host_access(region->partition->pt_id,
1043 			region->pages, region->nr_pages,
1044 			HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE,
1045 			flags, true);
1046 }
1047 
1048 static int
1049 mshv_partition_region_unshare(struct mshv_mem_region *region)
1050 {
1051 	u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE;
1052 
1053 	if (region->flags.large_pages)
1054 		flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
1055 
1056 	return hv_call_modify_spa_host_access(region->partition->pt_id,
1057 			region->pages, region->nr_pages,
1058 			0,
1059 			flags, false);
1060 }
1061 
1062 static int
1063 mshv_region_remap_pages(struct mshv_mem_region *region, u32 map_flags,
1064 			u64 page_offset, u64 page_count)
1065 {
1066 	if (page_offset + page_count > region->nr_pages)
1067 		return -EINVAL;
1068 
1069 	if (region->flags.large_pages)
1070 		map_flags |= HV_MAP_GPA_LARGE_PAGE;
1071 
1072 	/* ask the hypervisor to map guest ram */
1073 	return hv_call_map_gpa_pages(region->partition->pt_id,
1074 				     region->start_gfn + page_offset,
1075 				     page_count, map_flags,
1076 				     region->pages + page_offset);
1077 }
1078 
1079 static int
1080 mshv_region_map(struct mshv_mem_region *region)
1081 {
1082 	u32 map_flags = region->hv_map_flags;
1083 
1084 	return mshv_region_remap_pages(region, map_flags,
1085 				       0, region->nr_pages);
1086 }
1087 
1088 static void
1089 mshv_region_evict_pages(struct mshv_mem_region *region,
1090 			u64 page_offset, u64 page_count)
1091 {
1092 	if (region->flags.range_pinned)
1093 		unpin_user_pages(region->pages + page_offset, page_count);
1094 
1095 	memset(region->pages + page_offset, 0,
1096 	       page_count * sizeof(struct page *));
1097 }
1098 
1099 static void
1100 mshv_region_evict(struct mshv_mem_region *region)
1101 {
1102 	mshv_region_evict_pages(region, 0, region->nr_pages);
1103 }
1104 
1105 static int
1106 mshv_region_populate_pages(struct mshv_mem_region *region,
1107 			   u64 page_offset, u64 page_count)
1108 {
1109 	u64 done_count, nr_pages;
1110 	struct page **pages;
1111 	__u64 userspace_addr;
1112 	int ret;
1113 
1114 	if (page_offset + page_count > region->nr_pages)
1115 		return -EINVAL;
1116 
1117 	for (done_count = 0; done_count < page_count; done_count += ret) {
1118 		pages = region->pages + page_offset + done_count;
1119 		userspace_addr = region->start_uaddr +
1120 				(page_offset + done_count) *
1121 				HV_HYP_PAGE_SIZE;
1122 		nr_pages = min(page_count - done_count,
1123 			       MSHV_PIN_PAGES_BATCH_SIZE);
1124 
1125 		/*
1126 		 * Pinning assuming 4k pages works for large pages too.
1127 		 * All page structs within the large page are returned.
1128 		 *
1129 		 * Pin requests are batched because pin_user_pages_fast
1130 		 * with the FOLL_LONGTERM flag does a large temporary
1131 		 * allocation of contiguous memory.
1132 		 */
1133 		if (region->flags.range_pinned)
1134 			ret = pin_user_pages_fast(userspace_addr,
1135 						  nr_pages,
1136 						  FOLL_WRITE | FOLL_LONGTERM,
1137 						  pages);
1138 		else
1139 			ret = -EOPNOTSUPP;
1140 
1141 		if (ret < 0)
1142 			goto release_pages;
1143 	}
1144 
1145 	if (PageHuge(region->pages[page_offset]))
1146 		region->flags.large_pages = true;
1147 
1148 	return 0;
1149 
1150 release_pages:
1151 	mshv_region_evict_pages(region, page_offset, done_count);
1152 	return ret;
1153 }
1154 
1155 static int
1156 mshv_region_populate(struct mshv_mem_region *region)
1157 {
1158 	return mshv_region_populate_pages(region, 0, region->nr_pages);
1159 }
1160 
1161 static struct mshv_mem_region *
1162 mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
1163 {
1164 	struct mshv_mem_region *region;
1165 
1166 	hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
1167 		if (gfn >= region->start_gfn &&
1168 		    gfn < region->start_gfn + region->nr_pages)
1169 			return region;
1170 	}
1171 
1172 	return NULL;
1173 }
1174 
1175 static struct mshv_mem_region *
1176 mshv_partition_region_by_uaddr(struct mshv_partition *partition, u64 uaddr)
1177 {
1178 	struct mshv_mem_region *region;
1179 
1180 	hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
1181 		if (uaddr >= region->start_uaddr &&
1182 		    uaddr < region->start_uaddr +
1183 			    (region->nr_pages << HV_HYP_PAGE_SHIFT))
1184 			return region;
1185 	}
1186 
1187 	return NULL;
1188 }
1189 
1190 /*
1191  * NB: caller checks and makes sure mem->size is page aligned
1192  * Returns: 0 with regionpp updated on success, or -errno
1193  */
1194 static int mshv_partition_create_region(struct mshv_partition *partition,
1195 					struct mshv_user_mem_region *mem,
1196 					struct mshv_mem_region **regionpp,
1197 					bool is_mmio)
1198 {
1199 	struct mshv_mem_region *region;
1200 	u64 nr_pages = HVPFN_DOWN(mem->size);
1201 
1202 	/* Reject overlapping regions */
1203 	if (mshv_partition_region_by_gfn(partition, mem->guest_pfn) ||
1204 	    mshv_partition_region_by_gfn(partition, mem->guest_pfn + nr_pages - 1) ||
1205 	    mshv_partition_region_by_uaddr(partition, mem->userspace_addr) ||
1206 	    mshv_partition_region_by_uaddr(partition, mem->userspace_addr + mem->size - 1))
1207 		return -EEXIST;
1208 
1209 	region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages);
1210 	if (!region)
1211 		return -ENOMEM;
1212 
1213 	region->nr_pages = nr_pages;
1214 	region->start_gfn = mem->guest_pfn;
1215 	region->start_uaddr = mem->userspace_addr;
1216 	region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE;
1217 	if (mem->flags & BIT(MSHV_SET_MEM_BIT_WRITABLE))
1218 		region->hv_map_flags |= HV_MAP_GPA_WRITABLE;
1219 	if (mem->flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE))
1220 		region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE;
1221 
1222 	/* Note: large_pages flag populated when we pin the pages */
1223 	if (!is_mmio)
1224 		region->flags.range_pinned = true;
1225 
1226 	region->partition = partition;
1227 
1228 	*regionpp = region;
1229 
1230 	return 0;
1231 }
1232 
1233 /*
1234  * Map guest RAM. If SNP, make sure to release host access to it first.
1235  * Side effects: in case of failure, pages are unpinned when feasible.
1236  */
1237 static int
1238 mshv_partition_mem_region_map(struct mshv_mem_region *region)
1239 {
1240 	struct mshv_partition *partition = region->partition;
1241 	int ret;
1242 
1243 	ret = mshv_region_populate(region);
1244 	if (ret) {
1245 		pt_err(partition, "Failed to populate memory region: %d\n",
1246 		       ret);
1247 		goto err_out;
1248 	}
1249 
1250 	/*
1251 	 * For an SNP partition, it is a requirement that host access to every
1252 	 * memory region we are going to map for this partition is released
1253 	 * before the mapping is made. This is ensured by doing an additional
1254 	 * hypercall, which updates the SLAT to release host access to the
1255 	 * guest memory regions.
1256 	 */
1257 	if (mshv_partition_encrypted(partition)) {
1258 		ret = mshv_partition_region_unshare(region);
1259 		if (ret) {
1260 			pt_err(partition,
1261 			       "Failed to unshare memory region (guest_pfn: %llu): %d\n",
1262 			       region->start_gfn, ret);
1263 			goto evict_region;
1264 		}
1265 	}
1266 
1267 	ret = mshv_region_map(region);
1268 	if (ret && mshv_partition_encrypted(partition)) {
1269 		int shrc;
1270 
1271 		shrc = mshv_partition_region_share(region);
1272 		if (!shrc)
1273 			goto evict_region;
1274 
1275 		pt_err(partition,
1276 		       "Failed to share memory region (guest_pfn: %llu): %d\n",
1277 		       region->start_gfn, shrc);
1278 		/*
1279 		 * Don't unpin if marking the region shared failed: the pages
1280 		 * are no longer mapped in the host (i.e. the root partition).
1281 		 */
1282 		goto err_out;
1283 	}
1284 
1285 	return 0;
1286 
1287 evict_region:
1288 	mshv_region_evict(region);
1289 err_out:
1290 	return ret;
1291 }
1292 
1293 /*
1294  * This maps two things: guest RAM, and MMIO space for PCI passthrough.
1295  *
1296  * mmio:
1297  *  - vfio overloads vm_pgoff to store the mmio start pfn/spa.
1298  *  - Two things need to happen to map an mmio range:
1299  *	1. mapped at the uaddr so the VMM can access it.
1300  *	2. mapped in the hwpt (gfn <-> mmio phys addr) so the guest can access it.
1301  *
1302  *   This function takes care of the second. The first one is managed by vfio,
1303  *   and hence is taken care of via vfio_pci_mmap_fault().
1304  */
1305 static long
1306 mshv_map_user_memory(struct mshv_partition *partition,
1307 		     struct mshv_user_mem_region mem)
1308 {
1309 	struct mshv_mem_region *region;
1310 	struct vm_area_struct *vma;
1311 	bool is_mmio;
1312 	ulong mmio_pfn;
1313 	long ret;
1314 
1315 	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
1316 	    !access_ok((const void *)mem.userspace_addr, mem.size))
1317 		return -EINVAL;
1318 
1319 	mmap_read_lock(current->mm);
1320 	vma = vma_lookup(current->mm, mem.userspace_addr);
1321 	is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
1322 	mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
1323 	mmap_read_unlock(current->mm);
1324 
1325 	if (!vma)
1326 		return -EINVAL;
1327 
1328 	ret = mshv_partition_create_region(partition, &mem, &region,
1329 					   is_mmio);
1330 	if (ret)
1331 		return ret;
1332 
1333 	if (is_mmio)
1334 		ret = hv_call_map_mmio_pages(partition->pt_id, mem.guest_pfn,
1335 					     mmio_pfn, HVPFN_DOWN(mem.size));
1336 	else
1337 		ret = mshv_partition_mem_region_map(region);
1338 
1339 	if (ret)
1340 		goto errout;
1341 
1342 	/* Install the new region */
1343 	hlist_add_head(&region->hnode, &partition->pt_mem_regions);
1344 
1345 	return 0;
1346 
1347 errout:
1348 	vfree(region);
1349 	return ret;
1350 }
1351 
1352 /* Called for unmapping both the guest ram and the mmio space */
1353 static long
1354 mshv_unmap_user_memory(struct mshv_partition *partition,
1355 		       struct mshv_user_mem_region mem)
1356 {
1357 	struct mshv_mem_region *region;
1358 	u32 unmap_flags = 0;
1359 
1360 	if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
1361 		return -EINVAL;
1362 
1363 	region = mshv_partition_region_by_gfn(partition, mem.guest_pfn);
1364 	if (!region)
1365 		return -EINVAL;
1366 
1367 	/* Paranoia check */
1368 	if (region->start_uaddr != mem.userspace_addr ||
1369 	    region->start_gfn != mem.guest_pfn ||
1370 	    region->nr_pages != HVPFN_DOWN(mem.size))
1371 		return -EINVAL;
1372 
1373 	hlist_del(&region->hnode);
1374 
1375 	if (region->flags.large_pages)
1376 		unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
1377 
1378 	/* ignore unmap failures and continue as process may be exiting */
1379 	hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn,
1380 				region->nr_pages, unmap_flags);
1381 
1382 	mshv_region_evict(region);
1383 
1384 	vfree(region);
1385 	return 0;
1386 }
1387 
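/*
 * Illustrative only: a minimal sketch of mapping guest RAM with
 * MSHV_SET_GUEST_MEMORY. The buffer, sizes and fd name are hypothetical; the
 * flag bits are the MSHV_SET_MEM_BIT_* values checked below and in
 * mshv_partition_create_region():
 *
 *	struct mshv_user_mem_region mem = {
 *		.guest_pfn	= gpa >> HV_HYP_PAGE_SHIFT,
 *		.size		= region_size,		// page aligned
 *		.userspace_addr	= (__u64)host_va,	// page aligned
 *		.flags		= (1ULL << MSHV_SET_MEM_BIT_WRITABLE) |
 *				  (1ULL << MSHV_SET_MEM_BIT_EXECUTABLE),
 *	};
 *	ret = ioctl(partition_fd, MSHV_SET_GUEST_MEMORY, &mem);
 *
 * Setting the MSHV_SET_MEM_BIT_UNMAP bit instead routes to
 * mshv_unmap_user_memory() and tears the region down.
 */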
1388 static long
1389 mshv_partition_ioctl_set_memory(struct mshv_partition *partition,
1390 				struct mshv_user_mem_region __user *user_mem)
1391 {
1392 	struct mshv_user_mem_region mem;
1393 
1394 	if (copy_from_user(&mem, user_mem, sizeof(mem)))
1395 		return -EFAULT;
1396 
1397 	if (!mem.size ||
1398 	    !PAGE_ALIGNED(mem.size) ||
1399 	    !PAGE_ALIGNED(mem.userspace_addr) ||
1400 	    (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) ||
1401 	    mshv_field_nonzero(mem, rsvd))
1402 		return -EINVAL;
1403 
1404 	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))
1405 		return mshv_unmap_user_memory(partition, mem);
1406 
1407 	return mshv_map_user_memory(partition, mem);
1408 }
1409 
1410 static long
1411 mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition,
1412 			       void __user *user_args)
1413 {
1414 	struct mshv_user_ioeventfd args;
1415 
1416 	if (copy_from_user(&args, user_args, sizeof(args)))
1417 		return -EFAULT;
1418 
1419 	return mshv_set_unset_ioeventfd(partition, &args);
1420 }
1421 
1422 static long
1423 mshv_partition_ioctl_irqfd(struct mshv_partition *partition,
1424 			   void __user *user_args)
1425 {
1426 	struct mshv_user_irqfd args;
1427 
1428 	if (copy_from_user(&args, user_args, sizeof(args)))
1429 		return -EFAULT;
1430 
1431 	return mshv_set_unset_irqfd(partition, &args);
1432 }
1433 
1434 static long
1435 mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition,
1436 					    void __user *user_args)
1437 {
1438 	struct mshv_gpap_access_bitmap args;
1439 	union hv_gpa_page_access_state *states;
1440 	long ret, i;
1441 	union hv_gpa_page_access_state_flags hv_flags = {};
1442 	u8 hv_type_mask;
1443 	ulong bitmap_buf_sz, states_buf_sz;
1444 	int written = 0;
1445 
1446 	if (copy_from_user(&args, user_args, sizeof(args)))
1447 		return -EFAULT;
1448 
1449 	if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT ||
1450 	    args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT ||
1451 	    mshv_field_nonzero(args, rsvd) || !args.page_count ||
1452 	    !args.bitmap_ptr)
1453 		return -EINVAL;
1454 
1455 	if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz))
1456 		return -E2BIG;
1457 
1458 	/* Num bytes needed to store bitmap; one bit per page rounded up */
1459 	bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8);
1460 
1461 	/* Sanity check */
1462 	if (bitmap_buf_sz > states_buf_sz)
1463 		return -EBADFD;
1464 
1465 	switch (args.access_type) {
1466 	case MSHV_GPAP_ACCESS_TYPE_ACCESSED:
1467 		hv_type_mask = 1;
1468 		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
1469 			hv_flags.clear_accessed = 1;
1470 			/* not accessed implies not dirty */
1471 			hv_flags.clear_dirty = 1;
1472 		} else { /* MSHV_GPAP_ACCESS_OP_SET */
1473 			hv_flags.set_accessed = 1;
1474 		}
1475 		break;
1476 	case MSHV_GPAP_ACCESS_TYPE_DIRTY:
1477 		hv_type_mask = 2;
1478 		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
1479 			hv_flags.clear_dirty = 1;
1480 		} else { /* MSHV_GPAP_ACCESS_OP_SET */
1481 			hv_flags.set_dirty = 1;
1482 			/* dirty implies accessed */
1483 			hv_flags.set_accessed = 1;
1484 		}
1485 		break;
1486 	}
1487 
1488 	states = vzalloc(states_buf_sz);
1489 	if (!states)
1490 		return -ENOMEM;
1491 
1492 	ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count,
1493 					    args.gpap_base, hv_flags, &written,
1494 					    states);
1495 	if (ret)
1496 		goto free_return;
1497 
1498 	/*
1499 	 * Overwrite states buffer with bitmap - the bits in hv_type_mask
1500 	 * correspond to bitfields in hv_gpa_page_access_state
1501 	 */
1502 	for (i = 0; i < written; ++i)
1503 		__assign_bit(i, (ulong *)states,
1504 			     states[i].as_uint8 & hv_type_mask);
1505 
1506 	/* zero the unused bits in the last byte(s) of the returned bitmap */
1507 	for (i = written; i < bitmap_buf_sz * 8; ++i)
1508 		__clear_bit(i, (ulong *)states);
1509 
1510 	if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz))
1511 		ret = -EFAULT;
1512 
1513 free_return:
1514 	vfree(states);
1515 	return ret;
1516 }
1517 
1518 static long
1519 mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition,
1520 				     void __user *user_args)
1521 {
1522 	struct mshv_user_irq_entry *entries = NULL;
1523 	struct mshv_user_irq_table args;
1524 	long ret;
1525 
1526 	if (copy_from_user(&args, user_args, sizeof(args)))
1527 		return -EFAULT;
1528 
1529 	if (args.nr > MSHV_MAX_GUEST_IRQS ||
1530 	    mshv_field_nonzero(args, rsvd))
1531 		return -EINVAL;
1532 
1533 	if (args.nr) {
1534 		struct mshv_user_irq_table __user *urouting = user_args;
1535 
1536 		entries = vmemdup_user(urouting->entries,
1537 				       array_size(sizeof(*entries),
1538 						  args.nr));
1539 		if (IS_ERR(entries))
1540 			return PTR_ERR(entries);
1541 	}
1542 	ret = mshv_update_routing_table(partition, entries, args.nr);
1543 	kvfree(entries);
1544 
1545 	return ret;
1546 }
1547 
1548 static long
1549 mshv_partition_ioctl_initialize(struct mshv_partition *partition)
1550 {
1551 	long ret;
1552 
1553 	if (partition->pt_initialized)
1554 		return 0;
1555 
1556 	ret = hv_call_initialize_partition(partition->pt_id);
1557 	if (ret)
1558 		goto withdraw_mem;
1559 
1560 	partition->pt_initialized = true;
1561 
1562 	return 0;
1563 
1564 withdraw_mem:
1565 	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
1566 
1567 	return ret;
1568 }
1569 
1570 static long
1571 mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
1572 {
1573 	struct mshv_partition *partition = filp->private_data;
1574 	long ret;
1575 	void __user *uarg = (void __user *)arg;
1576 
1577 	if (mutex_lock_killable(&partition->pt_mutex))
1578 		return -EINTR;
1579 
1580 	switch (ioctl) {
1581 	case MSHV_INITIALIZE_PARTITION:
1582 		ret = mshv_partition_ioctl_initialize(partition);
1583 		break;
1584 	case MSHV_SET_GUEST_MEMORY:
1585 		ret = mshv_partition_ioctl_set_memory(partition, uarg);
1586 		break;
1587 	case MSHV_CREATE_VP:
1588 		ret = mshv_partition_ioctl_create_vp(partition, uarg);
1589 		break;
1590 	case MSHV_IRQFD:
1591 		ret = mshv_partition_ioctl_irqfd(partition, uarg);
1592 		break;
1593 	case MSHV_IOEVENTFD:
1594 		ret = mshv_partition_ioctl_ioeventfd(partition, uarg);
1595 		break;
1596 	case MSHV_SET_MSI_ROUTING:
1597 		ret = mshv_partition_ioctl_set_msi_routing(partition, uarg);
1598 		break;
1599 	case MSHV_GET_GPAP_ACCESS_BITMAP:
1600 		ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition,
1601 								  uarg);
1602 		break;
1603 	case MSHV_ROOT_HVCALL:
1604 		ret = mshv_ioctl_passthru_hvcall(partition, true, uarg);
1605 		break;
1606 	default:
1607 		ret = -ENOTTY;
1608 	}
1609 
1610 	mutex_unlock(&partition->pt_mutex);
1611 	return ret;
1612 }
1613 
1614 static int
1615 disable_vp_dispatch(struct mshv_vp *vp)
1616 {
1617 	int ret;
1618 	struct hv_register_assoc dispatch_suspend = {
1619 		.name = HV_REGISTER_DISPATCH_SUSPEND,
1620 		.value.dispatch_suspend.suspended = 1,
1621 	};
1622 
1623 	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
1624 				    1, &dispatch_suspend);
1625 	if (ret)
1626 		vp_err(vp, "failed to suspend\n");
1627 
1628 	return ret;
1629 }
1630 
1631 static int
1632 get_vp_signaled_count(struct mshv_vp *vp, u64 *count)
1633 {
1634 	int ret;
1635 	struct hv_register_assoc root_signal_count = {
1636 		.name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT,
1637 	};
1638 
1639 	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
1640 				    1, &root_signal_count);
1641 
1642 	if (ret) {
1643 		vp_err(vp, "Failed to get root signal count");
1644 		*count = 0;
1645 		return ret;
1646 	}
1647 
1648 	*count = root_signal_count.value.reg64;
1649 
1650 	return ret;
1651 }
1652 
1653 static void
1654 drain_vp_signals(struct mshv_vp *vp)
1655 {
1656 	u64 hv_signal_count;
1657 	u64 vp_signal_count;
1658 
1659 	get_vp_signaled_count(vp, &hv_signal_count);
1660 
1661 	vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
1662 
1663 	/*
1664 	 * There should be at most 1 outstanding notification, but be extra
1665 	 * careful anyway.
1666 	 */
1667 	while (hv_signal_count != vp_signal_count) {
1668 		WARN_ON(hv_signal_count - vp_signal_count != 1);
1669 
1670 		if (wait_event_interruptible(vp->run.vp_suspend_queue,
1671 					     vp->run.kicked_by_hv == 1))
1672 			break;
1673 		vp->run.kicked_by_hv = 0;
1674 		vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
1675 	}
1676 }
1677 
1678 static void drain_all_vps(const struct mshv_partition *partition)
1679 {
1680 	int i;
1681 	struct mshv_vp *vp;
1682 
1683 	/*
1684 	 * VPs are reachable from ISR. It is safe to not take the partition
1685 	 * lock because nobody else can enter this function and drop the
1686 	 * partition from the list.
1687 	 */
1688 	for (i = 0; i < MSHV_MAX_VPS; i++) {
1689 		vp = partition->pt_vp_array[i];
1690 		if (!vp)
1691 			continue;
1692 		/*
1693 		 * Disable dispatching of the VP in the hypervisor. After this
1694 		 * the hypervisor guarantees it won't generate any signals for
1695 		 * the VP and the hypervisor's VP signal count won't change.
1696 		 */
1697 		disable_vp_dispatch(vp);
1698 		drain_vp_signals(vp);
1699 	}
1700 }
1701 
1702 static void
1703 remove_partition(struct mshv_partition *partition)
1704 {
1705 	spin_lock(&mshv_root.pt_ht_lock);
1706 	hlist_del_rcu(&partition->pt_hnode);
1707 	spin_unlock(&mshv_root.pt_ht_lock);
1708 
1709 	synchronize_rcu();
1710 }
1711 
1712 /*
1713  * Tear down a partition and remove it from the list.
1714  * Partition's refcount must be 0
1715  */
1716 static void destroy_partition(struct mshv_partition *partition)
1717 {
1718 	struct mshv_vp *vp;
1719 	struct mshv_mem_region *region;
1720 	int i, ret;
1721 	struct hlist_node *n;
1722 
1723 	if (refcount_read(&partition->pt_ref_count)) {
1724 		pt_err(partition,
1725 		       "Attempt to destroy partition but refcount > 0\n");
1726 		return;
1727 	}
1728 
1729 	if (partition->pt_initialized) {
1730 		/*
1731 		 * We only need to drain signals for root scheduler. This should be
1732 		 * done before removing the partition from the partition list.
1733 		 */
1734 		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
1735 			drain_all_vps(partition);
1736 
1737 		/* Remove vps */
1738 		for (i = 0; i < MSHV_MAX_VPS; ++i) {
1739 			vp = partition->pt_vp_array[i];
1740 			if (!vp)
1741 				continue;
1742 
1743 			if (hv_parent_partition())
1744 				mshv_vp_stats_unmap(partition->pt_id, vp->vp_index);
1745 
1746 			if (vp->vp_register_page) {
1747 				(void)hv_call_unmap_vp_state_page(partition->pt_id,
1748 								  vp->vp_index,
1749 								  HV_VP_STATE_PAGE_REGISTERS,
1750 								  input_vtl_zero);
1751 				vp->vp_register_page = NULL;
1752 			}
1753 
1754 			(void)hv_call_unmap_vp_state_page(partition->pt_id,
1755 							  vp->vp_index,
1756 							  HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
1757 							  input_vtl_zero);
1758 			vp->vp_intercept_msg_page = NULL;
1759 
1760 			if (vp->vp_ghcb_page) {
1761 				(void)hv_call_unmap_vp_state_page(partition->pt_id,
1762 								  vp->vp_index,
1763 								  HV_VP_STATE_PAGE_GHCB,
1764 								  input_vtl_normal);
1765 				vp->vp_ghcb_page = NULL;
1766 			}
1767 
1768 			kfree(vp);
1769 
1770 			partition->pt_vp_array[i] = NULL;
1771 		}
1772 
1773 		/* Deallocates and unmaps everything, including vcpus, GPA mappings, etc. */
1774 		hv_call_finalize_partition(partition->pt_id);
1775 
1776 		partition->pt_initialized = false;
1777 	}
1778 
1779 	remove_partition(partition);
1780 
1781 	/* Remove regions, regain access to the memory and unpin the pages */
1782 	hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
1783 				  hnode) {
1784 		hlist_del(&region->hnode);
1785 
1786 		if (mshv_partition_encrypted(partition)) {
1787 			ret = mshv_partition_region_share(region);
1788 			if (ret) {
1789 				pt_err(partition,
1790 				       "Failed to regain access to memory; unpinning user pages will fail and crash the host, error: %d\n",
1791 				       ret);
1792 				return;
1793 			}
1794 		}
1795 
1796 		mshv_region_evict(region);
1797 
1798 		vfree(region);
1799 	}
1800 
1801 	/* Withdraw and free all pages we deposited */
1802 	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
1803 	hv_call_delete_partition(partition->pt_id);
1804 
1805 	mshv_free_routing_table(partition);
1806 	kfree(partition);
1807 }
1808 
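/*
 * Take a reference on a partition. Returns NULL if the refcount has
 * already dropped to zero, i.e. the partition is being destroyed.
 */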
1809 struct
1810 mshv_partition *mshv_partition_get(struct mshv_partition *partition)
1811 {
1812 	if (refcount_inc_not_zero(&partition->pt_ref_count))
1813 		return partition;
1814 	return NULL;
1815 }
1816 
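/*
 * Look up a partition by id in the global hash table. The caller must hold
 * the RCU read lock for the duration of the lookup, e.g. (illustrative
 * pattern only):
 *
 *	rcu_read_lock();
 *	p = mshv_partition_find(id);
 *	if (p)
 *		p = mshv_partition_get(p);
 *	rcu_read_unlock();
 */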
1817 struct
1818 mshv_partition *mshv_partition_find(u64 partition_id)
1819 	__must_hold(RCU)
1820 {
1821 	struct mshv_partition *p;
1822 
1823 	hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode,
1824 				   partition_id)
1825 		if (p->pt_id == partition_id)
1826 			return p;
1827 
1828 	return NULL;
1829 }
1830 
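/*
 * Drop a reference on a partition; the final put tears it down via
 * destroy_partition().
 */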
1831 void
1832 mshv_partition_put(struct mshv_partition *partition)
1833 {
1834 	if (refcount_dec_and_test(&partition->pt_ref_count))
1835 		destroy_partition(partition);
1836 }
1837 
1838 static int
1839 mshv_partition_release(struct inode *inode, struct file *filp)
1840 {
1841 	struct mshv_partition *partition = filp->private_data;
1842 
1843 	mshv_eventfd_release(partition);
1844 
1845 	cleanup_srcu_struct(&partition->pt_irq_srcu);
1846 
1847 	mshv_partition_put(partition);
1848 
1849 	return 0;
1850 }
1851 
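/* Publish the partition in the global hash table, keyed by partition id. */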
1852 static int
1853 add_partition(struct mshv_partition *partition)
1854 {
1855 	spin_lock(&mshv_root.pt_ht_lock);
1856 
1857 	hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode,
1858 		     partition->pt_id);
1859 
1860 	spin_unlock(&mshv_root.pt_ht_lock);
1861 
1862 	return 0;
1863 }
1864 
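/*
 * Handle MSHV_CREATE_PARTITION: validate the userspace arguments, create an
 * EXO partition in the hypervisor, publish it in the global hash table and
 * return a new partition file descriptor.
 *
 * Illustrative userspace sketch (assumes the mshv uapi headers; error
 * handling omitted):
 *
 *	int mshv = open("/dev/mshv", O_RDWR | O_CLOEXEC);
 *	struct mshv_create_partition args = {
 *		.pt_flags = 1ULL << MSHV_PT_BIT_LAPIC,
 *		.pt_isolation = MSHV_PT_ISOLATION_NONE,
 *	};
 *	int pt_fd = ioctl(mshv, MSHV_CREATE_PARTITION, &args);
 */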
1865 static long
1866 mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
1867 {
1868 	struct mshv_create_partition args;
1869 	u64 creation_flags;
1870 	struct hv_partition_creation_properties creation_properties = {};
1871 	union hv_partition_isolation_properties isolation_properties = {};
1872 	struct mshv_partition *partition;
1873 	struct file *file;
1874 	int fd;
1875 	long ret;
1876 
1877 	if (copy_from_user(&args, user_arg, sizeof(args)))
1878 		return -EFAULT;
1879 
1880 	if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
1881 	    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
1882 		return -EINVAL;
1883 
1884 	/* Only support EXO partitions */
1885 	creation_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
1886 			 HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;
1887 
1888 	if (args.pt_flags & BIT(MSHV_PT_BIT_LAPIC))
1889 		creation_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
1890 	if (args.pt_flags & BIT(MSHV_PT_BIT_X2APIC))
1891 		creation_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
1892 	if (args.pt_flags & BIT(MSHV_PT_BIT_GPA_SUPER_PAGES))
1893 		creation_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
1894 
1895 	switch (args.pt_isolation) {
1896 	case MSHV_PT_ISOLATION_NONE:
1897 		isolation_properties.isolation_type =
1898 			HV_PARTITION_ISOLATION_TYPE_NONE;
1899 		break;
1900 	}
1901 
1902 	partition = kzalloc(sizeof(*partition), GFP_KERNEL);
1903 	if (!partition)
1904 		return -ENOMEM;
1905 
1906 	partition->pt_module_dev = module_dev;
1907 	partition->isolation_type = isolation_properties.isolation_type;
1908 
1909 	refcount_set(&partition->pt_ref_count, 1);
1910 
1911 	mutex_init(&partition->pt_mutex);
1912 
1913 	mutex_init(&partition->pt_irq_lock);
1914 
1915 	init_completion(&partition->async_hypercall);
1916 
1917 	INIT_HLIST_HEAD(&partition->irq_ack_notifier_list);
1918 
1919 	INIT_HLIST_HEAD(&partition->pt_devices);
1920 
1921 	INIT_HLIST_HEAD(&partition->pt_mem_regions);
1922 
1923 	mshv_eventfd_init(partition);
1924 
1925 	ret = init_srcu_struct(&partition->pt_irq_srcu);
1926 	if (ret)
1927 		goto free_partition;
1928 
1929 	ret = hv_call_create_partition(creation_flags,
1930 				       creation_properties,
1931 				       isolation_properties,
1932 				       &partition->pt_id);
1933 	if (ret)
1934 		goto cleanup_irq_srcu;
1935 
1936 	ret = add_partition(partition);
1937 	if (ret)
1938 		goto delete_partition;
1939 
1940 	ret = mshv_init_async_handler(partition);
1941 	if (ret)
1942 		goto remove_partition;
1943 
1944 	fd = get_unused_fd_flags(O_CLOEXEC);
1945 	if (fd < 0) {
1946 		ret = fd;
1947 		goto remove_partition;
1948 	}
1949 
1950 	file = anon_inode_getfile("mshv_partition", &mshv_partition_fops,
1951 				  partition, O_RDWR);
1952 	if (IS_ERR(file)) {
1953 		ret = PTR_ERR(file);
1954 		goto put_fd;
1955 	}
1956 
1957 	fd_install(fd, file);
1958 
1959 	return fd;
1960 
1961 put_fd:
1962 	put_unused_fd(fd);
1963 remove_partition:
1964 	remove_partition(partition);
1965 delete_partition:
1966 	hv_call_delete_partition(partition->pt_id);
1967 cleanup_irq_srcu:
1968 	cleanup_srcu_struct(&partition->pt_irq_srcu);
1969 free_partition:
1970 	kfree(partition);
1971 
1972 	return ret;
1973 }
1974 
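/*
 * Top-level ioctl handler for /dev/mshv. Only MSHV_CREATE_PARTITION is
 * handled here; anything else gets -ENOTTY.
 */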
1975 static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
1976 			   unsigned long arg)
1977 {
1978 	struct miscdevice *misc = filp->private_data;
1979 
1980 	switch (ioctl) {
1981 	case MSHV_CREATE_PARTITION:
1982 		return mshv_ioctl_create_partition((void __user *)arg,
1983 						misc->this_device);
1984 	}
1985 
1986 	return -ENOTTY;
1987 }
1988 
1989 static int
1990 mshv_dev_open(struct inode *inode, struct file *filp)
1991 {
1992 	return 0;
1993 }
1994 
1995 static int
1996 mshv_dev_release(struct inode *inode, struct file *filp)
1997 {
1998 	return 0;
1999 }
2000 
2001 static int mshv_cpuhp_online;
2002 static int mshv_root_sched_online;
2003 
2004 static const char *scheduler_type_to_string(enum hv_scheduler_type type)
2005 {
2006 	switch (type) {
2007 	case HV_SCHEDULER_TYPE_LP:
2008 		return "classic scheduler without SMT";
2009 	case HV_SCHEDULER_TYPE_LP_SMT:
2010 		return "classic scheduler with SMT";
2011 	case HV_SCHEDULER_TYPE_CORE_SMT:
2012 		return "core scheduler";
2013 	case HV_SCHEDULER_TYPE_ROOT:
2014 		return "root scheduler";
2015 	default:
2016 		return "unknown scheduler";
2017 	}
2018 }
2019 
2020 /* TODO move this to hv_common.c when needed outside */
2021 static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out)
2022 {
2023 	struct hv_input_get_system_property *input;
2024 	struct hv_output_get_system_property *output;
2025 	unsigned long flags;
2026 	u64 status;
2027 
2028 	local_irq_save(flags);
2029 	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
2030 	output = *this_cpu_ptr(hyperv_pcpu_output_arg);
2031 
2032 	memset(input, 0, sizeof(*input));
2033 	memset(output, 0, sizeof(*output));
2034 	input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE;
2035 
2036 	status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
2037 	if (!hv_result_success(status)) {
2038 		local_irq_restore(flags);
2039 		pr_err("%s: %s\n", __func__, hv_result_to_string(status));
2040 		return hv_result_to_errno(status);
2041 	}
2042 
2043 	*out = output->scheduler_type;
2044 	local_irq_restore(flags);
2045 
2046 	return 0;
2047 }
2048 
2049 /* Retrieve and stash the supported scheduler type */
2050 static int __init mshv_retrieve_scheduler_type(struct device *dev)
2051 {
2052 	int ret = 0;
2053 
2054 	if (hv_l1vh_partition())
2055 		hv_scheduler_type = HV_SCHEDULER_TYPE_CORE_SMT;
2056 	else
2057 		ret = hv_retrieve_scheduler_type(&hv_scheduler_type);
2058 
2059 	if (ret)
2060 		return ret;
2061 
2062 	dev_info(dev, "Hypervisor using %s\n",
2063 		 scheduler_type_to_string(hv_scheduler_type));
2064 
2065 	switch (hv_scheduler_type) {
2066 	case HV_SCHEDULER_TYPE_CORE_SMT:
2067 	case HV_SCHEDULER_TYPE_LP_SMT:
2068 	case HV_SCHEDULER_TYPE_ROOT:
2069 	case HV_SCHEDULER_TYPE_LP:
2070 		/* Supported scheduler, nothing to do */
2071 		break;
2072 	default:
2073 		dev_err(dev, "unsupported scheduler 0x%x, bailing.\n",
2074 			hv_scheduler_type);
2075 		return -EOPNOTSUPP;
2076 	}
2077 
2078 	return 0;
2079 }
2080 
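/*
 * Per-cpu setup for the root scheduler: allocate this CPU's input and
 * output pages used for the scheduling-related hypercalls.
 */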
2081 static int mshv_root_scheduler_init(unsigned int cpu)
2082 {
2083 	void **inputarg, **outputarg, *p;
2084 
2085 	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
2086 	outputarg = (void **)this_cpu_ptr(root_scheduler_output);
2087 
2088 	/* Allocate two consecutive pages. One for input, one for output. */
2089 	p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL);
2090 	if (!p)
2091 		return -ENOMEM;
2092 
2093 	*inputarg = p;
2094 	*outputarg = (char *)p + HV_HYP_PAGE_SIZE;
2095 
2096 	return 0;
2097 }
2098 
2099 static int mshv_root_scheduler_cleanup(unsigned int cpu)
2100 {
2101 	void *p, **inputarg, **outputarg;
2102 
2103 	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
2104 	outputarg = (void **)this_cpu_ptr(root_scheduler_output);
2105 
2106 	p = *inputarg;
2107 
2108 	*inputarg = NULL;
2109 	*outputarg = NULL;
2110 
2111 	kfree(p);
2112 
2113 	return 0;
2114 }
2115 
2116 /* Must be called after retrieving the scheduler type */
2117 static int
2118 root_scheduler_init(struct device *dev)
2119 {
2120 	int ret;
2121 
2122 	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
2123 		return 0;
2124 
2125 	root_scheduler_input = alloc_percpu(void *);
2126 	root_scheduler_output = alloc_percpu(void *);
2127 
2128 	if (!root_scheduler_input || !root_scheduler_output) {
2129 		dev_err(dev, "Failed to allocate root scheduler buffers\n");
2130 		ret = -ENOMEM;
2131 		goto out;
2132 	}
2133 
2134 	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched",
2135 				mshv_root_scheduler_init,
2136 				mshv_root_scheduler_cleanup);
2137 
2138 	if (ret < 0) {
2139 		dev_err(dev, "Failed to setup root scheduler state: %i\n", ret);
2140 		goto out;
2141 	}
2142 
2143 	mshv_root_sched_online = ret;
2144 
2145 	return 0;
2146 
2147 out:
2148 	free_percpu(root_scheduler_input);
2149 	free_percpu(root_scheduler_output);
2150 	return ret;
2151 }
2152 
2153 static void
2154 root_scheduler_deinit(void)
2155 {
2156 	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
2157 		return;
2158 
2159 	cpuhp_remove_state(mshv_root_sched_online);
2160 	free_percpu(root_scheduler_input);
2161 	free_percpu(root_scheduler_output);
2162 }
2163 
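/* Reboot notifier: remove the mshv_synic CPU hotplug state before reboot. */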
2164 static int mshv_reboot_notify(struct notifier_block *nb,
2165 			      unsigned long code, void *unused)
2166 {
2167 	cpuhp_remove_state(mshv_cpuhp_online);
2168 	return 0;
2169 }
2170 
2171 struct notifier_block mshv_reboot_nb = {
2172 	.notifier_call = mshv_reboot_notify,
2173 };
2174 
2175 static void mshv_root_partition_exit(void)
2176 {
2177 	unregister_reboot_notifier(&mshv_reboot_nb);
2178 	root_scheduler_deinit();
2179 }
2180 
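/*
 * Root-partition-only initialization: set up the root scheduler buffers
 * (when the hypervisor uses the root scheduler) and register the reboot
 * notifier.
 */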
2181 static int __init mshv_root_partition_init(struct device *dev)
2182 {
2183 	int err;
2184 
2185 	err = root_scheduler_init(dev);
2186 	if (err)
2187 		return err;
2188 
2189 	err = register_reboot_notifier(&mshv_reboot_nb);
2190 	if (err)
2191 		goto root_sched_deinit;
2192 
2193 	return 0;
2194 
2195 root_sched_deinit:
2196 	root_scheduler_deinit();
2197 	return err;
2198 }
2199 
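/*
 * Module init: register /dev/mshv, allocate the per-cpu SynIC pages and
 * hotplug callbacks, detect the hypervisor scheduler type, do the
 * root-partition-only setup where applicable, start the irqfd workqueue
 * and finally install the mshv ISR.
 */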
2200 static int __init mshv_parent_partition_init(void)
2201 {
2202 	int ret;
2203 	struct device *dev;
2204 	union hv_hypervisor_version_info version_info;
2205 
2206 	if (!hv_parent_partition() || is_kdump_kernel())
2207 		return -ENODEV;
2208 
2209 	if (hv_get_hypervisor_version(&version_info))
2210 		return -ENODEV;
2211 
2212 	ret = misc_register(&mshv_dev);
2213 	if (ret)
2214 		return ret;
2215 
2216 	dev = mshv_dev.this_device;
2217 
2218 	if (version_info.build_number < MSHV_HV_MIN_VERSION ||
2219 	    version_info.build_number > MSHV_HV_MAX_VERSION) {
2220 		dev_err(dev, "Running on unvalidated Hyper-V version\n");
2221 		dev_err(dev, "Versions: current: %u  min: %u  max: %u\n",
2222 			version_info.build_number, MSHV_HV_MIN_VERSION,
2223 			MSHV_HV_MAX_VERSION);
2224 	}
2225 
2226 	mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages);
2227 	if (!mshv_root.synic_pages) {
2228 		dev_err(dev, "Failed to allocate percpu synic page\n");
2229 		ret = -ENOMEM;
2230 		goto device_deregister;
2231 	}
2232 
2233 	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
2234 				mshv_synic_init,
2235 				mshv_synic_cleanup);
2236 	if (ret < 0) {
2237 		dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret);
2238 		goto free_synic_pages;
2239 	}
2240 
2241 	mshv_cpuhp_online = ret;
2242 
2243 	ret = mshv_retrieve_scheduler_type(dev);
2244 	if (ret)
2245 		goto remove_cpu_state;
2246 
2247 	if (hv_root_partition())
2248 		ret = mshv_root_partition_init(dev);
2249 	if (ret)
2250 		goto remove_cpu_state;
2251 
2252 	ret = mshv_irqfd_wq_init();
2253 	if (ret)
2254 		goto exit_partition;
2255 
2256 	spin_lock_init(&mshv_root.pt_ht_lock);
2257 	hash_init(mshv_root.pt_htable);
2258 
2259 	hv_setup_mshv_handler(mshv_isr);
2260 
2261 	return 0;
2262 
2263 exit_partition:
2264 	if (hv_root_partition())
2265 		mshv_root_partition_exit();
2266 remove_cpu_state:
2267 	cpuhp_remove_state(mshv_cpuhp_online);
2268 free_synic_pages:
2269 	free_percpu(mshv_root.synic_pages);
2270 device_deregister:
2271 	misc_deregister(&mshv_dev);
2272 	return ret;
2273 }
2274 
2275 static void __exit mshv_parent_partition_exit(void)
2276 {
2277 	hv_setup_mshv_handler(NULL);
2278 	mshv_port_table_fini();
2279 	misc_deregister(&mshv_dev);
2280 	mshv_irqfd_wq_cleanup();
2281 	if (hv_root_partition())
2282 		mshv_root_partition_exit();
2283 	cpuhp_remove_state(mshv_cpuhp_online);
2284 	free_percpu(mshv_root.synic_pages);
2285 }
2286 
2287 module_init(mshv_parent_partition_init);
2288 module_exit(mshv_parent_partition_exit);
2289