xref: /linux/drivers/hv/mshv_root_main.c (revision 36d6cbb62133fc6eea28f380409e0fb190f3dfbe)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2024, Microsoft Corporation.
4  *
5  * The main part of the mshv_root module, providing APIs to create
6  * and manage guest partitions.
7  *
8  * Authors: Microsoft Linux virtualization team
9  */
10 
11 #include <linux/entry-virt.h>
12 #include <linux/kernel.h>
13 #include <linux/module.h>
14 #include <linux/fs.h>
15 #include <linux/miscdevice.h>
16 #include <linux/slab.h>
17 #include <linux/file.h>
18 #include <linux/anon_inodes.h>
19 #include <linux/mm.h>
20 #include <linux/io.h>
21 #include <linux/cpuhotplug.h>
22 #include <linux/random.h>
23 #include <asm/mshyperv.h>
24 #include <linux/hyperv.h>
25 #include <linux/notifier.h>
26 #include <linux/reboot.h>
27 #include <linux/kexec.h>
28 #include <linux/page-flags.h>
29 #include <linux/crash_dump.h>
30 #include <linux/panic_notifier.h>
31 #include <linux/vmalloc.h>
32 #include <linux/rseq.h>
33 
34 #include "mshv_eventfd.h"
35 #include "mshv.h"
36 #include "mshv_root.h"
37 
38 MODULE_AUTHOR("Microsoft");
39 MODULE_LICENSE("GPL");
40 MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");
41 
/*
 * HV_THREAD_COUNTER: index of the per-VP "root dispatch thread blocked"
 * counter in the hypervisor stats page.  The index is architecture specific.
 */
#if defined(CONFIG_X86_64)
#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 202
#elif defined(CONFIG_ARM64)
#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 95
#endif

/* Global state for the mshv_root module. */
struct mshv_root mshv_root;

/* Scheduler type in use by the hypervisor (root vs. hypervisor scheduler). */
enum hv_scheduler_type hv_scheduler_type;

/*
 * Per-cpu hypercall input/output pages used by HVCALL_DISPATCH_VP.
 * Once we implement the fast extended hypercall ABI they can go away.
 */
static void * __percpu *root_scheduler_input;
static void * __percpu *root_scheduler_output;
56 
57 static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
58 static int mshv_dev_open(struct inode *inode, struct file *filp);
59 static int mshv_dev_release(struct inode *inode, struct file *filp);
60 static int mshv_vp_release(struct inode *inode, struct file *filp);
61 static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
62 static int mshv_partition_release(struct inode *inode, struct file *filp);
63 static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
64 static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma);
65 static vm_fault_t mshv_vp_fault(struct vm_fault *vmf);
66 static int mshv_init_async_handler(struct mshv_partition *partition);
67 static void mshv_async_hvcall_handler(void *data, u64 *status);
68 
/* Input VTL value selecting VTL0 (all fields zero). */
static const union hv_input_vtl input_vtl_zero;
/* Input VTL value explicitly targeting the normal VTL. */
static const union hv_input_vtl input_vtl_normal = {
	.target_vtl = HV_NORMAL_VTL,
	.use_target_vtl = 1,
};

/* vm_operations for mmap'ed VP state pages; faults served by mshv_vp_fault() */
static const struct vm_operations_struct mshv_vp_vm_ops = {
	.fault = mshv_vp_fault,
};

/* File operations for VP file descriptors. */
static const struct file_operations mshv_vp_fops = {
	.owner = THIS_MODULE,
	.release = mshv_vp_release,
	.unlocked_ioctl = mshv_vp_ioctl,
	.llseek = noop_llseek,
	.mmap = mshv_vp_mmap,
};

/* File operations for partition file descriptors. */
static const struct file_operations mshv_partition_fops = {
	.owner = THIS_MODULE,
	.release = mshv_partition_release,
	.unlocked_ioctl = mshv_partition_ioctl,
	.llseek = noop_llseek,
};

/* File operations for the top-level /dev/mshv device node. */
static const struct file_operations mshv_dev_fops = {
	.owner = THIS_MODULE,
	.open = mshv_dev_open,
	.release = mshv_dev_release,
	.unlocked_ioctl = mshv_dev_ioctl,
	.llseek = noop_llseek,
};

/* /dev/mshv misc device; mode 0600 restricts access to root. */
static struct miscdevice mshv_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "mshv",
	.fops = &mshv_dev_fops,
	.mode = 0600,
};
108 
/*
 * Only allow hypercalls that have a u64 partition id as the first member of
 * the input structure.
 * These are sorted by value.
 */
static u16 mshv_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_GET_PARTITION_PROPERTY_EX,
	HVCALL_SET_PARTITION_PROPERTY,
	HVCALL_INSTALL_INTERCEPT,
	HVCALL_GET_VP_REGISTERS,
	HVCALL_SET_VP_REGISTERS,
	HVCALL_TRANSLATE_VIRTUAL_ADDRESS,
	HVCALL_CLEAR_VIRTUAL_INTERRUPT,
	HVCALL_SCRUB_PARTITION,
	HVCALL_REGISTER_INTERCEPT_RESULT,
	HVCALL_ASSERT_VIRTUAL_INTERRUPT,
	HVCALL_GET_GPA_PAGES_ACCESS_STATES,
	HVCALL_SIGNAL_EVENT_DIRECT,
	HVCALL_POST_MESSAGE_DIRECT,
	HVCALL_GET_VP_CPUID_VALUES,
};

/*
 * Only allow hypercalls that are safe to be called by the VMM with the host
 * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a
 * hypercall cannot be misused by the VMM before adding it to this list.
 */
static u16 mshv_self_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_GET_PARTITION_PROPERTY_EX,
};
141 
142 static bool mshv_hvcall_is_async(u16 code)
143 {
144 	switch (code) {
145 	case HVCALL_SET_PARTITION_PROPERTY:
146 		return true;
147 	default:
148 		break;
149 	}
150 	return false;
151 }
152 
153 static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id)
154 {
155 	int i;
156 	int n = ARRAY_SIZE(mshv_passthru_hvcalls);
157 	u16 *allowed_hvcalls = mshv_passthru_hvcalls;
158 
159 	if (pt_id == HV_PARTITION_ID_SELF) {
160 		n = ARRAY_SIZE(mshv_self_passthru_hvcalls);
161 		allowed_hvcalls = mshv_self_passthru_hvcalls;
162 	}
163 
164 	for (i = 0; i < n; ++i)
165 		if (allowed_hvcalls[i] == code)
166 			return true;
167 
168 	return false;
169 }
170 
/*
 * Pass a hypercall from the VMM through to the hypervisor.
 *
 * @partition:        target partition, or NULL in which case the host
 *                    partition (HV_PARTITION_ID_SELF) is targeted.
 * @partition_locked: true if the caller holds the partition lock; async
 *                    hypercalls are only permitted in that case.
 * @user_args:        userspace pointer to struct mshv_root_hvcall.
 *
 * The final hypervisor status and rep count are always copied back to
 * @user_args; on hypervisor failure a matching negative errno is returned.
 */
static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
				      bool partition_locked,
				      void __user *user_args)
{
	u64 status;
	int ret = 0;
	bool is_async;
	struct mshv_root_hvcall args;
	struct page *page;
	unsigned int pages_order;
	void *input_pg = NULL;
	void *output_pg = NULL;
	u16 reps_completed;
	u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	/* Input must exist, fit in one page, and start with a u64 pt id. */
	if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) ||
	    mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE)
		return -EINVAL;

	if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
		return -EINVAL;

	if (!mshv_passthru_hvcall_allowed(args.code, pt_id))
		return -EINVAL;

	is_async = mshv_hvcall_is_async(args.code);
	if (is_async) {
		/* async hypercalls can only be called from partition fd */
		if (!partition || !partition_locked)
			return -EINVAL;
		ret = mshv_init_async_handler(partition);
		if (ret)
			return ret;
	}

	/* Two contiguous pages when output is requested, one otherwise. */
	pages_order = args.out_ptr ? 1 : 0;
	page = alloc_pages(GFP_KERNEL, pages_order);
	if (!page)
		return -ENOMEM;
	input_pg = page_address(page);

	if (args.out_ptr)
		output_pg = (char *)input_pg + PAGE_SIZE;
	else
		output_pg = NULL;

	if (copy_from_user(input_pg, (void __user *)args.in_ptr,
			   args.in_sz)) {
		ret = -EFAULT;
		goto free_pages_out;
	}

	/*
	 * NOTE: This only works because all the allowed hypercalls' input
	 * structs begin with a u64 partition_id field.
	 */
	*(u64 *)input_pg = pt_id;

	reps_completed = 0;
	do {
		if (args.reps) {
			status = hv_do_rep_hypercall_ex(args.code, args.reps,
							0, reps_completed,
							input_pg, output_pg);
			reps_completed = hv_repcomp(status);
		} else {
			status = hv_do_hypercall(args.code, input_pg, output_pg);
		}

		if (hv_result(status) == HV_STATUS_CALL_PENDING) {
			if (is_async) {
				mshv_async_hvcall_handler(partition, &status);
			} else { /* Paranoia check. This shouldn't happen! */
				ret = -EBADFD;
				goto free_pages_out;
			}
		}

		if (hv_result_success(status))
			break;

		/*
		 * Retry after depositing memory when the hypervisor ran out;
		 * any other failure is final.
		 */
		if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY)
			ret = hv_result_to_errno(status);
		else
			ret = hv_call_deposit_pages(NUMA_NO_NODE,
						    pt_id, 1);
	} while (!ret);

	/* Report final hypervisor status and rep count back to the VMM. */
	args.status = hv_result(status);
	args.reps = reps_completed;
	if (copy_to_user(user_args, &args, sizeof(args)))
		ret = -EFAULT;

	if (!ret && output_pg &&
	    copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
		ret = -EFAULT;

free_pages_out:
	free_pages((unsigned long)input_pg, pages_order);

	return ret;
}
276 
277 static inline bool is_ghcb_mapping_available(void)
278 {
279 #if IS_ENABLED(CONFIG_X86_64)
280 	return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE;
281 #else
282 	return 0;
283 #endif
284 }
285 
/* Read @count VP registers at VTL0; results are written into @registers. */
static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_get_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}
292 
/* Write @count VP registers at VTL0 from @registers. */
static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_set_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}
299 
/*
 * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by
 * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend,
 * done by the hypervisor.
 * "Intercept" suspend leads to asynchronous message delivery to dom0 which
 * should be awaited to keep the VP loop consistent (i.e. no message pending
 * upon VP resume).
 * VP intercept suspend can't be done when the VP is explicitly suspended
 * already, and thus can be only two possible race scenarios:
 *   1. implicit suspend bit set -> explicit suspend bit set -> message sent
 *   2. implicit suspend bit set -> message sent -> explicit suspend bit set
 * Checking for implicit suspend bit set after explicit suspend request has
 * succeeded in either case allows us to reliably identify, if there is a
 * message to receive and deliver to VMM.
 *
 * On success, *message_in_flight tells the caller whether an intercept
 * message is still pending delivery and must be awaited.
 */
static int
mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
{
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND
	};
	struct hv_register_assoc intercept_suspend = {
		.name = HV_REGISTER_INTERCEPT_SUSPEND
	};
	union hv_explicit_suspend_register *es =
		&explicit_suspend.value.explicit_suspend;
	union hv_intercept_suspend_register *is =
		&intercept_suspend.value.intercept_suspend;
	int ret;

	es->suspended = 1;

	/* Request the explicit suspend first ... */
	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &explicit_suspend);
	if (ret) {
		vp_err(vp, "Failed to explicitly suspend vCPU\n");
		return ret;
	}

	/* ... then read the intercept suspend bit (see comment above). */
	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &intercept_suspend);
	if (ret) {
		vp_err(vp, "Failed to get intercept suspend state\n");
		return ret;
	}

	*message_in_flight = is->suspended;

	return 0;
}
350 
/*
 * This function is used when VPs are scheduled by the hypervisor's
 * scheduler.
 *
 * Caller has to make sure the registers contain cleared
 * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers
 * exactly in this order (the hypervisor clears them sequentially) to avoid
 * potential invalid clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND
 * after VP is released from HV_REGISTER_EXPLICIT_SUSPEND in case of the
 * opposite order.
 */
static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
{
	long ret;
	/* Order matters here - see the comment above. */
	struct hv_register_assoc suspend_regs[2] = {
			{ .name = HV_REGISTER_INTERCEPT_SUSPEND },
			{ .name = HV_REGISTER_EXPLICIT_SUSPEND }
	};
	size_t count = ARRAY_SIZE(suspend_regs);

	/* Resume VP execution */
	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    count, suspend_regs);
	if (ret) {
		vp_err(vp, "Failed to resume vp execution. %lx\n", ret);
		return ret;
	}

	/* Sleep until the hypervisor kicks us (intercept delivered). */
	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       vp->run.kicked_by_hv == 1);
	if (ret) {
		bool message_in_flight;

		/*
		 * Otherwise the waiting was interrupted by a signal: suspend
		 * the vCPU explicitly and copy message in flight (if any).
		 */
		ret = mshv_suspend_vp(vp, &message_in_flight);
		if (ret)
			return ret;

		/* Return if no message in flight */
		if (!message_in_flight)
			return -EINTR;

		/* Wait for the message in flight. */
		wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1);
	}

	/*
	 * Reset the flag to make the wait_event call above work
	 * next time.
	 */
	vp->run.kicked_by_hv = 0;

	return 0;
}
408 
/*
 * Dispatch (run) a VP once via HVCALL_DISPATCH_VP.
 *
 * Preemption is disabled while the per-cpu hypercall input/output pages are
 * in use so this CPU's pages cannot be reused by another task mid-call.
 * run.flags.root_sched_dispatched is set for the duration of the hypercall
 * so other paths can tell the VP is currently dispatched.
 *
 * Returns 0 on success, negative errno mapped from the hypervisor status.
 */
static int
mshv_vp_dispatch(struct mshv_vp *vp, u32 flags,
		 struct hv_output_dispatch_vp *res)
{
	struct hv_input_dispatch_vp *input;
	struct hv_output_dispatch_vp *output;
	u64 status;

	preempt_disable();
	input = *this_cpu_ptr(root_scheduler_input);
	output = *this_cpu_ptr(root_scheduler_output);

	memset(input, 0, sizeof(*input));
	memset(output, 0, sizeof(*output));

	input->partition_id = vp->vp_partition->pt_id;
	input->vp_index = vp->vp_index;
	input->time_slice = 0; /* Run forever until something happens */
	input->spec_ctrl = 0; /* TODO: set sensible flags */
	input->flags = flags;

	vp->run.flags.root_sched_dispatched = 1;
	status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output);
	vp->run.flags.root_sched_dispatched = 0;

	/* Copy the output out before releasing the per-cpu pages. */
	*res = *output;
	preempt_enable();

	if (!hv_result_success(status))
		vp_err(vp, "%s: status %s\n", __func__,
		       hv_result_to_string(status));

	return hv_result_to_errno(status);
}
443 
444 static int
445 mshv_vp_clear_explicit_suspend(struct mshv_vp *vp)
446 {
447 	struct hv_register_assoc explicit_suspend = {
448 		.name = HV_REGISTER_EXPLICIT_SUSPEND,
449 		.value.explicit_suspend.suspended = 0,
450 	};
451 	int ret;
452 
453 	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
454 				    1, &explicit_suspend);
455 
456 	if (ret)
457 		vp_err(vp, "Failed to unsuspend\n");
458 
459 	return ret;
460 }
461 
#if IS_ENABLED(CONFIG_X86_64)
/*
 * Nonzero if the VP's register page advertises pending interrupt vectors.
 * Returns 0 when no register page is mapped (e.g. encrypted partitions,
 * which never map one - see mshv_partition_ioctl_create_vp()).
 */
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	if (!vp->vp_register_page)
		return 0;
	return vp->vp_register_page->interrupt_vectors.as_uint64;
}
#else
/* No register-page interrupt information on non-x86_64 architectures. */
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	return 0;
}
#endif
475 
476 static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp)
477 {
478 	struct hv_stats_page **stats = vp->vp_stats_pages;
479 	u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->data;
480 	u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->data;
481 
482 	return parent_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED] ||
483 	       self_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED];
484 }
485 
/*
 * Sleep until the hypervisor kicks this VP (and its dispatch thread is no
 * longer accounted as blocked), or an interrupt becomes pending for it.
 *
 * Returns 0 when woken by the hypervisor, -EINTR if interrupted by a signal.
 */
static int
mshv_vp_wait_for_hv_kick(struct mshv_vp *vp)
{
	int ret;

	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       (vp->run.kicked_by_hv == 1 &&
					!mshv_vp_dispatch_thread_blocked(vp)) ||
				       mshv_vp_interrupt_pending(vp));
	if (ret)
		return -EINTR;

	/* Consume the kick and clear the blocked state for the next wait. */
	vp->run.flags.root_sched_blocked = 0;
	vp->run.kicked_by_hv = 0;

	return 0;
}
503 
/*
 * Run a VP when dom0 (the root partition) drives scheduling itself:
 * repeatedly dispatch the VP until an intercept must be delivered to the
 * VMM, waiting for hypervisor kicks whenever the VP blocks.
 *
 * Must be called with interrupts enabled.
 */
static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
{
	long ret;

	if (vp->run.flags.root_sched_blocked) {
		/*
		 * Dispatch state of this VP is blocked. Need to wait
		 * for the hypervisor to clear the blocked state before
		 * dispatching it.
		 */
		ret = mshv_vp_wait_for_hv_kick(vp);
		if (ret)
			return ret;
	}

	do {
		u32 flags = 0;
		struct hv_output_dispatch_vp output;

		/* Handle signals/resched requests before (re)entering guest */
		if (__xfer_to_guest_mode_work_pending()) {
			ret = xfer_to_guest_mode_handle_work();
			if (ret)
				break;
		}

		if (vp->run.flags.intercept_suspend)
			flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND;

		if (mshv_vp_interrupt_pending(vp))
			flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION;

		ret = mshv_vp_dispatch(vp, flags, &output);
		if (ret)
			break;

		vp->run.flags.intercept_suspend = 0;

		if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) {
			if (output.dispatch_event ==
						HV_VP_DISPATCH_EVENT_SUSPEND) {
				/*
				 * TODO: remove the warning once VP canceling
				 *	 is supported
				 */
				WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count),
					  "%s: vp#%d: unexpected explicit suspend\n",
					  __func__, vp->vp_index);
				/*
				 * Need to clear explicit suspend before
				 * dispatching.
				 * Explicit suspend is either:
				 * - set right after the first VP dispatch or
				 * - set explicitly via hypercall
				 * Since the latter case is not yet supported,
				 * simply clear it here.
				 */
				ret = mshv_vp_clear_explicit_suspend(vp);
				if (ret)
					break;

				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			} else {
				vp->run.flags.root_sched_blocked = 1;
				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			}
		} else {
			/* HV_VP_DISPATCH_STATE_READY */
			if (output.dispatch_event ==
						HV_VP_DISPATCH_EVENT_INTERCEPT)
				vp->run.flags.intercept_suspend = 1;
		}
	} while (!vp->run.flags.intercept_suspend);

	rseq_virt_userspace_exit();

	return ret;
}
586 
587 static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
588 	      "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");
589 
590 static struct mshv_mem_region *
591 mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
592 {
593 	struct mshv_mem_region *region;
594 
595 	hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
596 		if (gfn >= region->start_gfn &&
597 		    gfn < region->start_gfn + region->nr_pages)
598 			return region;
599 	}
600 
601 	return NULL;
602 }
603 
604 static struct mshv_mem_region *
605 mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
606 {
607 	struct mshv_mem_region *region;
608 
609 	spin_lock(&p->pt_mem_regions_lock);
610 	region = mshv_partition_region_by_gfn(p, gfn);
611 	if (!region || !mshv_region_get(region)) {
612 		spin_unlock(&p->pt_mem_regions_lock);
613 		return NULL;
614 	}
615 	spin_unlock(&p->pt_mem_regions_lock);
616 
617 	return region;
618 }
619 
/**
 * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts.
 * @vp: Pointer to the virtual processor structure.
 *
 * This function processes GPA intercepts by identifying the memory region
 * corresponding to the intercepted GPA, aligning the page offset, and
 * mapping the required pages. It ensures that the region is valid and
 * handles faults efficiently by mapping multiple pages at once.
 *
 * Return: true if the intercept was handled successfully, false otherwise.
 */
static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
{
	struct mshv_partition *p = vp->vp_partition;
	struct mshv_mem_region *region;
	bool ret;
	u64 gfn;
	/* The intercept message payload layout is architecture specific. */
#if defined(CONFIG_X86_64)
	struct hv_x64_memory_intercept_message *msg =
		(struct hv_x64_memory_intercept_message *)
		vp->vp_intercept_msg_page->u.payload;
#elif defined(CONFIG_ARM64)
	struct hv_arm64_memory_intercept_message *msg =
		(struct hv_arm64_memory_intercept_message *)
		vp->vp_intercept_msg_page->u.payload;
#endif

	gfn = HVPFN_DOWN(msg->guest_physical_address);

	/* Takes a reference on the region; dropped below. */
	region = mshv_partition_region_by_gfn_get(p, gfn);
	if (!region)
		return false;

	/* Only movable memory ranges are supported for GPA intercepts */
	if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE)
		ret = mshv_region_handle_gfn_fault(region, gfn);
	else
		ret = false;

	mshv_region_put(region);

	return ret;
}
663 
664 static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
665 {
666 	switch (vp->vp_intercept_msg_page->header.message_type) {
667 	case HVMSG_GPA_INTERCEPT:
668 		return mshv_handle_gpa_intercept(vp);
669 	}
670 	return false;
671 }
672 
673 static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
674 {
675 	long rc;
676 
677 	do {
678 		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
679 			rc = mshv_run_vp_with_root_scheduler(vp);
680 		else
681 			rc = mshv_run_vp_with_hyp_scheduler(vp);
682 	} while (rc == 0 && mshv_vp_handle_intercept(vp));
683 
684 	if (rc)
685 		return rc;
686 
687 	if (copy_to_user(ret_msg, vp->vp_intercept_msg_page,
688 			 sizeof(struct hv_message)))
689 		rc = -EFAULT;
690 
691 	return rc;
692 }
693 
/*
 * Transfer VP state via pinned user pages: the hypervisor reads from or
 * writes to the userspace buffer directly, so every page must be pinned
 * for the duration of the call.
 *
 * @user_pfn:   first page frame of the page-aligned userspace buffer.
 * @page_count: number of pages in the buffer.
 * @is_set:     true to set state from the buffer, false to get state into it.
 *
 * Returns 0 on success, negative errno on failure.
 */
static int
mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp,
				struct hv_vp_state_data state_data,
				unsigned long user_pfn, size_t page_count,
				bool is_set)
{
	int completed, ret = 0;
	unsigned long check;
	struct page **pages;

	if (page_count > INT_MAX)
		return -EINVAL;
	/*
	 * Check the arithmetic for wraparound/overflow.
	 * The last page address in the buffer is:
	 * (user_pfn + (page_count - 1)) * PAGE_SIZE
	 */
	if (check_add_overflow(user_pfn, (page_count - 1), &check))
		return -EOVERFLOW;
	if (check_mul_overflow(check, PAGE_SIZE, &check))
		return -EOVERFLOW;

	/* Pin user pages so hypervisor can copy directly to them */
	pages = kcalloc(page_count, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/* pin_user_pages_fast() may pin fewer pages than requested per call */
	for (completed = 0; completed < page_count; completed += ret) {
		unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE;
		int remaining = page_count - completed;

		ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE,
					  &pages[completed]);
		if (ret < 0) {
			vp_err(vp, "%s: Failed to pin user pages error %i\n",
			       __func__, ret);
			goto unpin_pages;
		}
	}

	if (is_set)
		ret = hv_call_set_vp_state(vp->vp_index,
					   vp->vp_partition->pt_id,
					   state_data, page_count, pages,
					   0, NULL);
	else
		ret = hv_call_get_vp_state(vp->vp_index,
					   vp->vp_partition->pt_id,
					   state_data, page_count, pages,
					   NULL);

unpin_pages:
	/* 'completed' counts only pages actually pinned so far */
	unpin_user_pages(pages, completed);
	kfree(pages);
	return ret;
}
750 
/*
 * MSHV_GET_VP_STATE / MSHV_SET_VP_STATE: transfer a piece of VP state
 * (LAPIC, XSAVE, SIMP/SIEFP pages, synthetic timers) between the
 * hypervisor and a userspace buffer.
 *
 * The required buffer size is always written back to user_args->buf_sz,
 * even when the supplied buffer is too small, so userspace can learn the
 * size it must provide.
 *
 * Returns 0 on success, negative errno on failure.
 */
static long
mshv_vp_ioctl_get_set_state(struct mshv_vp *vp,
			    struct mshv_get_set_vp_state __user *user_args,
			    bool is_set)
{
	struct mshv_get_set_vp_state args;
	long ret = 0;
	union hv_output_get_vp_state vp_state;
	u32 data_sz;
	struct hv_vp_state_data state_data = {};

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	/* Buffer must exist and be page aligned in both address and size. */
	if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) ||
	    !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) ||
	    !PAGE_ALIGNED(args.buf_ptr))
		return -EINVAL;

	if (!access_ok((void __user *)args.buf_ptr, args.buf_sz))
		return -EFAULT;

	/* Translate the uAPI state type into the hypervisor's encoding. */
	switch (args.type) {
	case MSHV_VP_STATE_LAPIC:
		state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_XSAVE:
	{
		u64 data_sz_64;

		/* XSAVE size and feature set are partition properties. */
		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_XSAVE_STATES,
						     &state_data.xsave.states.as_uint64);
		if (ret)
			return ret;

		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE,
						     &data_sz_64);
		if (ret)
			return ret;

		data_sz = (u32)data_sz_64;
		state_data.xsave.flags = 0;
		/* Always request legacy states */
		state_data.xsave.states.legacy_x87 = 1;
		state_data.xsave.states.legacy_sse = 1;
		state_data.type = HV_GET_SET_VP_STATE_XSAVE;
		break;
	}
	case MSHV_VP_STATE_SIMP:
		state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SIEFP:
		state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SYNTHETIC_TIMERS:
		state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS;
		data_sz = sizeof(vp_state.synthetic_timers_state);
		break;
	default:
		return -EINVAL;
	}

	/* Report required size back before validating the supplied size. */
	if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz)))
		return -EFAULT;

	if (data_sz > args.buf_sz)
		return -EINVAL;

	/* If the data is transmitted via pfns, delegate to helper */
	if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) {
		unsigned long user_pfn = PFN_DOWN(args.buf_ptr);
		size_t page_count = PFN_DOWN(args.buf_sz);

		return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn,
						       page_count, is_set);
	}

	/* Paranoia check - this shouldn't happen! */
	if (data_sz > sizeof(vp_state)) {
		vp_err(vp, "Invalid vp state data size!\n");
		return -EINVAL;
	}

	if (is_set) {
		if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz))
			return -EFAULT;

		return hv_call_set_vp_state(vp->vp_index,
					    vp->vp_partition->pt_id,
					    state_data, 0, NULL,
					    sizeof(vp_state), (u8 *)&vp_state);
	}

	ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id,
				   state_data, 0, NULL, &vp_state);
	if (ret)
		return ret;

	if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz))
		return -EFAULT;

	return 0;
}
859 
860 static long
861 mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
862 {
863 	struct mshv_vp *vp = filp->private_data;
864 	long r = -ENOTTY;
865 
866 	if (mutex_lock_killable(&vp->vp_mutex))
867 		return -EINTR;
868 
869 	switch (ioctl) {
870 	case MSHV_RUN_VP:
871 		r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg);
872 		break;
873 	case MSHV_GET_VP_STATE:
874 		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false);
875 		break;
876 	case MSHV_SET_VP_STATE:
877 		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true);
878 		break;
879 	case MSHV_ROOT_HVCALL:
880 		r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false,
881 					       (void __user *)arg);
882 		break;
883 	default:
884 		vp_warn(vp, "Invalid ioctl: %#x\n", ioctl);
885 		break;
886 	}
887 	mutex_unlock(&vp->vp_mutex);
888 
889 	return r;
890 }
891 
892 static vm_fault_t mshv_vp_fault(struct vm_fault *vmf)
893 {
894 	struct mshv_vp *vp = vmf->vma->vm_file->private_data;
895 
896 	switch (vmf->vma->vm_pgoff) {
897 	case MSHV_VP_MMAP_OFFSET_REGISTERS:
898 		vmf->page = virt_to_page(vp->vp_register_page);
899 		break;
900 	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
901 		vmf->page = virt_to_page(vp->vp_intercept_msg_page);
902 		break;
903 	case MSHV_VP_MMAP_OFFSET_GHCB:
904 		vmf->page = virt_to_page(vp->vp_ghcb_page);
905 		break;
906 	default:
907 		return VM_FAULT_SIGBUS;
908 	}
909 
910 	get_page(vmf->page);
911 
912 	return 0;
913 }
914 
915 static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma)
916 {
917 	struct mshv_vp *vp = file->private_data;
918 
919 	switch (vma->vm_pgoff) {
920 	case MSHV_VP_MMAP_OFFSET_REGISTERS:
921 		if (!vp->vp_register_page)
922 			return -ENODEV;
923 		break;
924 	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
925 		if (!vp->vp_intercept_msg_page)
926 			return -ENODEV;
927 		break;
928 	case MSHV_VP_MMAP_OFFSET_GHCB:
929 		if (!vp->vp_ghcb_page)
930 			return -ENODEV;
931 		break;
932 	default:
933 		return -EINVAL;
934 	}
935 
936 	vma->vm_ops = &mshv_vp_vm_ops;
937 	return 0;
938 }
939 
/*
 * Release callback for a VP file descriptor: drop the reference this VP
 * holds on its partition.
 */
static int
mshv_vp_release(struct inode *inode, struct file *filp)
{
	struct mshv_vp *vp = filp->private_data;

	/* Rest of VP cleanup happens in destroy_partition() */
	mshv_partition_put(vp->vp_partition);
	return 0;
}
949 
/*
 * Unmap a VP's hypervisor statistics pages.  The parent page is only
 * unmapped when it is distinct from the self page: mshv_vp_stats_map()
 * aliases the two when the parent area is unavailable, and unmapping the
 * same page twice must be avoided.
 */
void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
			 struct hv_stats_page *stats_pages[])
{
	union hv_stats_object_identity identity = {
		.vp.partition_id = partition_id,
		.vp.vp_index = vp_index,
	};
	int err;

	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
				  stats_pages[HV_STATS_AREA_SELF],
				  &identity);
	if (err)
		pr_err("%s: failed to unmap partition %llu vp %u self stats, err: %d\n",
		       __func__, partition_id, vp_index, err);

	if (stats_pages[HV_STATS_AREA_PARENT] != stats_pages[HV_STATS_AREA_SELF]) {
		identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
		err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
					  stats_pages[HV_STATS_AREA_PARENT],
					  &identity);
		if (err)
			pr_err("%s: failed to unmap partition %llu vp %u parent stats, err: %d\n",
			       __func__, partition_id, vp_index, err);
	}
}
977 
/*
 * Map a VP's hypervisor statistics pages (self and parent areas) into
 * @stats_pages.  When the parent area is unavailable (L1VH, or the
 * hypervisor returns no page), the self page is aliased into the parent
 * slot; mshv_vp_stats_unmap() relies on this aliasing.
 *
 * Returns 0 on success, negative errno on failure (the self page is
 * unmapped again on the error path).
 */
int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
		      struct hv_stats_page *stats_pages[])
{
	union hv_stats_object_identity identity = {
		.vp.partition_id = partition_id,
		.vp.vp_index = vp_index,
	};
	int err;

	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
				&stats_pages[HV_STATS_AREA_SELF]);
	if (err) {
		pr_err("%s: failed to map partition %llu vp %u self stats, err: %d\n",
		       __func__, partition_id, vp_index, err);
		return err;
	}

	/*
	 * L1VH partition cannot access its vp stats in parent area.
	 */
	if (is_l1vh_parent(partition_id)) {
		stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
	} else {
		identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
		err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
					&stats_pages[HV_STATS_AREA_PARENT]);
		if (err) {
			pr_err("%s: failed to map partition %llu vp %u parent stats, err: %d\n",
			       __func__, partition_id, vp_index, err);
			goto unmap_self;
		}
		/* No parent page provided: fall back to the self page. */
		if (!stats_pages[HV_STATS_AREA_PARENT])
			stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
	}

	return 0;

unmap_self:
	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	hv_unmap_stats_page(HV_STATS_OBJECT_VP,
			    stats_pages[HV_STATS_AREA_SELF],
			    &identity);
	return err;
}
1023 
1024 static long
1025 mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
1026 			       void __user *arg)
1027 {
1028 	struct mshv_create_vp args;
1029 	struct mshv_vp *vp;
1030 	struct page *intercept_msg_page, *register_page, *ghcb_page;
1031 	struct hv_stats_page *stats_pages[2];
1032 	long ret;
1033 
1034 	if (copy_from_user(&args, arg, sizeof(args)))
1035 		return -EFAULT;
1036 
1037 	if (args.vp_index >= MSHV_MAX_VPS)
1038 		return -EINVAL;
1039 
1040 	if (partition->pt_vp_array[args.vp_index])
1041 		return -EEXIST;
1042 
1043 	ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index,
1044 				0 /* Only valid for root partition VPs */);
1045 	if (ret)
1046 		return ret;
1047 
1048 	ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
1049 				   HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
1050 				   input_vtl_zero, &intercept_msg_page);
1051 	if (ret)
1052 		goto destroy_vp;
1053 
1054 	if (!mshv_partition_encrypted(partition)) {
1055 		ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
1056 					   HV_VP_STATE_PAGE_REGISTERS,
1057 					   input_vtl_zero, &register_page);
1058 		if (ret)
1059 			goto unmap_intercept_message_page;
1060 	}
1061 
1062 	if (mshv_partition_encrypted(partition) &&
1063 	    is_ghcb_mapping_available()) {
1064 		ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
1065 					   HV_VP_STATE_PAGE_GHCB,
1066 					   input_vtl_normal, &ghcb_page);
1067 		if (ret)
1068 			goto unmap_register_page;
1069 	}
1070 
1071 	ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
1072 				stats_pages);
1073 	if (ret)
1074 		goto unmap_ghcb_page;
1075 
1076 	vp = kzalloc(sizeof(*vp), GFP_KERNEL);
1077 	if (!vp)
1078 		goto unmap_stats_pages;
1079 
1080 	vp->vp_partition = mshv_partition_get(partition);
1081 	if (!vp->vp_partition) {
1082 		ret = -EBADF;
1083 		goto free_vp;
1084 	}
1085 
1086 	mutex_init(&vp->vp_mutex);
1087 	init_waitqueue_head(&vp->run.vp_suspend_queue);
1088 	atomic64_set(&vp->run.vp_signaled_count, 0);
1089 
1090 	vp->vp_index = args.vp_index;
1091 	vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page);
1092 	if (!mshv_partition_encrypted(partition))
1093 		vp->vp_register_page = page_to_virt(register_page);
1094 
1095 	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
1096 		vp->vp_ghcb_page = page_to_virt(ghcb_page);
1097 
1098 	memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));
1099 
1100 	ret = mshv_debugfs_vp_create(vp);
1101 	if (ret)
1102 		goto put_partition;
1103 
1104 	/*
1105 	 * Keep anon_inode_getfd last: it installs fd in the file struct and
1106 	 * thus makes the state accessible in user space.
1107 	 */
1108 	ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp,
1109 			       O_RDWR | O_CLOEXEC);
1110 	if (ret < 0)
1111 		goto remove_debugfs_vp;
1112 
1113 	/* already exclusive with the partition mutex for all ioctls */
1114 	partition->pt_vp_count++;
1115 	partition->pt_vp_array[args.vp_index] = vp;
1116 
1117 	return ret;
1118 
1119 remove_debugfs_vp:
1120 	mshv_debugfs_vp_remove(vp);
1121 put_partition:
1122 	mshv_partition_put(partition);
1123 free_vp:
1124 	kfree(vp);
1125 unmap_stats_pages:
1126 	mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages);
1127 unmap_ghcb_page:
1128 	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
1129 		hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
1130 				       HV_VP_STATE_PAGE_GHCB, ghcb_page,
1131 				       input_vtl_normal);
1132 unmap_register_page:
1133 	if (!mshv_partition_encrypted(partition))
1134 		hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
1135 				       HV_VP_STATE_PAGE_REGISTERS,
1136 				       register_page, input_vtl_zero);
1137 unmap_intercept_message_page:
1138 	hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
1139 			       HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
1140 			       intercept_msg_page, input_vtl_zero);
1141 destroy_vp:
1142 	hv_call_delete_vp(partition->pt_id, args.vp_index);
1143 	return ret;
1144 }
1145 
1146 static int mshv_init_async_handler(struct mshv_partition *partition)
1147 {
1148 	if (completion_done(&partition->async_hypercall)) {
1149 		pt_err(partition,
1150 		       "Cannot issue async hypercall while another one in progress!\n");
1151 		return -EPERM;
1152 	}
1153 
1154 	reinit_completion(&partition->async_hypercall);
1155 	return 0;
1156 }
1157 
1158 static void mshv_async_hvcall_handler(void *data, u64 *status)
1159 {
1160 	struct mshv_partition *partition = data;
1161 
1162 	wait_for_completion(&partition->async_hypercall);
1163 	pt_dbg(partition, "Async hypercall completed!\n");
1164 
1165 	*status = partition->async_hypercall_status;
1166 }
1167 
1168 /*
1169  * NB: caller checks and makes sure mem->size is page aligned
1170  * Returns: 0 with regionpp updated on success, or -errno
1171  */
1172 static int mshv_partition_create_region(struct mshv_partition *partition,
1173 					struct mshv_user_mem_region *mem,
1174 					struct mshv_mem_region **regionpp,
1175 					bool is_mmio)
1176 {
1177 	struct mshv_mem_region *rg;
1178 	u64 nr_pages = HVPFN_DOWN(mem->size);
1179 
1180 	/* Reject overlapping regions */
1181 	spin_lock(&partition->pt_mem_regions_lock);
1182 	hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) {
1183 		if (mem->guest_pfn + nr_pages <= rg->start_gfn ||
1184 		    rg->start_gfn + rg->nr_pages <= mem->guest_pfn)
1185 			continue;
1186 		spin_unlock(&partition->pt_mem_regions_lock);
1187 		return -EEXIST;
1188 	}
1189 	spin_unlock(&partition->pt_mem_regions_lock);
1190 
1191 	rg = mshv_region_create(mem->guest_pfn, nr_pages,
1192 				mem->userspace_addr, mem->flags);
1193 	if (IS_ERR(rg))
1194 		return PTR_ERR(rg);
1195 
1196 	if (is_mmio)
1197 		rg->mreg_type = MSHV_REGION_TYPE_MMIO;
1198 	else if (mshv_partition_encrypted(partition) ||
1199 		 !mshv_region_movable_init(rg))
1200 		rg->mreg_type = MSHV_REGION_TYPE_MEM_PINNED;
1201 	else
1202 		rg->mreg_type = MSHV_REGION_TYPE_MEM_MOVABLE;
1203 
1204 	rg->partition = partition;
1205 
1206 	*regionpp = rg;
1207 
1208 	return 0;
1209 }
1210 
1211 /**
1212  * mshv_prepare_pinned_region - Pin and map memory regions
1213  * @region: Pointer to the memory region structure
1214  *
1215  * This function processes memory regions that are explicitly marked as pinned.
1216  * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based
1217  * population. The function ensures the region is properly populated, handles
1218  * encryption requirements for SNP partitions if applicable, maps the region,
1219  * and performs necessary sharing or eviction operations based on the mapping
1220  * result.
1221  *
1222  * Return: 0 on success, negative error code on failure.
1223  */
1224 static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
1225 {
1226 	struct mshv_partition *partition = region->partition;
1227 	int ret;
1228 
1229 	ret = mshv_region_pin(region);
1230 	if (ret) {
1231 		pt_err(partition, "Failed to pin memory region: %d\n",
1232 		       ret);
1233 		goto err_out;
1234 	}
1235 
1236 	/*
1237 	 * For an SNP partition it is a requirement that for every memory region
1238 	 * that we are going to map for this partition we should make sure that
1239 	 * host access to that region is released. This is ensured by doing an
1240 	 * additional hypercall which will update the SLAT to release host
1241 	 * access to guest memory regions.
1242 	 */
1243 	if (mshv_partition_encrypted(partition)) {
1244 		ret = mshv_region_unshare(region);
1245 		if (ret) {
1246 			pt_err(partition,
1247 			       "Failed to unshare memory region (guest_pfn: %llu): %d\n",
1248 			       region->start_gfn, ret);
1249 			goto invalidate_region;
1250 		}
1251 	}
1252 
1253 	ret = mshv_region_map(region);
1254 	if (ret && mshv_partition_encrypted(partition)) {
1255 		int shrc;
1256 
1257 		shrc = mshv_region_share(region);
1258 		if (!shrc)
1259 			goto invalidate_region;
1260 
1261 		pt_err(partition,
1262 		       "Failed to share memory region (guest_pfn: %llu): %d\n",
1263 		       region->start_gfn, shrc);
1264 		/*
1265 		 * Don't unpin if marking shared failed because pages are no
1266 		 * longer mapped in the host, ie root, anymore.
1267 		 */
1268 		goto err_out;
1269 	}
1270 
1271 	return 0;
1272 
1273 invalidate_region:
1274 	mshv_region_invalidate(region);
1275 err_out:
1276 	return ret;
1277 }
1278 
1279 /*
1280  * This maps two things: guest RAM and for pci passthru mmio space.
1281  *
1282  * mmio:
1283  *  - vfio overloads vm_pgoff to store the mmio start pfn/spa.
1284  *  - Two things need to happen for mapping mmio range:
1285  *	1. mapped in the uaddr so VMM can access it.
1286  *	2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it.
1287  *
1288  *   This function takes care of the second. The first one is managed by vfio,
1289  *   and hence is taken care of via vfio_pci_mmap_fault().
1290  */
static long
mshv_map_user_memory(struct mshv_partition *partition,
		     struct mshv_user_mem_region mem)
{
	struct mshv_mem_region *region;
	struct vm_area_struct *vma;
	bool is_mmio;
	ulong mmio_pfn;
	long ret;

	/* This path only handles mapping; unmap requests are rejected here. */
	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
	    !access_ok((const void __user *)mem.userspace_addr, mem.size))
		return -EINVAL;

	/*
	 * Snapshot the vma properties under mmap_read_lock. VM_IO/VM_PFNMAP
	 * identifies a vfio-style mmio mapping; vfio stores the mmio start
	 * pfn in vm_pgoff (see comment above this function).
	 */
	mmap_read_lock(current->mm);
	vma = vma_lookup(current->mm, mem.userspace_addr);
	is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
	mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
	mmap_read_unlock(current->mm);

	if (!vma)
		return -EINVAL;

	ret = mshv_partition_create_region(partition, &mem, &region,
					   is_mmio);
	if (ret)
		return ret;

	switch (region->mreg_type) {
	case MSHV_REGION_TYPE_MEM_PINNED:
		ret = mshv_prepare_pinned_region(region);
		break;
	case MSHV_REGION_TYPE_MEM_MOVABLE:
		/*
		 * For movable memory regions, remap with no access to let
		 * the hypervisor track dirty pages, enabling pre-copy live
		 * migration.
		 */
		ret = hv_call_map_gpa_pages(partition->pt_id,
					    region->start_gfn,
					    region->nr_pages,
					    HV_MAP_GPA_NO_ACCESS, NULL);
		break;
	case MSHV_REGION_TYPE_MMIO:
		ret = hv_call_map_mmio_pages(partition->pt_id,
					     region->start_gfn,
					     mmio_pfn,
					     region->nr_pages);
		break;
	}

	if (ret)
		goto errout;

	/* Publish the region only after the hypervisor mapping succeeded. */
	spin_lock(&partition->pt_mem_regions_lock);
	hlist_add_head(&region->hnode, &partition->pt_mem_regions);
	spin_unlock(&partition->pt_mem_regions_lock);

	return 0;

errout:
	/*
	 * NOTE(review): frees the region object directly (presumably
	 * vmalloc-backed via mshv_region_create()) rather than going through
	 * mshv_region_put() - confirm this is sufficient cleanup for all
	 * three region types.
	 */
	vfree(region);
	return ret;
}
1355 
1356 /* Called for unmapping both the guest ram and the mmio space */
1357 static long
1358 mshv_unmap_user_memory(struct mshv_partition *partition,
1359 		       struct mshv_user_mem_region mem)
1360 {
1361 	struct mshv_mem_region *region;
1362 
1363 	if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
1364 		return -EINVAL;
1365 
1366 	spin_lock(&partition->pt_mem_regions_lock);
1367 
1368 	region = mshv_partition_region_by_gfn(partition, mem.guest_pfn);
1369 	if (!region) {
1370 		spin_unlock(&partition->pt_mem_regions_lock);
1371 		return -ENOENT;
1372 	}
1373 
1374 	/* Paranoia check */
1375 	if (region->start_uaddr != mem.userspace_addr ||
1376 	    region->start_gfn != mem.guest_pfn ||
1377 	    region->nr_pages != HVPFN_DOWN(mem.size)) {
1378 		spin_unlock(&partition->pt_mem_regions_lock);
1379 		return -EINVAL;
1380 	}
1381 
1382 	hlist_del(&region->hnode);
1383 
1384 	spin_unlock(&partition->pt_mem_regions_lock);
1385 
1386 	mshv_region_put(region);
1387 
1388 	return 0;
1389 }
1390 
1391 static long
1392 mshv_partition_ioctl_set_memory(struct mshv_partition *partition,
1393 				struct mshv_user_mem_region __user *user_mem)
1394 {
1395 	struct mshv_user_mem_region mem;
1396 
1397 	if (copy_from_user(&mem, user_mem, sizeof(mem)))
1398 		return -EFAULT;
1399 
1400 	if (!mem.size ||
1401 	    !PAGE_ALIGNED(mem.size) ||
1402 	    !PAGE_ALIGNED(mem.userspace_addr) ||
1403 	    (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) ||
1404 	    mshv_field_nonzero(mem, rsvd))
1405 		return -EINVAL;
1406 
1407 	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))
1408 		return mshv_unmap_user_memory(partition, mem);
1409 
1410 	return mshv_map_user_memory(partition, mem);
1411 }
1412 
1413 static long
1414 mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition,
1415 			       void __user *user_args)
1416 {
1417 	struct mshv_user_ioeventfd args;
1418 
1419 	if (copy_from_user(&args, user_args, sizeof(args)))
1420 		return -EFAULT;
1421 
1422 	return mshv_set_unset_ioeventfd(partition, &args);
1423 }
1424 
1425 static long
1426 mshv_partition_ioctl_irqfd(struct mshv_partition *partition,
1427 			   void __user *user_args)
1428 {
1429 	struct mshv_user_irqfd args;
1430 
1431 	if (copy_from_user(&args, user_args, sizeof(args)))
1432 		return -EFAULT;
1433 
1434 	return mshv_set_unset_irqfd(partition, &args);
1435 }
1436 
/*
 * Handle MSHV_GET_GPAP_ACCESS_BITMAP: query (and optionally clear/set)
 * per-page accessed/dirty state from the hypervisor and return it to
 * user space as a bitmap, one bit per guest page.
 */
static long
mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition,
					    void __user *user_args)
{
	struct mshv_gpap_access_bitmap args;
	union hv_gpa_page_access_state *states;
	long ret, i;
	union hv_gpa_page_access_state_flags hv_flags = {};
	u8 hv_type_mask;
	ulong bitmap_buf_sz, states_buf_sz;
	int written = 0;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	/* Validate enums/reserved fields; also guarantees the switch below
	 * always assigns hv_type_mask.
	 */
	if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT ||
	    args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT ||
	    mshv_field_nonzero(args, rsvd) || !args.page_count ||
	    !args.bitmap_ptr)
		return -EINVAL;

	if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz))
		return -E2BIG;

	/* Num bytes needed to store bitmap; one bit per page rounded up */
	bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8);

	/* Sanity check */
	if (bitmap_buf_sz > states_buf_sz)
		return -EBADFD;

	switch (args.access_type) {
	case MSHV_GPAP_ACCESS_TYPE_ACCESSED:
		hv_type_mask = 1;
		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
			hv_flags.clear_accessed = 1;
			/* not accessed implies not dirty */
			hv_flags.clear_dirty = 1;
		} else { /* MSHV_GPAP_ACCESS_OP_SET */
			hv_flags.set_accessed = 1;
		}
		break;
	case MSHV_GPAP_ACCESS_TYPE_DIRTY:
		hv_type_mask = 2;
		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
			hv_flags.clear_dirty = 1;
		} else { /* MSHV_GPAP_ACCESS_OP_SET */
			hv_flags.set_dirty = 1;
			/* dirty implies accessed */
			hv_flags.set_accessed = 1;
		}
		break;
	}

	states = vzalloc(states_buf_sz);
	if (!states)
		return -ENOMEM;

	ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count,
					    args.gpap_base, hv_flags, &written,
					    states);
	if (ret)
		goto free_return;

	/*
	 * Overwrite states buffer with bitmap - the bits in hv_type_mask
	 * correspond to bitfields in hv_gpa_page_access_state
	 */
	for (i = 0; i < written; ++i)
		__assign_bit(i, (ulong *)states,
			     states[i].as_uint8 & hv_type_mask);

	/* zero the unused bits in the last byte(s) of the returned bitmap */
	for (i = written; i < bitmap_buf_sz * 8; ++i)
		__clear_bit(i, (ulong *)states);

	if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz))
		ret = -EFAULT;

free_return:
	vfree(states);
	return ret;
}
1520 
1521 static long
1522 mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition,
1523 				     void __user *user_args)
1524 {
1525 	struct mshv_user_irq_entry *entries = NULL;
1526 	struct mshv_user_irq_table args;
1527 	long ret;
1528 
1529 	if (copy_from_user(&args, user_args, sizeof(args)))
1530 		return -EFAULT;
1531 
1532 	if (args.nr > MSHV_MAX_GUEST_IRQS ||
1533 	    mshv_field_nonzero(args, rsvd))
1534 		return -EINVAL;
1535 
1536 	if (args.nr) {
1537 		struct mshv_user_irq_table __user *urouting = user_args;
1538 
1539 		entries = vmemdup_user(urouting->entries,
1540 				       array_size(sizeof(*entries),
1541 						  args.nr));
1542 		if (IS_ERR(entries))
1543 			return PTR_ERR(entries);
1544 	}
1545 	ret = mshv_update_routing_table(partition, entries, args.nr);
1546 	kvfree(entries);
1547 
1548 	return ret;
1549 }
1550 
/*
 * Handle MSHV_INITIALIZE_PARTITION. Idempotent: calling it on an
 * already-initialized partition succeeds without side effects.
 */
static long
mshv_partition_ioctl_initialize(struct mshv_partition *partition)
{
	long ret;

	if (partition->pt_initialized)
		return 0;

	ret = hv_call_initialize_partition(partition->pt_id);
	if (ret)
		goto withdraw_mem;

	ret = mshv_debugfs_partition_create(partition);
	if (ret)
		goto finalize_partition;

	partition->pt_initialized = true;

	return 0;

	/* Unwind in reverse order; withdraw any memory deposited so far. */
finalize_partition:
	hv_call_finalize_partition(partition->pt_id);
withdraw_mem:
	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);

	return ret;
}
1578 
/*
 * Top-level ioctl dispatcher for a partition fd. All partition ioctls
 * are serialized by pt_mutex; handlers rely on this for exclusive
 * access to pt_vp_array, pt_vp_count, etc.
 */
static long
mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct mshv_partition *partition = filp->private_data;
	long ret;
	void __user *uarg = (void __user *)arg;

	/* Killable so a stuck partition cannot wedge the caller forever. */
	if (mutex_lock_killable(&partition->pt_mutex))
		return -EINTR;

	switch (ioctl) {
	case MSHV_INITIALIZE_PARTITION:
		ret = mshv_partition_ioctl_initialize(partition);
		break;
	case MSHV_SET_GUEST_MEMORY:
		ret = mshv_partition_ioctl_set_memory(partition, uarg);
		break;
	case MSHV_CREATE_VP:
		ret = mshv_partition_ioctl_create_vp(partition, uarg);
		break;
	case MSHV_IRQFD:
		ret = mshv_partition_ioctl_irqfd(partition, uarg);
		break;
	case MSHV_IOEVENTFD:
		ret = mshv_partition_ioctl_ioeventfd(partition, uarg);
		break;
	case MSHV_SET_MSI_ROUTING:
		ret = mshv_partition_ioctl_set_msi_routing(partition, uarg);
		break;
	case MSHV_GET_GPAP_ACCESS_BITMAP:
		ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition,
								  uarg);
		break;
	case MSHV_ROOT_HVCALL:
		ret = mshv_ioctl_passthru_hvcall(partition, true, uarg);
		break;
	default:
		ret = -ENOTTY;
	}

	mutex_unlock(&partition->pt_mutex);
	return ret;
}
1622 
1623 static int
1624 disable_vp_dispatch(struct mshv_vp *vp)
1625 {
1626 	int ret;
1627 	struct hv_register_assoc dispatch_suspend = {
1628 		.name = HV_REGISTER_DISPATCH_SUSPEND,
1629 		.value.dispatch_suspend.suspended = 1,
1630 	};
1631 
1632 	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
1633 				    1, &dispatch_suspend);
1634 	if (ret)
1635 		vp_err(vp, "failed to suspend\n");
1636 
1637 	return ret;
1638 }
1639 
1640 static int
1641 get_vp_signaled_count(struct mshv_vp *vp, u64 *count)
1642 {
1643 	int ret;
1644 	struct hv_register_assoc root_signal_count = {
1645 		.name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT,
1646 	};
1647 
1648 	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
1649 				    1, &root_signal_count);
1650 
1651 	if (ret) {
1652 		vp_err(vp, "Failed to get root signal count");
1653 		*count = 0;
1654 		return ret;
1655 	}
1656 
1657 	*count = root_signal_count.value.reg64;
1658 
1659 	return ret;
1660 }
1661 
/*
 * Wait until the local signal count catches up with the hypervisor's
 * count for this VP. Must only be called after disable_vp_dispatch(),
 * so the hypervisor-side count can no longer grow.
 */
static void
drain_vp_signals(struct mshv_vp *vp)
{
	u64 hv_signal_count;
	u64 vp_signal_count;

	get_vp_signaled_count(vp, &hv_signal_count);

	vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);

	/*
	 * There should be at most 1 outstanding notification, but be extra
	 * careful anyway.
	 */
	while (hv_signal_count != vp_signal_count) {
		WARN_ON(hv_signal_count - vp_signal_count != 1);

		/* Bail out if the task is interrupted while waiting. */
		if (wait_event_interruptible(vp->run.vp_suspend_queue,
					     vp->run.kicked_by_hv == 1))
			break;
		vp->run.kicked_by_hv = 0;
		vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
	}
}
1686 
1687 static void drain_all_vps(const struct mshv_partition *partition)
1688 {
1689 	int i;
1690 	struct mshv_vp *vp;
1691 
1692 	/*
1693 	 * VPs are reachable from ISR. It is safe to not take the partition
1694 	 * lock because nobody else can enter this function and drop the
1695 	 * partition from the list.
1696 	 */
1697 	for (i = 0; i < MSHV_MAX_VPS; i++) {
1698 		vp = partition->pt_vp_array[i];
1699 		if (!vp)
1700 			continue;
1701 		/*
1702 		 * Disable dispatching of the VP in the hypervisor. After this
1703 		 * the hypervisor guarantees it won't generate any signals for
1704 		 * the VP and the hypervisor's VP signal count won't change.
1705 		 */
1706 		disable_vp_dispatch(vp);
1707 		drain_vp_signals(vp);
1708 	}
1709 }
1710 
/*
 * Unlink @partition from the global hash table and wait for all RCU
 * readers (mshv_partition_find() callers) to finish before returning,
 * so the caller may safely free the partition afterwards.
 */
static void
remove_partition(struct mshv_partition *partition)
{
	spin_lock(&mshv_root.pt_ht_lock);
	hlist_del_rcu(&partition->pt_hnode);
	spin_unlock(&mshv_root.pt_ht_lock);

	synchronize_rcu();
}
1720 
1721 /*
1722  * Tear down a partition and remove it from the list.
1723  * Partition's refcount must be 0
1724  */
static void destroy_partition(struct mshv_partition *partition)
{
	struct mshv_vp *vp;
	struct mshv_mem_region *region;
	struct hlist_node *n;
	int i;

	/* Defensive: refcount must already be zero (see mshv_partition_put). */
	if (refcount_read(&partition->pt_ref_count)) {
		pt_err(partition,
		       "Attempt to destroy partition but refcount > 0\n");
		return;
	}

	if (partition->pt_initialized) {
		/*
		 * We only need to drain signals for root scheduler. This should be
		 * done before removing the partition from the partition list.
		 */
		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
			drain_all_vps(partition);

		/* Remove vps */
		for (i = 0; i < MSHV_MAX_VPS; ++i) {
			vp = partition->pt_vp_array[i];
			if (!vp)
				continue;

			mshv_debugfs_vp_remove(vp);
			mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
					    vp->vp_stats_pages);

			/* Register page exists only for non-encrypted VPs. */
			if (vp->vp_register_page) {
				(void)hv_unmap_vp_state_page(partition->pt_id,
							     vp->vp_index,
							     HV_VP_STATE_PAGE_REGISTERS,
							     virt_to_page(vp->vp_register_page),
							     input_vtl_zero);
				vp->vp_register_page = NULL;
			}

			(void)hv_unmap_vp_state_page(partition->pt_id,
						     vp->vp_index,
						     HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
						     virt_to_page(vp->vp_intercept_msg_page),
						     input_vtl_zero);
			vp->vp_intercept_msg_page = NULL;

			/* GHCB page exists only for encrypted (SNP) VPs. */
			if (vp->vp_ghcb_page) {
				(void)hv_unmap_vp_state_page(partition->pt_id,
							     vp->vp_index,
							     HV_VP_STATE_PAGE_GHCB,
							     virt_to_page(vp->vp_ghcb_page),
							     input_vtl_normal);
				vp->vp_ghcb_page = NULL;
			}

			kfree(vp);

			partition->pt_vp_array[i] = NULL;
		}

		mshv_debugfs_partition_remove(partition);

		/* Deallocates and unmaps everything including vcpus, GPA mappings etc */
		hv_call_finalize_partition(partition->pt_id);

		partition->pt_initialized = false;
	}

	/* Unlink from the global table; waits for RCU readers to drain. */
	remove_partition(partition);

	/* Drop all remaining memory regions (guest RAM and mmio). */
	hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
				  hnode) {
		hlist_del(&region->hnode);
		mshv_region_put(region);
	}

	/* Withdraw and free all pages we deposited */
	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
	hv_call_delete_partition(partition->pt_id);

	mshv_free_routing_table(partition);
	kfree(partition);
}
1809 
1810 struct
1811 mshv_partition *mshv_partition_get(struct mshv_partition *partition)
1812 {
1813 	if (refcount_inc_not_zero(&partition->pt_ref_count))
1814 		return partition;
1815 	return NULL;
1816 }
1817 
/*
 * Look up a partition by hypervisor partition id. Caller must be in an
 * RCU read-side critical section (enforced by __must_hold(RCU)); the
 * returned pointer is only valid for the duration of that section
 * unless a reference is taken via mshv_partition_get().
 */
struct
mshv_partition *mshv_partition_find(u64 partition_id)
	__must_hold(RCU)
{
	struct mshv_partition *p;

	hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode,
				   partition_id)
		if (p->pt_id == partition_id)
			return p;

	return NULL;
}
1831 
/*
 * Drop a reference on @partition; the last put tears the partition
 * down completely (see destroy_partition()).
 */
void
mshv_partition_put(struct mshv_partition *partition)
{
	if (refcount_dec_and_test(&partition->pt_ref_count))
		destroy_partition(partition);
}
1838 
/*
 * release() for the partition fd: tear down eventfd state and the irq
 * SRCU domain before dropping the fd's partition reference (which may
 * trigger destroy_partition() if it was the last one).
 */
static int
mshv_partition_release(struct inode *inode, struct file *filp)
{
	struct mshv_partition *partition = filp->private_data;

	mshv_eventfd_release(partition);

	cleanup_srcu_struct(&partition->pt_irq_srcu);

	mshv_partition_put(partition);

	return 0;
}
1852 
/*
 * Publish @partition in the global RCU hash table, keyed by pt_id.
 * Always succeeds; returns 0 for symmetry with the caller's
 * error-handling style.
 */
static int
add_partition(struct mshv_partition *partition)
{
	spin_lock(&mshv_root.pt_ht_lock);

	hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode,
		     partition->pt_id);

	spin_unlock(&mshv_root.pt_ht_lock);

	return 0;
}
1865 
/* The UAPI bank count must stay in lockstep with the hypervisor ABI. */
static_assert(MSHV_NUM_CPU_FEATURES_BANKS ==
	      HV_PARTITION_PROCESSOR_FEATURES_BANKS);
1868 
/*
 * Parse the MSHV_CREATE_PARTITION user arguments (v1 or v2 layout) and
 * translate them into hypervisor creation flags and properties.
 * Returns 0 on success, -EFAULT/-EINVAL on bad input.
 */
static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags,
					struct hv_partition_creation_properties *cr_props,
					union hv_partition_isolation_properties *isol_props)
{
	int i;
	struct mshv_create_partition_v2 args;
	union hv_partition_processor_features *disabled_procs;
	union hv_partition_processor_xsave_features *disabled_xsave;

	/* First, copy v1 struct in case user is on previous versions */
	if (copy_from_user(&args, user_arg,
			   sizeof(struct mshv_create_partition)))
		return -EFAULT;

	if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
	    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
		return -EINVAL;

	disabled_procs = &cr_props->disabled_processor_features;
	disabled_xsave = &cr_props->disabled_processor_xsave_features;

	/* Check if user provided newer struct with feature fields */
	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) {
		if (copy_from_user(&args, user_arg, sizeof(args)))
			return -EFAULT;

		/* Re-validate v1 fields after second copy_from_user() */
		if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
		    args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
			return -EINVAL;

		if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS ||
		    mshv_field_nonzero(args, pt_rsvd) ||
		    mshv_field_nonzero(args, pt_rsvd1))
			return -EINVAL;

		/*
		 * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never
		 * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS
		 * (i.e. 2).
		 *
		 * Further banks (index >= 2) will be modifiable as 'early'
		 * properties via the set partition property hypercall.
		 */
		for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
			disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i];

#if IS_ENABLED(CONFIG_X86_64)
		disabled_xsave->as_uint64 = args.pt_disabled_xsave;
#else
		/*
		 * In practice this field is ignored on arm64, but safer to
		 * zero it in case it is ever used.
		 */
		disabled_xsave->as_uint64 = 0;

		if (mshv_field_nonzero(args, pt_rsvd2))
			return -EINVAL;
#endif
	} else {
		/*
		 * v1 behavior: try to enable everything. The hypervisor will
		 * disable features that are not supported. The banks can be
		 * queried via the get partition property hypercall.
		 */
		for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
			disabled_procs->as_uint64[i] = 0;

		disabled_xsave->as_uint64 = 0;
	}

	/* Only support EXO partitions */
	*pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
		    HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;

	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC))
		*pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC))
		*pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES))
		*pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;

	isol_props->as_uint64 = 0;

	/* access_type validated above, so the switch covers all inputs */
	switch (args.pt_isolation) {
	case MSHV_PT_ISOLATION_NONE:
		isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE;
		break;
	}

	return 0;
}
1961 
/*
 * Handle MSHV_CREATE_PARTITION on /dev/mshv: create a hypervisor
 * partition and return a new partition fd. On any failure every
 * partially-acquired resource is unwound in reverse order.
 */
static long
mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
{
	u64 creation_flags;
	struct hv_partition_creation_properties creation_properties;
	union hv_partition_isolation_properties isolation_properties;
	struct mshv_partition *partition;
	long ret;

	ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags,
					  &creation_properties,
					  &isolation_properties);
	if (ret)
		return ret;

	partition = kzalloc(sizeof(*partition), GFP_KERNEL);
	if (!partition)
		return -ENOMEM;

	partition->pt_module_dev = module_dev;
	partition->isolation_type = isolation_properties.isolation_type;

	/* Initial reference is owned by the fd being created below. */
	refcount_set(&partition->pt_ref_count, 1);

	mutex_init(&partition->pt_mutex);

	mutex_init(&partition->pt_irq_lock);

	init_completion(&partition->async_hypercall);

	INIT_HLIST_HEAD(&partition->irq_ack_notifier_list);

	INIT_HLIST_HEAD(&partition->pt_devices);

	spin_lock_init(&partition->pt_mem_regions_lock);
	INIT_HLIST_HEAD(&partition->pt_mem_regions);

	mshv_eventfd_init(partition);

	ret = init_srcu_struct(&partition->pt_irq_srcu);
	if (ret)
		goto free_partition;

	ret = hv_call_create_partition(creation_flags,
				       creation_properties,
				       isolation_properties,
				       &partition->pt_id);
	if (ret)
		goto cleanup_irq_srcu;

	ret = add_partition(partition);
	if (ret)
		goto delete_partition;

	/* Installing the fd is last: it publishes the partition to userspace. */
	ret = mshv_init_async_handler(partition);
	if (!ret) {
		ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition",
							   &mshv_partition_fops,
							   partition, O_RDWR));
		if (ret >= 0)
			return ret;
	}
	remove_partition(partition);
delete_partition:
	hv_call_delete_partition(partition->pt_id);
cleanup_irq_srcu:
	cleanup_srcu_struct(&partition->pt_irq_srcu);
free_partition:
	kfree(partition);

	return ret;
}
2034 
2035 static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
2036 			   unsigned long arg)
2037 {
2038 	struct miscdevice *misc = filp->private_data;
2039 
2040 	switch (ioctl) {
2041 	case MSHV_CREATE_PARTITION:
2042 		return mshv_ioctl_create_partition((void __user *)arg,
2043 						misc->this_device);
2044 	case MSHV_ROOT_HVCALL:
2045 		return mshv_ioctl_passthru_hvcall(NULL, false,
2046 					(void __user *)arg);
2047 	}
2048 
2049 	return -ENOTTY;
2050 }
2051 
/* No per-open state on /dev/mshv; everything lives on partition/vp fds. */
static int
mshv_dev_open(struct inode *inode, struct file *filp)
{
	return 0;
}
2057 
/* Nothing to release; see mshv_dev_open(). */
static int
mshv_dev_release(struct inode *inode, struct file *filp)
{
	return 0;
}
2063 
/*
 * NOTE(review): presumably dynamic CPU-hotplug state handles assigned
 * during module init (not visible in this chunk) - confirm against the
 * cpuhp_setup_state*() calls elsewhere in this file.
 */
static int mshv_cpuhp_online;
static int mshv_root_sched_online;
2066 
2067 static const char *scheduler_type_to_string(enum hv_scheduler_type type)
2068 {
2069 	switch (type) {
2070 	case HV_SCHEDULER_TYPE_LP:
2071 		return "classic scheduler without SMT";
2072 	case HV_SCHEDULER_TYPE_LP_SMT:
2073 		return "classic scheduler with SMT";
2074 	case HV_SCHEDULER_TYPE_CORE_SMT:
2075 		return "core scheduler";
2076 	case HV_SCHEDULER_TYPE_ROOT:
2077 		return "root scheduler";
2078 	default:
2079 		return "unknown scheduler";
2080 	};
2081 }
2082 
2083 static int __init l1vh_retrieve_scheduler_type(enum hv_scheduler_type *out)
2084 {
2085 	u64 integrated_sched_enabled;
2086 	int ret;
2087 
2088 	*out = HV_SCHEDULER_TYPE_CORE_SMT;
2089 
2090 	if (!mshv_root.vmm_caps.vmm_enable_integrated_scheduler)
2091 		return 0;
2092 
2093 	ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
2094 						HV_PARTITION_PROPERTY_INTEGRATED_SCHEDULER_ENABLED,
2095 						0, &integrated_sched_enabled,
2096 						sizeof(integrated_sched_enabled));
2097 	if (ret)
2098 		return ret;
2099 
2100 	if (integrated_sched_enabled)
2101 		*out = HV_SCHEDULER_TYPE_ROOT;
2102 
2103 	return 0;
2104 }
2105 
/* TODO move this to hv_common.c when needed outside */
/*
 * Ask the hypervisor for its scheduler type via
 * HVCALL_GET_SYSTEM_PROPERTY. Returns 0 on success or a negative errno
 * derived from the hypercall status.
 *
 * Interrupts are disabled for the whole sequence because the per-cpu
 * hypercall input/output pages are shared with any other hypercall made
 * on this CPU; the result must be copied out of the output page before
 * local_irq_restore().
 */
static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out)
{
	struct hv_input_get_system_property *input;
	struct hv_output_get_system_property *output;
	unsigned long flags;
	u64 status;

	local_irq_save(flags);
	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	output = *this_cpu_ptr(hyperv_pcpu_output_arg);

	memset(input, 0, sizeof(*input));
	memset(output, 0, sizeof(*output));
	input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE;

	status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
	if (!hv_result_success(status)) {
		/* Re-enable interrupts before the (potentially slow) pr_err */
		local_irq_restore(flags);
		pr_err("%s: %s\n", __func__, hv_result_to_string(status));
		return hv_result_to_errno(status);
	}

	/* Copy the result out while the output page is still exclusively ours */
	*out = output->scheduler_type;
	local_irq_restore(flags);

	return 0;
}
2134 
2135 /* Retrieve and stash the supported scheduler type */
2136 static int __init mshv_retrieve_scheduler_type(struct device *dev)
2137 {
2138 	int ret;
2139 
2140 	if (hv_l1vh_partition())
2141 		ret = l1vh_retrieve_scheduler_type(&hv_scheduler_type);
2142 	else
2143 		ret = hv_retrieve_scheduler_type(&hv_scheduler_type);
2144 	if (ret)
2145 		return ret;
2146 
2147 	dev_info(dev, "Hypervisor using %s\n",
2148 		 scheduler_type_to_string(hv_scheduler_type));
2149 
2150 	switch (hv_scheduler_type) {
2151 	case HV_SCHEDULER_TYPE_CORE_SMT:
2152 	case HV_SCHEDULER_TYPE_LP_SMT:
2153 	case HV_SCHEDULER_TYPE_ROOT:
2154 	case HV_SCHEDULER_TYPE_LP:
2155 		/* Supported scheduler, nothing to do */
2156 		break;
2157 	default:
2158 		dev_err(dev, "unsupported scheduler 0x%x, bailing.\n",
2159 			hv_scheduler_type);
2160 		return -EOPNOTSUPP;
2161 	}
2162 
2163 	return 0;
2164 }
2165 
2166 static int mshv_root_scheduler_init(unsigned int cpu)
2167 {
2168 	void **inputarg, **outputarg, *p;
2169 
2170 	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
2171 	outputarg = (void **)this_cpu_ptr(root_scheduler_output);
2172 
2173 	/* Allocate two consecutive pages. One for input, one for output. */
2174 	p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL);
2175 	if (!p)
2176 		return -ENOMEM;
2177 
2178 	*inputarg = p;
2179 	*outputarg = (char *)p + HV_HYP_PAGE_SIZE;
2180 
2181 	return 0;
2182 }
2183 
2184 static int mshv_root_scheduler_cleanup(unsigned int cpu)
2185 {
2186 	void *p, **inputarg, **outputarg;
2187 
2188 	inputarg = (void **)this_cpu_ptr(root_scheduler_input);
2189 	outputarg = (void **)this_cpu_ptr(root_scheduler_output);
2190 
2191 	p = *inputarg;
2192 
2193 	*inputarg = NULL;
2194 	*outputarg = NULL;
2195 
2196 	kfree(p);
2197 
2198 	return 0;
2199 }
2200 
2201 /* Must be called after retrieving the scheduler type */
2202 static int
2203 root_scheduler_init(struct device *dev)
2204 {
2205 	int ret;
2206 
2207 	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
2208 		return 0;
2209 
2210 	root_scheduler_input = alloc_percpu(void *);
2211 	root_scheduler_output = alloc_percpu(void *);
2212 
2213 	if (!root_scheduler_input || !root_scheduler_output) {
2214 		dev_err(dev, "Failed to allocate root scheduler buffers\n");
2215 		ret = -ENOMEM;
2216 		goto out;
2217 	}
2218 
2219 	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched",
2220 				mshv_root_scheduler_init,
2221 				mshv_root_scheduler_cleanup);
2222 
2223 	if (ret < 0) {
2224 		dev_err(dev, "Failed to setup root scheduler state: %i\n", ret);
2225 		goto out;
2226 	}
2227 
2228 	mshv_root_sched_online = ret;
2229 
2230 	return 0;
2231 
2232 out:
2233 	free_percpu(root_scheduler_input);
2234 	free_percpu(root_scheduler_output);
2235 	return ret;
2236 }
2237 
2238 static void
2239 root_scheduler_deinit(void)
2240 {
2241 	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
2242 		return;
2243 
2244 	cpuhp_remove_state(mshv_root_sched_online);
2245 	free_percpu(root_scheduler_input);
2246 	free_percpu(root_scheduler_output);
2247 }
2248 
/*
 * Reboot notifier: tear down the synic cpuhp state (running the per-cpu
 * synic cleanup on each online CPU) before the machine reboots.
 */
static int mshv_reboot_notify(struct notifier_block *nb,
			      unsigned long code, void *unused)
{
	cpuhp_remove_state(mshv_cpuhp_online);
	return 0;
}
2255 
/*
 * NOTE(review): only referenced in this file (register/unregister below) —
 * could likely be static; confirm no extern declaration exists elsewhere.
 */
struct notifier_block mshv_reboot_nb = {
	.notifier_call = mshv_reboot_notify,
};
2259 
/* Undo mshv_root_partition_init(): drop the reboot notifier. */
static void mshv_root_partition_exit(void)
{
	unregister_reboot_notifier(&mshv_reboot_nb);
}
2264 
/*
 * Root-partition-only init: register the reboot notifier so the synic
 * cpuhp state is removed before reboot. (dev is currently unused.)
 */
static int __init mshv_root_partition_init(struct device *dev)
{
	return register_reboot_notifier(&mshv_reboot_nb);
}
2269 
/*
 * Cache the partition's VMM capabilities in mshv_root.vmm_caps.
 *
 * A query failure is fatal only on L1VH partitions; otherwise the error
 * is ignored and the zero-initialized global default stands — presumably
 * because non-L1VH hypervisors may not implement this property.
 * NOTE(review): confirm that intent.
 */
static int __init mshv_init_vmm_caps(struct device *dev)
{
	int ret;

	ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
						HV_PARTITION_PROPERTY_VMM_CAPABILITIES,
						0, &mshv_root.vmm_caps,
						sizeof(mshv_root.vmm_caps));
	if (ret && hv_l1vh_partition()) {
		dev_err(dev, "Failed to get VMM capabilities: %d\n", ret);
		return ret;
	}

	dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]);

	return 0;
}
2287 
/*
 * Module init: register /dev/mshv and bring up all supporting state.
 *
 * Order matters: misc device first (so dev_* logging works), then per-cpu
 * synic pages and their cpuhp state, VMM caps, scheduler type, optional
 * root-partition and root-scheduler setup, debugfs, and the irqfd
 * workqueue. The mshv interrupt handler is installed only once everything
 * else is ready. On failure, the goto ladder unwinds in exact reverse
 * order of setup.
 */
static int __init mshv_parent_partition_init(void)
{
	int ret;
	struct device *dev;
	union hv_hypervisor_version_info version_info;

	/* Only meaningful on a parent partition; skip in kdump kernels */
	if (!hv_parent_partition() || is_kdump_kernel())
		return -ENODEV;

	if (hv_get_hypervisor_version(&version_info))
		return -ENODEV;

	ret = misc_register(&mshv_dev);
	if (ret)
		return ret;

	dev = mshv_dev.this_device;

	/* Out-of-range Hyper-V versions are warned about but not rejected */
	if (version_info.build_number < MSHV_HV_MIN_VERSION ||
	    version_info.build_number > MSHV_HV_MAX_VERSION) {
		dev_err(dev, "Running on unvalidated Hyper-V version\n");
		dev_err(dev, "Versions: current: %u  min: %u  max: %u\n",
			version_info.build_number, MSHV_HV_MIN_VERSION,
			MSHV_HV_MAX_VERSION);
	}

	mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages);
	if (!mshv_root.synic_pages) {
		dev_err(dev, "Failed to allocate percpu synic page\n");
		ret = -ENOMEM;
		goto device_deregister;
	}

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
				mshv_synic_init,
				mshv_synic_cleanup);
	if (ret < 0) {
		dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret);
		goto free_synic_pages;
	}

	/* Stash the dynamic cpuhp state id for teardown */
	mshv_cpuhp_online = ret;

	ret = mshv_init_vmm_caps(dev);
	if (ret)
		goto remove_cpu_state;

	ret = mshv_retrieve_scheduler_type(dev);
	if (ret)
		goto remove_cpu_state;

	/* ret is 0 here, so the check below is a no-op on non-root partitions */
	if (hv_root_partition())
		ret = mshv_root_partition_init(dev);
	if (ret)
		goto remove_cpu_state;

	ret = root_scheduler_init(dev);
	if (ret)
		goto exit_partition;

	ret = mshv_debugfs_init();
	if (ret)
		goto deinit_root_scheduler;

	ret = mshv_irqfd_wq_init();
	if (ret)
		goto exit_debugfs;

	spin_lock_init(&mshv_root.pt_ht_lock);
	hash_init(mshv_root.pt_htable);

	/* Last step: start receiving mshv interrupts */
	hv_setup_mshv_handler(mshv_isr);

	return 0;

exit_debugfs:
	mshv_debugfs_exit();
deinit_root_scheduler:
	root_scheduler_deinit();
exit_partition:
	if (hv_root_partition())
		mshv_root_partition_exit();
remove_cpu_state:
	cpuhp_remove_state(mshv_cpuhp_online);
free_synic_pages:
	free_percpu(mshv_root.synic_pages);
device_deregister:
	misc_deregister(&mshv_dev);
	return ret;
}
2378 
/*
 * Module exit: tear everything down. The interrupt handler is removed
 * first so no new mshv interrupts arrive during teardown; the remaining
 * steps roughly mirror mshv_parent_partition_init() in reverse.
 */
static void __exit mshv_parent_partition_exit(void)
{
	hv_setup_mshv_handler(NULL);
	mshv_port_table_fini();
	mshv_debugfs_exit();
	misc_deregister(&mshv_dev);
	mshv_irqfd_wq_cleanup();
	root_scheduler_deinit();
	if (hv_root_partition())
		mshv_root_partition_exit();
	cpuhp_remove_state(mshv_cpuhp_online);
	free_percpu(mshv_root.synic_pages);
}
2392 
2393 module_init(mshv_parent_partition_init);
2394 module_exit(mshv_parent_partition_exit);
2395