xref: /linux/drivers/hv/mshv_eventfd.c (revision feb06d2690bb826fd33798a99ce5cff8d07b38f9)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * eventfd support for mshv
 *
 * Heavily inspired by the KVM implementation of irqfd/ioeventfd. The basic
 * framework code is taken from the KVM implementation.
 *
 * All credits to the KVM developers.
 */

#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/workqueue.h>
#include <linux/eventfd.h>

#if IS_ENABLED(CONFIG_X86_64)
#include <asm/apic.h>
#endif
#include <asm/mshyperv.h>

#include "mshv_eventfd.h"
#include "mshv.h"
#include "mshv_root.h"

static struct workqueue_struct *irqfd_cleanup_wq;

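/*
 * IRQ ack notifiers: additions and removals are serialized by pt_irq_lock,
 * while readers walk the list under RCU (see mshv_notify_acked_gsi()).
 */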
void mshv_register_irq_ack_notifier(struct mshv_partition *partition,
				    struct mshv_irq_ack_notifier *mian)
{
	mutex_lock(&partition->pt_irq_lock);
	hlist_add_head_rcu(&mian->link, &partition->irq_ack_notifier_list);
	mutex_unlock(&partition->pt_irq_lock);
}

void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition,
				      struct mshv_irq_ack_notifier *mian)
{
	mutex_lock(&partition->pt_irq_lock);
	hlist_del_init_rcu(&mian->link);
	mutex_unlock(&partition->pt_irq_lock);
	synchronize_rcu();
}

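/*
 * Invoke the ack callback of every notifier registered for @gsi.
 * Returns true if at least one notifier matched.
 */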
bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi)
{
	struct mshv_irq_ack_notifier *mian;
	bool acked = false;

	rcu_read_lock();
	hlist_for_each_entry_rcu(mian, &partition->irq_ack_notifier_list,
				 link) {
		if (mian->irq_ack_gsi == gsi) {
			mian->irq_acked(mian);
			acked = true;
		}
	}
	rcu_read_unlock();

	return acked;
}

#if IS_ENABLED(CONFIG_ARM64)
static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type)
{
	return false;
}
#elif IS_ENABLED(CONFIG_X86_64)
static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type)
{
	return type == HV_X64_INTERRUPT_TYPE_EXTINT;
}
#endif

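/*
 * Ack callback for resampled (level-triggered) irqfds: when the guest
 * acknowledges the GSI (mshv_notify_acked_gsi()), clear the interrupt in
 * the hypervisor where required (currently only x86 EXTINT), then signal
 * each registered resamplefd so userspace can re-evaluate the interrupt
 * source and re-assert it if it is still pending.
 */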
static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian)
{
	struct mshv_irqfd_resampler *resampler;
	struct mshv_partition *partition;
	struct mshv_irqfd *irqfd;
	int idx;

	resampler = container_of(mian, struct mshv_irqfd_resampler,
				 rsmplr_notifier);
	partition = resampler->rsmplr_partn;

	idx = srcu_read_lock(&partition->pt_irq_srcu);

	hlist_for_each_entry_rcu(irqfd, &resampler->rsmplr_irqfd_list,
				 irqfd_resampler_hnode) {
		if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type))
			hv_call_clear_virtual_interrupt(partition->pt_id);

		eventfd_signal(irqfd->irqfd_resamplefd);
	}

	srcu_read_unlock(&partition->pt_irq_srcu, idx);
}

#if IS_ENABLED(CONFIG_X86_64)
static bool
mshv_vp_irq_vector_injected(union hv_vp_register_page_interrupt_vectors iv,
			    u32 vector)
{
	int i;

	for (i = 0; i < iv.vector_count; i++) {
		if (iv.vector[i] == vector)
			return true;
	}

	return false;
}

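/*
 * Lock-free publication of @vector into the VP's shared register page.
 * The whole interrupt_vectors union is updated with a single cmpxchg():
 * -EAGAIN means a concurrent update won and the caller should retry,
 * -ENOSPC means the vector array is full and the slow path must be used.
 */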
static int mshv_vp_irq_try_set_vector(struct mshv_vp *vp, u32 vector)
{
	union hv_vp_register_page_interrupt_vectors iv, new_iv;

	iv = vp->vp_register_page->interrupt_vectors;
	new_iv = iv;

	if (mshv_vp_irq_vector_injected(iv, vector))
		return 0;

	if (iv.vector_count >= HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT)
		return -ENOSPC;

	new_iv.vector[new_iv.vector_count++] = vector;

	if (cmpxchg(&vp->vp_register_page->interrupt_vectors.as_uint64,
		    iv.as_uint64, new_iv.as_uint64) != iv.as_uint64)
		return -EAGAIN;

	return 0;
}

static int mshv_vp_irq_set_vector(struct mshv_vp *vp, u32 vector)
{
	int ret;

	do {
		ret = mshv_vp_irq_try_set_vector(vp, vector);
	} while (ret == -EAGAIN && !need_resched());

	return ret;
}

/*
 * Try to raise an irq for the guest via the shared vector array. The
 * hypervisor does the actual injection of the interrupt.
 */
static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
{
	struct mshv_partition *partition = irqfd->irqfd_partn;
	struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
	struct mshv_vp *vp;

	if (!(ms_hyperv.ext_features &
	      HV_VP_DISPATCH_INTERRUPT_INJECTION_AVAILABLE))
		return -EOPNOTSUPP;

	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
		return -EOPNOTSUPP;

#if IS_ENABLED(CONFIG_X86)
	if (irq->lapic_control.logical_dest_mode)
		return -EOPNOTSUPP;
#endif

	vp = partition->pt_vp_array[irq->lapic_apic_id];

	if (!vp->vp_register_page)
		return -EOPNOTSUPP;

	if (mshv_vp_irq_set_vector(vp, irq->lapic_vector))
		return -EINVAL;

	if (vp->run.flags.root_sched_dispatched &&
	    vp->vp_register_page->interrupt_vectors.as_uint64)
		return -EBUSY;

	wake_up(&vp->run.vp_suspend_queue);

	return 0;
}
#else /* CONFIG_X86_64 */
static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
{
	return -EOPNOTSUPP;
}
#endif

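/*
 * Slow path: assert the interrupt with a hypercall. For irqfds that are
 * backed by a routing entry, bail out if the cached entry is no longer
 * valid, and wait out any in-flight update of it (irqfd_irqe_sc seqcount)
 * so a consistent vector/APIC ID/control triple is handed to the hypervisor.
 */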
static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd)
{
	struct mshv_partition *partition = irqfd->irqfd_partn;
	struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
	unsigned int seq;
	int idx;

#if IS_ENABLED(CONFIG_X86)
	WARN_ON(irqfd->irqfd_resampler &&
		!irq->lapic_control.level_triggered);
#endif

	idx = srcu_read_lock(&partition->pt_irq_srcu);
	if (irqfd->irqfd_girq_ent.guest_irq_num) {
		if (!irqfd->irqfd_girq_ent.girq_entry_valid) {
			srcu_read_unlock(&partition->pt_irq_srcu, idx);
			return;
		}

		do {
			seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
		} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
	}

	hv_call_assert_virtual_interrupt(irqfd->irqfd_partn->pt_id,
					 irq->lapic_vector, irq->lapic_apic_id,
					 irq->lapic_control);
	srcu_read_unlock(&partition->pt_irq_srcu, idx);
}

static void mshv_irqfd_resampler_shutdown(struct mshv_irqfd *irqfd)
{
	struct mshv_irqfd_resampler *rp = irqfd->irqfd_resampler;
	struct mshv_partition *pt = rp->rsmplr_partn;

	mutex_lock(&pt->irqfds_resampler_lock);

	hlist_del_rcu(&irqfd->irqfd_resampler_hnode);
	synchronize_srcu(&pt->pt_irq_srcu);

	if (hlist_empty(&rp->rsmplr_irqfd_list)) {
		hlist_del(&rp->rsmplr_hnode);
		mshv_unregister_irq_ack_notifier(pt, &rp->rsmplr_notifier);
		kfree(rp);
	}

	mutex_unlock(&pt->irqfds_resampler_lock);
}

/*
 * Race-free decouple logic - the ordering here is critical: the irqfd is
 * unhooked from the wait-queue (so no further events can arrive) before
 * its resampler linkage and eventfd references are released.
 */
static void mshv_irqfd_shutdown(struct work_struct *work)
{
	struct mshv_irqfd *irqfd =
			container_of(work, struct mshv_irqfd, irqfd_shutdown);

	/*
	 * Synchronize with the wait-queue and unhook ourselves to prevent
	 * further events.
	 */
	remove_wait_queue(irqfd->irqfd_wqh, &irqfd->irqfd_wait);

	if (irqfd->irqfd_resampler) {
		mshv_irqfd_resampler_shutdown(irqfd);
		eventfd_ctx_put(irqfd->irqfd_resamplefd);
	}

	/*
	 * It is now safe to release the object's resources
	 */
	eventfd_ctx_put(irqfd->irqfd_eventfd_ctx);
	kfree(irqfd);
}

/* assumes partition->pt_irqfds_lock is held */
static bool mshv_irqfd_is_active(struct mshv_irqfd *irqfd)
{
	return !hlist_unhashed(&irqfd->irqfd_hnode);
}

/*
 * Mark the irqfd as inactive and schedule it for removal
 *
 * assumes partition->pt_irqfds_lock is held
 */
static void mshv_irqfd_deactivate(struct mshv_irqfd *irqfd)
{
	if (!mshv_irqfd_is_active(irqfd))
		return;

	hlist_del(&irqfd->irqfd_hnode);

	queue_work(irqfd_cleanup_wq, &irqfd->irqfd_shutdown);
}

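/*
 * Eventfd wakeup callback: POLLIN means the eventfd was signalled, so the
 * count is consumed and the interrupt injected (fast path first, hypercall
 * slow path as fallback). POLLHUP means the eventfd is being closed, so the
 * irqfd is detached from the partition unless shutdown has already started.
 */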
/*
 * Called with wqh->lock held and interrupts disabled
 */
static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
			     int sync, void *key)
{
	struct mshv_irqfd *irqfd = container_of(wait, struct mshv_irqfd,
						irqfd_wait);
	unsigned long flags = (unsigned long)key;
	int idx;
	unsigned int seq;
	struct mshv_partition *pt = irqfd->irqfd_partn;
	int ret = 0;

	if (flags & POLLIN) {
		u64 cnt;

		eventfd_ctx_do_read(irqfd->irqfd_eventfd_ctx, &cnt);
		idx = srcu_read_lock(&pt->pt_irq_srcu);
		do {
			seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
		} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));

		/* An event has been signaled, raise an interrupt */
		ret = mshv_try_assert_irq_fast(irqfd);
		if (ret)
			mshv_assert_irq_slow(irqfd);

		srcu_read_unlock(&pt->pt_irq_srcu, idx);

		ret = 1;
	}

	if (flags & POLLHUP) {
		/* The eventfd is closing, detach from the partition */
		unsigned long flags;

		spin_lock_irqsave(&pt->pt_irqfds_lock, flags);

		/*
		 * We must check if someone deactivated the irqfd before
		 * we could acquire the pt_irqfds_lock since the item is
		 * deactivated from the mshv side before it is unhooked from
		 * the wait-queue.  If it is already deactivated, we can
		 * simply return knowing the other side will cleanup for us.
		 * We cannot race against the irqfd going away since the
		 * other side is required to acquire wqh->lock, which we hold
		 */
		if (mshv_irqfd_is_active(irqfd))
			mshv_irqfd_deactivate(irqfd);

		spin_unlock_irqrestore(&pt->pt_irqfds_lock, flags);
	}

	return ret;
}

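/*
 * Refresh the cached routing entry for this irqfd. The update is bracketed
 * by the irqfd_irqe_sc seqcount so that the injection paths never observe a
 * half-written guest_irq_num/lapic_irq pair.
 */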
/* Must be called under pt_irqfds_lock */
static void mshv_irqfd_update(struct mshv_partition *pt,
			      struct mshv_irqfd *irqfd)
{
	write_seqcount_begin(&irqfd->irqfd_irqe_sc);
	irqfd->irqfd_girq_ent = mshv_ret_girq_entry(pt,
						    irqfd->irqfd_irqnum);
	mshv_copy_girq_info(&irqfd->irqfd_girq_ent, &irqfd->irqfd_lapic_irq);
	write_seqcount_end(&irqfd->irqfd_irqe_sc);
}

void mshv_irqfd_routing_update(struct mshv_partition *pt)
{
	struct mshv_irqfd *irqfd;

	spin_lock_irq(&pt->pt_irqfds_lock);
	hlist_for_each_entry(irqfd, &pt->pt_irqfds_list, irqfd_hnode)
		mshv_irqfd_update(pt, irqfd);
	spin_unlock_irq(&pt->pt_irqfds_lock);
}

static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh,
				  poll_table *polltbl)
{
	struct mshv_irqfd *irqfd =
			container_of(polltbl, struct mshv_irqfd, irqfd_polltbl);

	irqfd->irqfd_wqh = wqh;

	/*
	 * TODO: Ensure there isn't already an exclusive, priority waiter, e.g.
	 * that the irqfd isn't already bound to another partition.  Only the
	 * first exclusive waiter encountered will be notified, and
	 * add_wait_queue_priority() doesn't enforce exclusivity.
	 */
	irqfd->irqfd_wait.flags |= WQ_FLAG_EXCLUSIVE;
	add_wait_queue_priority(wqh, &irqfd->irqfd_wait);
}

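/*
 * Bind an eventfd to a GSI. The irqfd is allocated, optionally attached to a
 * (possibly shared) resampler for level-triggered interrupts, hooked into the
 * eventfd's wait-queue via the custom poll callbacks above, and finally
 * polled once so an event signalled before registration is not lost.
 * A given eventfd may only back one irqfd per partition (-EBUSY otherwise).
 */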
static int mshv_irqfd_assign(struct mshv_partition *pt,
			     struct mshv_user_irqfd *args)
{
	struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
	struct mshv_irqfd *irqfd, *tmp;
	unsigned int events;
	int ret;
	int idx;

	CLASS(fd, f)(args->fd);

	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
	if (!irqfd)
		return -ENOMEM;

	irqfd->irqfd_partn = pt;
	irqfd->irqfd_irqnum = args->gsi;
	INIT_WORK(&irqfd->irqfd_shutdown, mshv_irqfd_shutdown);
	seqcount_spinlock_init(&irqfd->irqfd_irqe_sc, &pt->pt_irqfds_lock);

	if (fd_empty(f)) {
		ret = -EBADF;
		goto out;
	}

	eventfd = eventfd_ctx_fileget(fd_file(f));
	if (IS_ERR(eventfd)) {
		ret = PTR_ERR(eventfd);
		goto fail;
	}

	irqfd->irqfd_eventfd_ctx = eventfd;

	if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE)) {
		struct mshv_irqfd_resampler *rp;

		resamplefd = eventfd_ctx_fdget(args->resamplefd);
		if (IS_ERR(resamplefd)) {
			ret = PTR_ERR(resamplefd);
			goto fail;
		}

		irqfd->irqfd_resamplefd = resamplefd;

		mutex_lock(&pt->irqfds_resampler_lock);

		hlist_for_each_entry(rp, &pt->irqfds_resampler_list,
				     rsmplr_hnode) {
			if (rp->rsmplr_notifier.irq_ack_gsi ==
							 irqfd->irqfd_irqnum) {
				irqfd->irqfd_resampler = rp;
				break;
			}
		}

		if (!irqfd->irqfd_resampler) {
			rp = kzalloc(sizeof(*rp), GFP_KERNEL_ACCOUNT);
			if (!rp) {
				ret = -ENOMEM;
				mutex_unlock(&pt->irqfds_resampler_lock);
				goto fail;
			}

			rp->rsmplr_partn = pt;
			INIT_HLIST_HEAD(&rp->rsmplr_irqfd_list);
			rp->rsmplr_notifier.irq_ack_gsi = irqfd->irqfd_irqnum;
			rp->rsmplr_notifier.irq_acked =
						      mshv_irqfd_resampler_ack;

			hlist_add_head(&rp->rsmplr_hnode,
				       &pt->irqfds_resampler_list);
			mshv_register_irq_ack_notifier(pt,
						       &rp->rsmplr_notifier);
			irqfd->irqfd_resampler = rp;
		}

		hlist_add_head_rcu(&irqfd->irqfd_resampler_hnode,
				   &irqfd->irqfd_resampler->rsmplr_irqfd_list);

		mutex_unlock(&pt->irqfds_resampler_lock);
	}

	/*
	 * Install our own custom wake-up handling so we are notified via
	 * a callback whenever someone signals the underlying eventfd
	 */
	init_waitqueue_func_entry(&irqfd->irqfd_wait, mshv_irqfd_wakeup);
	init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc);

	spin_lock_irq(&pt->pt_irqfds_lock);
#if IS_ENABLED(CONFIG_X86)
	if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) &&
	    !irqfd->irqfd_lapic_irq.lapic_control.level_triggered) {
		/*
		 * A resample fd is only valid for level-triggered
		 * interrupts; otherwise fail the request.
		 */
		spin_unlock_irq(&pt->pt_irqfds_lock);
		ret = -EINVAL;
		goto fail;
	}
#endif
	ret = 0;
	hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) {
		if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx)
			continue;
		/* This fd is used for another irq already. */
		ret = -EBUSY;
		spin_unlock_irq(&pt->pt_irqfds_lock);
		goto fail;
	}

	idx = srcu_read_lock(&pt->pt_irq_srcu);
	mshv_irqfd_update(pt, irqfd);
	hlist_add_head(&irqfd->irqfd_hnode, &pt->pt_irqfds_list);
	spin_unlock_irq(&pt->pt_irqfds_lock);

	/*
	 * Check if there was an event already pending on the eventfd
	 * before we registered, and trigger it as if we didn't miss it.
	 */
	events = vfs_poll(fd_file(f), &irqfd->irqfd_polltbl);

	if (events & POLLIN)
		mshv_assert_irq_slow(irqfd);

	srcu_read_unlock(&pt->pt_irq_srcu, idx);
	return 0;

fail:
	if (irqfd->irqfd_resampler)
		mshv_irqfd_resampler_shutdown(irqfd);

	if (resamplefd && !IS_ERR(resamplefd))
		eventfd_ctx_put(resamplefd);

	if (eventfd && !IS_ERR(eventfd))
		eventfd_ctx_put(eventfd);

out:
	kfree(irqfd);
	return ret;
}

/*
 * Shut down any irqfds that match fd+gsi
 */
static int mshv_irqfd_deassign(struct mshv_partition *pt,
			       struct mshv_user_irqfd *args)
{
	struct mshv_irqfd *irqfd;
	struct hlist_node *n;
	struct eventfd_ctx *eventfd;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list,
				  irqfd_hnode) {
		if (irqfd->irqfd_eventfd_ctx == eventfd &&
		    irqfd->irqfd_irqnum == args->gsi)
			mshv_irqfd_deactivate(irqfd);
	}

	eventfd_ctx_put(eventfd);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * so that we guarantee there will not be any more interrupts on this
	 * gsi once this deassign function returns.
	 */
	flush_workqueue(irqfd_cleanup_wq);

	return 0;
}

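/*
 * ioctl-level entry point for binding/unbinding an irqfd. A minimal usage
 * sketch from userspace (the ioctl name below is assumed here; the
 * authoritative definition lives in the mshv uapi header):
 *
 *	struct mshv_user_irqfd args = {
 *		.fd    = evt_fd,	// eventfd to listen on
 *		.gsi   = gsi,		// guest interrupt number
 *		.flags = 0,		// or BIT(MSHV_IRQFD_BIT_RESAMPLE)
 *	};
 *	ioctl(partition_fd, MSHV_IRQFD, &args);	// assumed ioctl name
 *
 * Passing BIT(MSHV_IRQFD_BIT_DEASSIGN) in flags tears the binding down again.
 */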
int mshv_set_unset_irqfd(struct mshv_partition *pt,
			 struct mshv_user_irqfd *args)
{
	if (args->flags & ~MSHV_IRQFD_FLAGS_MASK)
		return -EINVAL;

	if (args->flags & BIT(MSHV_IRQFD_BIT_DEASSIGN))
		return mshv_irqfd_deassign(pt, args);

	return mshv_irqfd_assign(pt, args);
}

/*
 * This function is called as the mshv VM fd is being released.
 * Shut down all irqfds that still remain open.
 */
static void mshv_irqfd_release(struct mshv_partition *pt)
{
	struct mshv_irqfd *irqfd;
	struct hlist_node *n;

	spin_lock_irq(&pt->pt_irqfds_lock);

	hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list, irqfd_hnode)
		mshv_irqfd_deactivate(irqfd);

	spin_unlock_irq(&pt->pt_irqfds_lock);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * since we do not take an mshv_partition* reference.
	 */
	flush_workqueue(irqfd_cleanup_wq);
}

int mshv_irqfd_wq_init(void)
{
	irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", WQ_PERCPU, 0);
	if (!irqfd_cleanup_wq)
		return -ENOMEM;

	return 0;
}

void mshv_irqfd_wq_cleanup(void)
{
	destroy_workqueue(irqfd_cleanup_wq);
}

/*
 * --------------------------------------------------------------------
 * ioeventfd: translate an MMIO write into an eventfd signal.
 *
 * Userspace can register an MMIO address with an eventfd and is then
 * notified whenever the guest writes to that address.
 * --------------------------------------------------------------------
 */

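/*
 * Each ioeventfd is backed by a hypervisor doorbell registered on the guest
 * physical address: mshv_register_doorbell() arranges for
 * ioeventfd_mmio_write() to be called when the guest writes that address,
 * and the callback signals the matching eventfd.
 */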
static void ioeventfd_release(struct mshv_ioeventfd *p, u64 partition_id)
{
	if (p->iovntfd_doorbell_id > 0)
		mshv_unregister_doorbell(partition_id, p->iovntfd_doorbell_id);
	eventfd_ctx_put(p->iovntfd_eventfd);
	kfree(p);
}

/* MMIO writes trigger an event if the addr/val match */
static void ioeventfd_mmio_write(int doorbell_id, void *data)
{
	struct mshv_partition *partition = (struct mshv_partition *)data;
	struct mshv_ioeventfd *p;

	rcu_read_lock();
	hlist_for_each_entry_rcu(p, &partition->ioeventfds_list, iovntfd_hnode)
		if (p->iovntfd_doorbell_id == doorbell_id) {
			eventfd_signal(p->iovntfd_eventfd);
			break;
		}

	rcu_read_unlock();
}

static bool ioeventfd_check_collision(struct mshv_partition *pt,
				      struct mshv_ioeventfd *p)
	__must_hold(&pt->mutex)
{
	struct mshv_ioeventfd *_p;

	hlist_for_each_entry(_p, &pt->ioeventfds_list, iovntfd_hnode)
		if (_p->iovntfd_addr == p->iovntfd_addr &&
		    _p->iovntfd_length == p->iovntfd_length &&
		    (_p->iovntfd_wildcard || p->iovntfd_wildcard ||
		     _p->iovntfd_datamatch == p->iovntfd_datamatch))
			return true;

	return false;
}

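/*
 * Register a new ioeventfd. The requested length (0/1/2/4/8 bytes) selects
 * the doorbell trigger size; with no datamatch the entry is a wildcard and
 * HV_DOORBELL_FLAG_TRIGGER_ANY_VALUE is set so any written value fires it.
 * A duplicate addr/len/datamatch registration is rejected with -EEXIST.
 */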
static int mshv_assign_ioeventfd(struct mshv_partition *pt,
				 struct mshv_user_ioeventfd *args)
	__must_hold(&pt->mutex)
{
	struct mshv_ioeventfd *p;
	struct eventfd_ctx *eventfd;
	u64 doorbell_flags = 0;
	int ret;

	/* pt_mutex is currently protecting the ioeventfds_list */
	WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex));

	if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO))
		return -EOPNOTSUPP;

	/* length must be 0 (any size) or a natural word size up to 8 bytes */
	switch (args->len) {
	case 0:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_ANY;
		break;
	case 1:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_BYTE;
		break;
	case 2:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_WORD;
		break;
	case 4:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_DWORD;
		break;
	case 8:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_QWORD;
		break;
	default:
		return -EINVAL;
	}

	/* check for range overflow */
	if (args->addr + args->len < args->addr)
		return -EINVAL;

	/* check for extra flags that we don't understand */
	if (args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK)
		return -EINVAL;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p) {
		ret = -ENOMEM;
		goto fail;
	}

	p->iovntfd_addr = args->addr;
	p->iovntfd_length  = args->len;
	p->iovntfd_eventfd = eventfd;

	/* The datamatch feature is optional, otherwise this is a wildcard */
	if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH)) {
		p->iovntfd_datamatch = args->datamatch;
	} else {
		p->iovntfd_wildcard = true;
		doorbell_flags |= HV_DOORBELL_FLAG_TRIGGER_ANY_VALUE;
	}

	if (ioeventfd_check_collision(pt, p)) {
		ret = -EEXIST;
		goto unlock_fail;
	}

	ret = mshv_register_doorbell(pt->pt_id, ioeventfd_mmio_write,
				     (void *)pt, p->iovntfd_addr,
				     p->iovntfd_datamatch, doorbell_flags);
	if (ret < 0)
		goto unlock_fail;

	p->iovntfd_doorbell_id = ret;

	hlist_add_head_rcu(&p->iovntfd_hnode, &pt->ioeventfds_list);

	return 0;

unlock_fail:
	kfree(p);

fail:
	eventfd_ctx_put(eventfd);

	return ret;
}

static int mshv_deassign_ioeventfd(struct mshv_partition *pt,
				   struct mshv_user_ioeventfd *args)
	__must_hold(&pt->mutex)
{
	struct mshv_ioeventfd *p;
	struct eventfd_ctx *eventfd;
	struct hlist_node *n;
	int ret = -ENOENT;

	/* pt_mutex is currently protecting the ioeventfds_list */
	WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex));

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	hlist_for_each_entry_safe(p, n, &pt->ioeventfds_list, iovntfd_hnode) {
		bool wildcard = !(args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH));

		if (p->iovntfd_eventfd != eventfd  ||
		    p->iovntfd_addr != args->addr  ||
		    p->iovntfd_length != args->len ||
		    p->iovntfd_wildcard != wildcard)
			continue;

		if (!p->iovntfd_wildcard &&
		    p->iovntfd_datamatch != args->datamatch)
			continue;

		hlist_del_rcu(&p->iovntfd_hnode);
		synchronize_rcu();
		ioeventfd_release(p, pt->pt_id);
		ret = 0;
		break;
	}

	eventfd_ctx_put(eventfd);

	return ret;
}

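/*
 * ioctl-level entry point for binding/unbinding an ioeventfd. A minimal
 * usage sketch from userspace (the ioctl name below is assumed here; see
 * the mshv uapi header for the authoritative definition):
 *
 *	struct mshv_user_ioeventfd args = {
 *		.fd        = evt_fd,	// eventfd to signal
 *		.addr      = gpa,	// guest physical MMIO address
 *		.len       = 4,		// 0, 1, 2, 4 or 8 bytes
 *		.datamatch = 0,
 *		.flags     = 0,		// or BIT(MSHV_IOEVENTFD_BIT_DATAMATCH)
 *	};
 *	ioctl(partition_fd, MSHV_IOEVENTFD, &args);	// assumed ioctl name
 *
 * Setting BIT(MSHV_IOEVENTFD_BIT_DEASSIGN) in flags removes the binding.
 */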
int mshv_set_unset_ioeventfd(struct mshv_partition *pt,
			     struct mshv_user_ioeventfd *args)
	__must_hold(&pt->mutex)
{
	if ((args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK) ||
	    mshv_field_nonzero(*args, rsvd))
		return -EINVAL;

	/* PIO not yet implemented */
	if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO))
		return -EOPNOTSUPP;

	if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DEASSIGN))
		return mshv_deassign_ioeventfd(pt, args);

	return mshv_assign_ioeventfd(pt, args);
}

void mshv_eventfd_init(struct mshv_partition *pt)
{
	spin_lock_init(&pt->pt_irqfds_lock);
	INIT_HLIST_HEAD(&pt->pt_irqfds_list);

	INIT_HLIST_HEAD(&pt->irqfds_resampler_list);
	mutex_init(&pt->irqfds_resampler_lock);

	INIT_HLIST_HEAD(&pt->ioeventfds_list);
}

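/*
 * Partition teardown: detach all ioeventfds (the list is emptied first and
 * an RCU grace period observed so ioeventfd_mmio_write() can no longer see
 * the entries), then shut down any remaining irqfds.
 */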
void mshv_eventfd_release(struct mshv_partition *pt)
{
	struct hlist_head items;
	struct hlist_node *n;
	struct mshv_ioeventfd *p;

	hlist_move_list(&pt->ioeventfds_list, &items);
	synchronize_rcu();

	hlist_for_each_entry_safe(p, n, &items, iovntfd_hnode) {
		hlist_del(&p->iovntfd_hnode);
		ioeventfd_release(p, pt->pt_id);
	}

	mshv_irqfd_release(pt);
}