xref: /linux/drivers/vdpa/vdpa_user/vduse_dev.c (revision 55d0969c451159cff86949b38c39171cab962069)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VDUSE: vDPA Device in Userspace
4  *
5  * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved.
6  *
7  * Author: Xie Yongji <xieyongji@bytedance.com>
8  *
9  */
10 
11 #include "linux/virtio_net.h"
12 #include <linux/init.h>
13 #include <linux/module.h>
14 #include <linux/cdev.h>
15 #include <linux/device.h>
16 #include <linux/eventfd.h>
17 #include <linux/slab.h>
18 #include <linux/wait.h>
19 #include <linux/dma-map-ops.h>
20 #include <linux/poll.h>
21 #include <linux/file.h>
22 #include <linux/uio.h>
23 #include <linux/vdpa.h>
24 #include <linux/nospec.h>
25 #include <linux/vmalloc.h>
26 #include <linux/sched/mm.h>
27 #include <uapi/linux/vduse.h>
28 #include <uapi/linux/vdpa.h>
29 #include <uapi/linux/virtio_config.h>
30 #include <uapi/linux/virtio_ids.h>
31 #include <uapi/linux/virtio_blk.h>
32 #include <uapi/linux/virtio_ring.h>
33 #include <linux/mod_devicetable.h>
34 
35 #include "iova_domain.h"
36 
/* Module metadata strings. */
#define DRV_AUTHOR   "Yongji Xie <xieyongji@bytedance.com>"
#define DRV_DESC     "vDPA Device in Userspace"
#define DRV_LICENSE  "GPL v2"

/* Upper bound on VDUSE devices: one per value representable in MINORBITS. */
#define VDUSE_DEV_MAX (1U << MINORBITS)
/* Bounce buffer size limits and a 64 MB value (presumably the default), in bytes. */
#define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
#define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
#define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
/* 128 MB reserved for virtqueue creation */
#define VDUSE_IOVA_SIZE (VDUSE_MAX_BOUNCE_SIZE + 128 * 1024 * 1024)
/*
 * NOTE(review): presumably the initial value of vduse_dev::msg_timeout,
 * which vduse_dev_msg_sync() multiplies by HZ (i.e. seconds) - confirm.
 */
#define VDUSE_MSG_DEFAULT_TIMEOUT 30

/* irq_effective_cpu value meaning "use the unbound IRQ workqueue". */
#define IRQ_UNBOUND -1
50 
/* Per-virtqueue state of a VDUSE device. */
struct vduse_virtqueue {
	u16 index;		/* sent to userspace in VDUSE_GET_VQ_STATE requests */
	u16 num_max;		/* zero means not yet set up, see vduse_dev_is_ready() */
	u32 num;		/* size chosen by the driver; 0 until set / after reset */
	u64 desc_addr;		/* set by vduse_vdpa_set_vq_address() */
	u64 driver_addr;
	u64 device_addr;
	struct vdpa_vq_state state;	/* last state given via set_vq_state */
	bool ready;
	bool kicked;		/* a kick arrived while no kickfd was assigned */
	spinlock_t kick_lock;	/* protects kicked and kickfd */
	spinlock_t irq_lock;	/* protects cb */
	struct eventfd_ctx *kickfd;	/* eventfd signalled on kick, may be NULL */
	struct vdpa_callback cb;	/* driver's vq interrupt callback/trigger */
	struct work_struct inject;	/* handled by vduse_vq_irq_inject() */
	struct work_struct kick;	/* handled by vduse_vq_kick_work() */
	int irq_effective_cpu;	/* CPU for IRQ work, or IRQ_UNBOUND */
	struct cpumask irq_affinity;	/* allowed CPUs for irq_effective_cpu */
	struct kobject kobj;	/* NOTE(review): presumably the sysfs node - confirm */
};
71 
struct vduse_dev;

/*
 * Wrapper tying an embedded vdpa_device to its owning vduse_dev; the
 * embedded member lets vdpa_to_vduse() recover it via container_of().
 */
struct vduse_vdpa {
	struct vdpa_device vdpa;
	struct vduse_dev *dev;	/* back pointer to the owning VDUSE device */
};
78 
/* Userspace memory registered as bounce pages, see vduse_dev_reg_umem(). */
struct vduse_umem {
	unsigned long iova;	/* IOVA given at registration */
	unsigned long npages;	/* number of pinned pages in @pages */
	struct page **pages;	/* vmalloc'ed array of the pinned pages */
	struct mm_struct *mm;	/* mm charged in pinned_vm; held with mmgrab() */
};
85 
/* State of one VDUSE device. */
struct vduse_dev {
	struct vduse_vdpa *vdev;	/* cleared by vduse_vdpa_free() */
	struct device *dev;
	struct vduse_virtqueue **vqs;	/* vq_num entries, indexed by vq index */
	struct vduse_iova_domain *domain;	/* IOVA domain used by the DMA ops */
	char *name;
	struct mutex lock;
	spinlock_t msg_lock;	/* protects send_list, recv_list, msg_unique */
	u64 msg_unique;		/* source of the next request_id */
	u32 msg_timeout;	/* reply timeout in seconds; 0 waits without timeout */
	wait_queue_head_t waitq;	/* readers wait here for send_list entries */
	struct list_head send_list;	/* requests not yet read by userspace */
	struct list_head recv_list;	/* requests read, awaiting a response */
	struct vdpa_callback config_cb;	/* config-change callback, under irq_lock */
	struct work_struct inject;	/* handled by vduse_dev_irq_inject() */
	spinlock_t irq_lock;	/* protects config_cb */
	struct rw_semaphore rwsem;	/* write: reset; read: queueing IRQ work */
	int minor;
	bool broken;		/* set after a request timed out */
	bool connected;
	u64 api_version;
	u64 device_features;
	u64 driver_features;	/* cleared on reset */
	u32 device_id;
	u32 vendor_id;
	u32 generation;		/* incremented on every reset */
	u32 config_size;	/* size in bytes of @config */
	void *config;		/* device config space contents */
	u8 status;		/* virtio status; cleared on reset */
	u32 vq_num;		/* number of entries in @vqs */
	u32 vq_align;
	struct vduse_umem *umem;	/* registered user bounce memory or NULL */
	struct mutex mem_lock;	/* protects umem */
	unsigned int bounce_size;
	struct mutex domain_lock;	/* taken around domain lookups in the ioctl path */
};
122 
/* One request/response exchange between the kernel and the userspace daemon. */
struct vduse_dev_msg {
	struct vduse_dev_request req;	/* copied to userspace on read */
	struct vduse_dev_response resp;	/* filled from userspace on write */
	struct list_head list;		/* link in send_list or recv_list */
	wait_queue_head_t waitq;	/* the sender sleeps here until completed */
	bool completed;			/* set on response or when the device breaks */
};
130 
/*
 * NOTE(review): presumably per-open state of the control device holding the
 * negotiated API version - its users are outside this part of the file.
 */
struct vduse_control {
	u64 api_version;
};
134 
/* NOTE(review): vduse_lock presumably guards vduse_idr - confirm at the users. */
static DEFINE_MUTEX(vduse_lock);
static DEFINE_IDR(vduse_idr);

static dev_t vduse_major;
static struct cdev vduse_ctrl_cdev;
static struct cdev vduse_cdev;
/* Workqueues used by vduse_dev_queue_irq_work(): unbound and CPU-targeted. */
static struct workqueue_struct *vduse_irq_wq;
static struct workqueue_struct *vduse_irq_bound_wq;

/* Virtio device IDs listed as allowed for VDUSE devices. */
static u32 allowed_device_id[] = {
	VIRTIO_ID_BLOCK,
	VIRTIO_ID_NET,
};
148 
149 static inline struct vduse_dev *vdpa_to_vduse(struct vdpa_device *vdpa)
150 {
151 	struct vduse_vdpa *vdev = container_of(vdpa, struct vduse_vdpa, vdpa);
152 
153 	return vdev->dev;
154 }
155 
/* Resolve a generic struct device (of a vDPA device) to its VDUSE device. */
static inline struct vduse_dev *dev_to_vduse(struct device *dev)
{
	return vdpa_to_vduse(dev_to_vdpa(dev));
}
162 
163 static struct vduse_dev_msg *vduse_find_msg(struct list_head *head,
164 					    uint32_t request_id)
165 {
166 	struct vduse_dev_msg *msg;
167 
168 	list_for_each_entry(msg, head, list) {
169 		if (msg->req.request_id == request_id) {
170 			list_del(&msg->list);
171 			return msg;
172 		}
173 	}
174 
175 	return NULL;
176 }
177 
178 static struct vduse_dev_msg *vduse_dequeue_msg(struct list_head *head)
179 {
180 	struct vduse_dev_msg *msg = NULL;
181 
182 	if (!list_empty(head)) {
183 		msg = list_first_entry(head, struct vduse_dev_msg, list);
184 		list_del(&msg->list);
185 	}
186 
187 	return msg;
188 }
189 
/* Append @msg at the tail of @head (FIFO order with vduse_dequeue_msg()). */
static void vduse_enqueue_msg(struct list_head *head,
			      struct vduse_dev_msg *msg)
{
	list_add_tail(&msg->list, head);
}
195 
/*
 * Mark the device as broken and fail every outstanding message.
 *
 * Called from vduse_dev_msg_sync() with dev->msg_lock held.  Does nothing
 * if the device is already marked broken.
 */
static void vduse_dev_broken(struct vduse_dev *dev)
{
	struct vduse_dev_msg *msg, *tmp;

	if (unlikely(dev->broken))
		return;

	/* Gather the already-read messages together with the unread ones. */
	list_splice_init(&dev->recv_list, &dev->send_list);
	list_for_each_entry_safe(msg, tmp, &dev->send_list, list) {
		list_del(&msg->list);
		msg->completed = 1;
		msg->resp.result = VDUSE_REQ_RESULT_FAILED;
		/* Wake the sender sleeping in vduse_dev_msg_sync(). */
		wake_up(&msg->waitq);
	}
	dev->broken = true;
	/* Wake pollers/readers so they can observe the broken state. */
	wake_up(&dev->waitq);
}
213 
/*
 * Send @msg to userspace and sleep until it is answered.
 *
 * The message is given a fresh request_id, queued on dev->send_list and the
 * reader is woken.  The caller then sleeps (killable) until the message is
 * completed, for at most dev->msg_timeout seconds when that is non-zero.
 *
 * Returns 0 if userspace answered with VDUSE_REQ_RESULT_OK, -EIO otherwise
 * (device broken, wait killed, timeout, or a failed result).
 */
static int vduse_dev_msg_sync(struct vduse_dev *dev,
			      struct vduse_dev_msg *msg)
{
	int ret;

	/* Lockless fast path; rechecked under msg_lock below. */
	if (unlikely(dev->broken))
		return -EIO;

	init_waitqueue_head(&msg->waitq);
	spin_lock(&dev->msg_lock);
	if (unlikely(dev->broken)) {
		spin_unlock(&dev->msg_lock);
		return -EIO;
	}
	msg->req.request_id = dev->msg_unique++;
	vduse_enqueue_msg(&dev->send_list, msg);
	wake_up(&dev->waitq);
	spin_unlock(&dev->msg_lock);
	if (dev->msg_timeout)
		ret = wait_event_killable_timeout(msg->waitq, msg->completed,
						  (long)dev->msg_timeout * HZ);
	else
		ret = wait_event_killable(msg->waitq, msg->completed);

	spin_lock(&dev->msg_lock);
	if (!msg->completed) {
		/* Still queued on send_list or recv_list: take it back. */
		list_del(&msg->list);
		msg->resp.result = VDUSE_REQ_RESULT_FAILED;
		/* Mark the device as malfunction when there is a timeout */
		if (!ret)
			vduse_dev_broken(dev);
	}
	ret = (msg->resp.result == VDUSE_REQ_RESULT_OK) ? 0 : -EIO;
	spin_unlock(&dev->msg_lock);

	return ret;
}
251 
252 static int vduse_dev_get_vq_state_packed(struct vduse_dev *dev,
253 					 struct vduse_virtqueue *vq,
254 					 struct vdpa_vq_state_packed *packed)
255 {
256 	struct vduse_dev_msg msg = { 0 };
257 	int ret;
258 
259 	msg.req.type = VDUSE_GET_VQ_STATE;
260 	msg.req.vq_state.index = vq->index;
261 
262 	ret = vduse_dev_msg_sync(dev, &msg);
263 	if (ret)
264 		return ret;
265 
266 	packed->last_avail_counter =
267 			msg.resp.vq_state.packed.last_avail_counter & 0x0001;
268 	packed->last_avail_idx =
269 			msg.resp.vq_state.packed.last_avail_idx & 0x7FFF;
270 	packed->last_used_counter =
271 			msg.resp.vq_state.packed.last_used_counter & 0x0001;
272 	packed->last_used_idx =
273 			msg.resp.vq_state.packed.last_used_idx & 0x7FFF;
274 
275 	return 0;
276 }
277 
278 static int vduse_dev_get_vq_state_split(struct vduse_dev *dev,
279 					struct vduse_virtqueue *vq,
280 					struct vdpa_vq_state_split *split)
281 {
282 	struct vduse_dev_msg msg = { 0 };
283 	int ret;
284 
285 	msg.req.type = VDUSE_GET_VQ_STATE;
286 	msg.req.vq_state.index = vq->index;
287 
288 	ret = vduse_dev_msg_sync(dev, &msg);
289 	if (ret)
290 		return ret;
291 
292 	split->avail_index = msg.resp.vq_state.split.avail_index;
293 
294 	return 0;
295 }
296 
297 static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
298 {
299 	struct vduse_dev_msg msg = { 0 };
300 
301 	msg.req.type = VDUSE_SET_STATUS;
302 	msg.req.s.status = status;
303 
304 	return vduse_dev_msg_sync(dev, &msg);
305 }
306 
307 static int vduse_dev_update_iotlb(struct vduse_dev *dev,
308 				  u64 start, u64 last)
309 {
310 	struct vduse_dev_msg msg = { 0 };
311 
312 	if (last < start)
313 		return -EINVAL;
314 
315 	msg.req.type = VDUSE_UPDATE_IOTLB;
316 	msg.req.iova.start = start;
317 	msg.req.iova.last = last;
318 
319 	return vduse_dev_msg_sync(dev, &msg);
320 }
321 
/*
 * read() handler: hand the next pending request to userspace.
 *
 * Takes the first message from dev->send_list, copies its request to @to and
 * moves the message to dev->recv_list, where it waits for the response.
 *
 * Returns the request size on success, -EINVAL if the buffer is smaller
 * than one request, -EAGAIN if nothing is pending on a non-blocking file,
 * -EFAULT if the copy failed, or the error from the interruptible wait.
 */
static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct vduse_dev *dev = file->private_data;
	struct vduse_dev_msg *msg;
	int size = sizeof(struct vduse_dev_request);
	ssize_t ret;

	if (iov_iter_count(to) < size)
		return -EINVAL;

	spin_lock(&dev->msg_lock);
	while (1) {
		msg = vduse_dequeue_msg(&dev->send_list);
		if (msg)
			break;

		ret = -EAGAIN;
		if (file->f_flags & O_NONBLOCK)
			goto unlock;

		/* Drop the spinlock before sleeping; retry the dequeue after. */
		spin_unlock(&dev->msg_lock);
		ret = wait_event_interruptible_exclusive(dev->waitq,
					!list_empty(&dev->send_list));
		if (ret)
			return ret;

		spin_lock(&dev->msg_lock);
	}
	/* The copy to userspace is done without the spinlock held. */
	spin_unlock(&dev->msg_lock);
	ret = copy_to_iter(&msg->req, size, to);
	spin_lock(&dev->msg_lock);
	if (ret != size) {
		ret = -EFAULT;
		/* Requeue so the request can be read again. */
		vduse_enqueue_msg(&dev->send_list, msg);
		goto unlock;
	}
	vduse_enqueue_msg(&dev->recv_list, msg);
unlock:
	spin_unlock(&dev->msg_lock);

	return ret;
}
365 
/*
 * Return true when the first @size bytes at @ptr are all zero.  A @size of
 * zero or less examines nothing and yields true.
 */
static bool is_mem_zero(const char *ptr, int size)
{
	while (size-- > 0) {
		if (*ptr++)
			return false;
	}

	return true;
}
376 
377 static ssize_t vduse_dev_write_iter(struct kiocb *iocb, struct iov_iter *from)
378 {
379 	struct file *file = iocb->ki_filp;
380 	struct vduse_dev *dev = file->private_data;
381 	struct vduse_dev_response resp;
382 	struct vduse_dev_msg *msg;
383 	size_t ret;
384 
385 	ret = copy_from_iter(&resp, sizeof(resp), from);
386 	if (ret != sizeof(resp))
387 		return -EINVAL;
388 
389 	if (!is_mem_zero((const char *)resp.reserved, sizeof(resp.reserved)))
390 		return -EINVAL;
391 
392 	spin_lock(&dev->msg_lock);
393 	msg = vduse_find_msg(&dev->recv_list, resp.request_id);
394 	if (!msg) {
395 		ret = -ENOENT;
396 		goto unlock;
397 	}
398 
399 	memcpy(&msg->resp, &resp, sizeof(resp));
400 	msg->completed = 1;
401 	wake_up(&msg->waitq);
402 unlock:
403 	spin_unlock(&dev->msg_lock);
404 
405 	return ret;
406 }
407 
/*
 * poll() handler.  Reports EPOLLERR when the device is broken, readable
 * when requests are pending on send_list, and writable when requests on
 * recv_list are waiting for a response.
 */
static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
{
	struct vduse_dev *dev = file->private_data;
	__poll_t mask = 0;

	poll_wait(file, &dev->waitq, wait);

	/* Sample the list and broken state consistently under msg_lock. */
	spin_lock(&dev->msg_lock);

	if (unlikely(dev->broken))
		mask |= EPOLLERR;
	if (!list_empty(&dev->send_list))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (!list_empty(&dev->recv_list))
		mask |= EPOLLOUT | EPOLLWRNORM;

	spin_unlock(&dev->msg_lock);

	return mask;
}
428 
/*
 * Return the device and all of its virtqueues to their initial state.
 *
 * Resets the bounce map (if any), clears status and driver features, bumps
 * the config generation, drops the config and vq callbacks, releases the
 * kick eventfds and flushes the pending inject/kick work items.  The state
 * changes are made with dev->rwsem held for writing.
 */
static void vduse_dev_reset(struct vduse_dev *dev)
{
	int i;
	struct vduse_iova_domain *domain = dev->domain;

	/* The coherent mappings are handled in vduse_dev_free_coherent() */
	if (domain && domain->bounce_map)
		vduse_domain_reset_bounce_map(domain);

	down_write(&dev->rwsem);

	dev->status = 0;
	dev->driver_features = 0;
	dev->generation++;
	spin_lock(&dev->irq_lock);
	dev->config_cb.callback = NULL;
	dev->config_cb.private = NULL;
	spin_unlock(&dev->irq_lock);
	/* Wait for a config IRQ injection that may already be running. */
	flush_work(&dev->inject);

	for (i = 0; i < dev->vq_num; i++) {
		struct vduse_virtqueue *vq = dev->vqs[i];

		vq->ready = false;
		vq->desc_addr = 0;
		vq->driver_addr = 0;
		vq->device_addr = 0;
		vq->num = 0;
		memset(&vq->state, 0, sizeof(vq->state));

		/* Forget any deferred kick and drop the eventfd reference. */
		spin_lock(&vq->kick_lock);
		vq->kicked = false;
		if (vq->kickfd)
			eventfd_ctx_put(vq->kickfd);
		vq->kickfd = NULL;
		spin_unlock(&vq->kick_lock);

		spin_lock(&vq->irq_lock);
		vq->cb.callback = NULL;
		vq->cb.private = NULL;
		vq->cb.trigger = NULL;
		spin_unlock(&vq->irq_lock);
		/* Wait for in-flight IRQ injection and kick work of this vq. */
		flush_work(&vq->inject);
		flush_work(&vq->kick);
	}

	up_write(&dev->rwsem);
}
477 
478 static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
479 				u64 desc_area, u64 driver_area,
480 				u64 device_area)
481 {
482 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
483 	struct vduse_virtqueue *vq = dev->vqs[idx];
484 
485 	vq->desc_addr = desc_area;
486 	vq->driver_addr = driver_area;
487 	vq->device_addr = device_area;
488 
489 	return 0;
490 }
491 
492 static void vduse_vq_kick(struct vduse_virtqueue *vq)
493 {
494 	spin_lock(&vq->kick_lock);
495 	if (!vq->ready)
496 		goto unlock;
497 
498 	if (vq->kickfd)
499 		eventfd_signal(vq->kickfd);
500 	else
501 		vq->kicked = true;
502 unlock:
503 	spin_unlock(&vq->kick_lock);
504 }
505 
506 static void vduse_vq_kick_work(struct work_struct *work)
507 {
508 	struct vduse_virtqueue *vq = container_of(work,
509 					struct vduse_virtqueue, kick);
510 
511 	vduse_vq_kick(vq);
512 }
513 
514 static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx)
515 {
516 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
517 	struct vduse_virtqueue *vq = dev->vqs[idx];
518 
519 	if (!eventfd_signal_allowed()) {
520 		schedule_work(&vq->kick);
521 		return;
522 	}
523 	vduse_vq_kick(vq);
524 }
525 
526 static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
527 			      struct vdpa_callback *cb)
528 {
529 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
530 	struct vduse_virtqueue *vq = dev->vqs[idx];
531 
532 	spin_lock(&vq->irq_lock);
533 	vq->cb.callback = cb->callback;
534 	vq->cb.private = cb->private;
535 	vq->cb.trigger = cb->trigger;
536 	spin_unlock(&vq->irq_lock);
537 }
538 
539 static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num)
540 {
541 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
542 	struct vduse_virtqueue *vq = dev->vqs[idx];
543 
544 	vq->num = num;
545 }
546 
547 static u16 vduse_vdpa_get_vq_size(struct vdpa_device *vdpa, u16 idx)
548 {
549 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
550 	struct vduse_virtqueue *vq = dev->vqs[idx];
551 
552 	if (vq->num)
553 		return vq->num;
554 	else
555 		return vq->num_max;
556 }
557 
558 static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa,
559 					u16 idx, bool ready)
560 {
561 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
562 	struct vduse_virtqueue *vq = dev->vqs[idx];
563 
564 	vq->ready = ready;
565 }
566 
567 static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx)
568 {
569 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
570 	struct vduse_virtqueue *vq = dev->vqs[idx];
571 
572 	return vq->ready;
573 }
574 
575 static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx,
576 				const struct vdpa_vq_state *state)
577 {
578 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
579 	struct vduse_virtqueue *vq = dev->vqs[idx];
580 
581 	if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
582 		vq->state.packed.last_avail_counter =
583 				state->packed.last_avail_counter;
584 		vq->state.packed.last_avail_idx = state->packed.last_avail_idx;
585 		vq->state.packed.last_used_counter =
586 				state->packed.last_used_counter;
587 		vq->state.packed.last_used_idx = state->packed.last_used_idx;
588 	} else
589 		vq->state.split.avail_index = state->split.avail_index;
590 
591 	return 0;
592 }
593 
594 static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
595 				struct vdpa_vq_state *state)
596 {
597 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
598 	struct vduse_virtqueue *vq = dev->vqs[idx];
599 
600 	if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED))
601 		return vduse_dev_get_vq_state_packed(dev, vq, &state->packed);
602 
603 	return vduse_dev_get_vq_state_split(dev, vq, &state->split);
604 }
605 
606 static u32 vduse_vdpa_get_vq_align(struct vdpa_device *vdpa)
607 {
608 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
609 
610 	return dev->vq_align;
611 }
612 
613 static u64 vduse_vdpa_get_device_features(struct vdpa_device *vdpa)
614 {
615 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
616 
617 	return dev->device_features;
618 }
619 
620 static int vduse_vdpa_set_driver_features(struct vdpa_device *vdpa, u64 features)
621 {
622 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
623 
624 	dev->driver_features = features;
625 	return 0;
626 }
627 
628 static u64 vduse_vdpa_get_driver_features(struct vdpa_device *vdpa)
629 {
630 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
631 
632 	return dev->driver_features;
633 }
634 
635 static void vduse_vdpa_set_config_cb(struct vdpa_device *vdpa,
636 				  struct vdpa_callback *cb)
637 {
638 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
639 
640 	spin_lock(&dev->irq_lock);
641 	dev->config_cb.callback = cb->callback;
642 	dev->config_cb.private = cb->private;
643 	spin_unlock(&dev->irq_lock);
644 }
645 
646 static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
647 {
648 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
649 	u16 num_max = 0;
650 	int i;
651 
652 	for (i = 0; i < dev->vq_num; i++)
653 		if (num_max < dev->vqs[i]->num_max)
654 			num_max = dev->vqs[i]->num_max;
655 
656 	return num_max;
657 }
658 
659 static u32 vduse_vdpa_get_device_id(struct vdpa_device *vdpa)
660 {
661 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
662 
663 	return dev->device_id;
664 }
665 
666 static u32 vduse_vdpa_get_vendor_id(struct vdpa_device *vdpa)
667 {
668 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
669 
670 	return dev->vendor_id;
671 }
672 
673 static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa)
674 {
675 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
676 
677 	return dev->status;
678 }
679 
680 static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status)
681 {
682 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
683 
684 	if (vduse_dev_set_status(dev, status))
685 		return;
686 
687 	dev->status = status;
688 }
689 
690 static size_t vduse_vdpa_get_config_size(struct vdpa_device *vdpa)
691 {
692 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
693 
694 	return dev->config_size;
695 }
696 
697 static void vduse_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset,
698 				  void *buf, unsigned int len)
699 {
700 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
701 
702 	/* Initialize the buffer in case of partial copy. */
703 	memset(buf, 0, len);
704 
705 	if (offset > dev->config_size)
706 		return;
707 
708 	if (len > dev->config_size - offset)
709 		len = dev->config_size - offset;
710 
711 	memcpy(buf, dev->config + offset, len);
712 }
713 
/* vDPA op: intentionally a no-op; writes from the driver are discarded. */
static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset,
			const void *buf, unsigned int len)
{
	/* Now we only support read-only configuration space */
}
719 
/*
 * vDPA op: reset the device.  Userspace is asked to set status 0, then the
 * kernel-side state is reset regardless of the outcome.  Returns the result
 * of the status request.
 */
static int vduse_vdpa_reset(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	int err;

	err = vduse_dev_set_status(dev, 0);
	vduse_dev_reset(dev);

	return err;
}
729 
730 static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
731 {
732 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
733 
734 	return dev->generation;
735 }
736 
737 static int vduse_vdpa_set_vq_affinity(struct vdpa_device *vdpa, u16 idx,
738 				      const struct cpumask *cpu_mask)
739 {
740 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
741 
742 	if (cpu_mask)
743 		cpumask_copy(&dev->vqs[idx]->irq_affinity, cpu_mask);
744 	else
745 		cpumask_setall(&dev->vqs[idx]->irq_affinity);
746 
747 	return 0;
748 }
749 
750 static const struct cpumask *
751 vduse_vdpa_get_vq_affinity(struct vdpa_device *vdpa, u16 idx)
752 {
753 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
754 
755 	return &dev->vqs[idx]->irq_affinity;
756 }
757 
758 static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
759 				unsigned int asid,
760 				struct vhost_iotlb *iotlb)
761 {
762 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
763 	int ret;
764 
765 	ret = vduse_domain_set_map(dev->domain, iotlb);
766 	if (ret)
767 		return ret;
768 
769 	ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
770 	if (ret) {
771 		vduse_domain_clear_map(dev->domain, iotlb);
772 		return ret;
773 	}
774 
775 	return 0;
776 }
777 
778 static void vduse_vdpa_free(struct vdpa_device *vdpa)
779 {
780 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
781 
782 	dev->vdev = NULL;
783 }
784 
/* vDPA config operations implemented by VDUSE (handlers defined above). */
static const struct vdpa_config_ops vduse_vdpa_config_ops = {
	.set_vq_address		= vduse_vdpa_set_vq_address,
	.kick_vq		= vduse_vdpa_kick_vq,
	.set_vq_cb		= vduse_vdpa_set_vq_cb,
	.set_vq_num             = vduse_vdpa_set_vq_num,
	.get_vq_size		= vduse_vdpa_get_vq_size,
	.set_vq_ready		= vduse_vdpa_set_vq_ready,
	.get_vq_ready		= vduse_vdpa_get_vq_ready,
	.set_vq_state		= vduse_vdpa_set_vq_state,
	.get_vq_state		= vduse_vdpa_get_vq_state,
	.get_vq_align		= vduse_vdpa_get_vq_align,
	.get_device_features	= vduse_vdpa_get_device_features,
	.set_driver_features	= vduse_vdpa_set_driver_features,
	.get_driver_features	= vduse_vdpa_get_driver_features,
	.set_config_cb		= vduse_vdpa_set_config_cb,
	.get_vq_num_max		= vduse_vdpa_get_vq_num_max,
	.get_device_id		= vduse_vdpa_get_device_id,
	.get_vendor_id		= vduse_vdpa_get_vendor_id,
	.get_status		= vduse_vdpa_get_status,
	.set_status		= vduse_vdpa_set_status,
	.get_config_size	= vduse_vdpa_get_config_size,
	.get_config		= vduse_vdpa_get_config,
	.set_config		= vduse_vdpa_set_config,
	.get_generation		= vduse_vdpa_get_generation,
	.set_vq_affinity	= vduse_vdpa_set_vq_affinity,
	.get_vq_affinity	= vduse_vdpa_get_vq_affinity,
	.reset			= vduse_vdpa_reset,
	.set_map		= vduse_vdpa_set_map,
	.free			= vduse_vdpa_free,
};
815 
816 static void vduse_dev_sync_single_for_device(struct device *dev,
817 					     dma_addr_t dma_addr, size_t size,
818 					     enum dma_data_direction dir)
819 {
820 	struct vduse_dev *vdev = dev_to_vduse(dev);
821 	struct vduse_iova_domain *domain = vdev->domain;
822 
823 	vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
824 }
825 
826 static void vduse_dev_sync_single_for_cpu(struct device *dev,
827 					     dma_addr_t dma_addr, size_t size,
828 					     enum dma_data_direction dir)
829 {
830 	struct vduse_dev *vdev = dev_to_vduse(dev);
831 	struct vduse_iova_domain *domain = vdev->domain;
832 
833 	vduse_domain_sync_single_for_cpu(domain, dma_addr, size, dir);
834 }
835 
836 static dma_addr_t vduse_dev_map_page(struct device *dev, struct page *page,
837 				     unsigned long offset, size_t size,
838 				     enum dma_data_direction dir,
839 				     unsigned long attrs)
840 {
841 	struct vduse_dev *vdev = dev_to_vduse(dev);
842 	struct vduse_iova_domain *domain = vdev->domain;
843 
844 	return vduse_domain_map_page(domain, page, offset, size, dir, attrs);
845 }
846 
847 static void vduse_dev_unmap_page(struct device *dev, dma_addr_t dma_addr,
848 				size_t size, enum dma_data_direction dir,
849 				unsigned long attrs)
850 {
851 	struct vduse_dev *vdev = dev_to_vduse(dev);
852 	struct vduse_iova_domain *domain = vdev->domain;
853 
854 	return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
855 }
856 
857 static void *vduse_dev_alloc_coherent(struct device *dev, size_t size,
858 					dma_addr_t *dma_addr, gfp_t flag,
859 					unsigned long attrs)
860 {
861 	struct vduse_dev *vdev = dev_to_vduse(dev);
862 	struct vduse_iova_domain *domain = vdev->domain;
863 	unsigned long iova;
864 	void *addr;
865 
866 	*dma_addr = DMA_MAPPING_ERROR;
867 	addr = vduse_domain_alloc_coherent(domain, size,
868 				(dma_addr_t *)&iova, flag, attrs);
869 	if (!addr)
870 		return NULL;
871 
872 	*dma_addr = (dma_addr_t)iova;
873 
874 	return addr;
875 }
876 
877 static void vduse_dev_free_coherent(struct device *dev, size_t size,
878 					void *vaddr, dma_addr_t dma_addr,
879 					unsigned long attrs)
880 {
881 	struct vduse_dev *vdev = dev_to_vduse(dev);
882 	struct vduse_iova_domain *domain = vdev->domain;
883 
884 	vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs);
885 }
886 
887 static size_t vduse_dev_max_mapping_size(struct device *dev)
888 {
889 	struct vduse_dev *vdev = dev_to_vduse(dev);
890 	struct vduse_iova_domain *domain = vdev->domain;
891 
892 	return domain->bounce_size;
893 }
894 
/* DMA mapping operations backed by the VDUSE IOVA domain (handlers above). */
static const struct dma_map_ops vduse_dev_dma_ops = {
	.sync_single_for_device = vduse_dev_sync_single_for_device,
	.sync_single_for_cpu = vduse_dev_sync_single_for_cpu,
	.map_page = vduse_dev_map_page,
	.unmap_page = vduse_dev_unmap_page,
	.alloc = vduse_dev_alloc_coherent,
	.free = vduse_dev_free_coherent,
	.max_mapping_size = vduse_dev_max_mapping_size,
};
904 
905 static unsigned int perm_to_file_flags(u8 perm)
906 {
907 	unsigned int flags = 0;
908 
909 	switch (perm) {
910 	case VDUSE_ACCESS_WO:
911 		flags |= O_WRONLY;
912 		break;
913 	case VDUSE_ACCESS_RO:
914 		flags |= O_RDONLY;
915 		break;
916 	case VDUSE_ACCESS_RW:
917 		flags |= O_RDWR;
918 		break;
919 	default:
920 		WARN(1, "invalidate vhost IOTLB permission\n");
921 		break;
922 	}
923 
924 	return flags;
925 }
926 
/*
 * Assign or remove the kick eventfd of a virtqueue.
 *
 * @eventfd->index selects the virtqueue.  A non-negative @eventfd->fd is
 * resolved to an eventfd context and installed; VDUSE_EVENTFD_DEASSIGN
 * removes the current one.  Any other negative fd is accepted and ignored.
 * A kick recorded while no eventfd was assigned is replayed on the new one
 * if the queue is ready.
 *
 * Returns 0 on success, -EINVAL for an out-of-range index, or the error
 * from eventfd_ctx_fdget().
 */
static int vduse_kickfd_setup(struct vduse_dev *dev,
			struct vduse_vq_eventfd *eventfd)
{
	struct eventfd_ctx *ctx = NULL;
	struct vduse_virtqueue *vq;
	u32 index;

	if (eventfd->index >= dev->vq_num)
		return -EINVAL;

	/* Clamp the index under speculation as well (Spectre v1 hardening). */
	index = array_index_nospec(eventfd->index, dev->vq_num);
	vq = dev->vqs[index];
	if (eventfd->fd >= 0) {
		ctx = eventfd_ctx_fdget(eventfd->fd);
		if (IS_ERR(ctx))
			return PTR_ERR(ctx);
	} else if (eventfd->fd != VDUSE_EVENTFD_DEASSIGN)
		return 0;

	spin_lock(&vq->kick_lock);
	/* Drop the reference on the eventfd being replaced. */
	if (vq->kickfd)
		eventfd_ctx_put(vq->kickfd);
	vq->kickfd = ctx;
	if (vq->ready && vq->kicked && vq->kickfd) {
		eventfd_signal(vq->kickfd);
		vq->kicked = false;
	}
	spin_unlock(&vq->kick_lock);

	return 0;
}
958 
959 static bool vduse_dev_is_ready(struct vduse_dev *dev)
960 {
961 	int i;
962 
963 	for (i = 0; i < dev->vq_num; i++)
964 		if (!dev->vqs[i]->num_max)
965 			return false;
966 
967 	return true;
968 }
969 
970 static void vduse_dev_irq_inject(struct work_struct *work)
971 {
972 	struct vduse_dev *dev = container_of(work, struct vduse_dev, inject);
973 
974 	spin_lock_bh(&dev->irq_lock);
975 	if (dev->config_cb.callback)
976 		dev->config_cb.callback(dev->config_cb.private);
977 	spin_unlock_bh(&dev->irq_lock);
978 }
979 
980 static void vduse_vq_irq_inject(struct work_struct *work)
981 {
982 	struct vduse_virtqueue *vq = container_of(work,
983 					struct vduse_virtqueue, inject);
984 
985 	spin_lock_bh(&vq->irq_lock);
986 	if (vq->ready && vq->cb.callback)
987 		vq->cb.callback(vq->cb.private);
988 	spin_unlock_bh(&vq->irq_lock);
989 }
990 
/*
 * Try to raise the virtqueue interrupt directly through the callback's
 * trigger eventfd.
 *
 * Returns true if the eventfd was signalled, false if there is no trigger
 * or the queue is not ready.
 */
static bool vduse_vq_signal_irqfd(struct vduse_virtqueue *vq)
{
	bool signal = false;

	/* Unlocked fast path; the trigger is rechecked under irq_lock. */
	if (!vq->cb.trigger)
		return false;

	spin_lock_irq(&vq->irq_lock);
	if (vq->ready && vq->cb.trigger) {
		eventfd_signal(vq->cb.trigger);
		signal = true;
	}
	spin_unlock_irq(&vq->irq_lock);

	return signal;
}
1007 
1008 static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
1009 				    struct work_struct *irq_work,
1010 				    int irq_effective_cpu)
1011 {
1012 	int ret = -EINVAL;
1013 
1014 	down_read(&dev->rwsem);
1015 	if (!(dev->status & VIRTIO_CONFIG_S_DRIVER_OK))
1016 		goto unlock;
1017 
1018 	ret = 0;
1019 	if (irq_effective_cpu == IRQ_UNBOUND)
1020 		queue_work(vduse_irq_wq, irq_work);
1021 	else
1022 		queue_work_on(irq_effective_cpu,
1023 			      vduse_irq_bound_wq, irq_work);
1024 unlock:
1025 	up_read(&dev->rwsem);
1026 
1027 	return ret;
1028 }
1029 
/*
 * Unregister the userspace memory previously registered as bounce pages.
 *
 * @iova and @size must match the registration (the registered IOVA and the
 * domain's bounce size).  On success the user pages are removed from the
 * domain, unpinned (marked dirty), the pinned_vm accounting of the owning
 * mm is reduced and the mm reference is dropped.
 *
 * Returns 0 on success, -ENOENT if nothing is registered, -EINVAL if there
 * is no domain or the arguments do not match.
 */
static int vduse_dev_dereg_umem(struct vduse_dev *dev,
				u64 iova, u64 size)
{
	int ret;

	mutex_lock(&dev->mem_lock);
	ret = -ENOENT;
	if (!dev->umem)
		goto unlock;

	ret = -EINVAL;
	if (!dev->domain)
		goto unlock;

	if (dev->umem->iova != iova || size != dev->domain->bounce_size)
		goto unlock;

	vduse_domain_remove_user_bounce_pages(dev->domain);
	unpin_user_pages_dirty_lock(dev->umem->pages,
				    dev->umem->npages, true);
	/* Undo the accounting and mm reference taken at registration. */
	atomic64_sub(dev->umem->npages, &dev->umem->mm->pinned_vm);
	mmdrop(dev->umem->mm);
	vfree(dev->umem->pages);
	kfree(dev->umem);
	dev->umem = NULL;
	ret = 0;
unlock:
	mutex_unlock(&dev->mem_lock);
	return ret;
}
1060 
/*
 * Register userspace memory at @uaddr as the bounce pages of the device.
 *
 * The region must be page aligned, start at IOVA 0 and be exactly as large
 * as the domain's bounce buffer.  The pages are pinned long-term, handed to
 * the IOVA domain and charged to the caller's mm->pinned_vm, subject to
 * RLIMIT_MEMLOCK.
 *
 * Returns 0 on success, -EINVAL for bad arguments or a missing bounce map,
 * -EEXIST if memory is already registered, -ENOMEM on allocation failure,
 * when the memlock limit would be exceeded or when not all pages could be
 * pinned, or the error from pin_user_pages() /
 * vduse_domain_add_user_bounce_pages().
 */
static int vduse_dev_reg_umem(struct vduse_dev *dev,
			      u64 iova, u64 uaddr, u64 size)
{
	struct page **page_list = NULL;
	struct vduse_umem *umem = NULL;
	long pinned = 0;
	unsigned long npages, lock_limit;
	int ret;

	if (!dev->domain || !dev->domain->bounce_map ||
	    size != dev->domain->bounce_size ||
	    iova != 0 || uaddr & ~PAGE_MASK)
		return -EINVAL;

	mutex_lock(&dev->mem_lock);
	ret = -EEXIST;
	if (dev->umem)
		goto unlock;

	ret = -ENOMEM;
	npages = size >> PAGE_SHIFT;
	page_list = __vmalloc(array_size(npages, sizeof(struct page *)),
			      GFP_KERNEL_ACCOUNT);
	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
	if (!page_list || !umem)
		goto unlock;

	mmap_read_lock(current->mm);

	/* ret is still -ENOMEM here if the memlock limit would be exceeded. */
	lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
	if (npages + atomic64_read(&current->mm->pinned_vm) > lock_limit)
		goto out;

	pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE,
				page_list);
	if (pinned != npages) {
		/* A partial pin is treated as an out-of-memory failure. */
		ret = pinned < 0 ? pinned : -ENOMEM;
		goto out;
	}

	ret = vduse_domain_add_user_bounce_pages(dev->domain,
						 page_list, pinned);
	if (ret)
		goto out;

	atomic64_add(npages, &current->mm->pinned_vm);

	umem->pages = page_list;
	umem->npages = pinned;
	umem->iova = iova;
	umem->mm = current->mm;
	/* Keep the mm_struct around for the accounting done at dereg time. */
	mmgrab(current->mm);

	dev->umem = umem;
out:
	/* On failure release whatever pages were pinned. */
	if (ret && pinned > 0)
		unpin_user_pages(page_list, pinned);

	mmap_read_unlock(current->mm);
unlock:
	if (ret) {
		vfree(page_list);
		kfree(umem);
	}
	mutex_unlock(&dev->mem_lock);
	return ret;
}
1128 
1129 static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)
1130 {
1131 	int curr_cpu = vq->irq_effective_cpu;
1132 
1133 	while (true) {
1134 		curr_cpu = cpumask_next(curr_cpu, &vq->irq_affinity);
1135 		if (cpu_online(curr_cpu))
1136 			break;
1137 
1138 		if (curr_cpu >= nr_cpu_ids)
1139 			curr_cpu = IRQ_UNBOUND;
1140 	}
1141 
1142 	vq->irq_effective_cpu = curr_cpu;
1143 }
1144 
/*
 * ioctl handler for an opened /dev/vduse/$DEVICE node.
 *
 * @file: device file; file->private_data holds the struct vduse_dev
 * @cmd:  VDUSE_* device command
 * @arg:  userspace pointer to the command's argument
 *
 * Returns the result of the command on success and a negative errno on
 * failure: -EPERM if the device is marked broken, -EFAULT when userspace
 * memory cannot be copied, -EINVAL for rejected arguments and -ENOIOCTLCMD
 * for unknown commands.
 *
 * Every case presets "ret" to the error of the next step before attempting
 * it, so a bare "break" always leaves with the matching error code.
 */
static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
			    unsigned long arg)
{
	struct vduse_dev *dev = file->private_data;
	void __user *argp = (void __user *)arg;
	int ret;

	if (unlikely(dev->broken))
		return -EPERM;

	switch (cmd) {
	/*
	 * Look up an IOTLB mapping for [entry.start, entry.last], report its
	 * actual range, offset and permission back to userspace and hand
	 * the backing file over through receive_fd().
	 */
	case VDUSE_IOTLB_GET_FD: {
		struct vduse_iotlb_entry entry;
		struct vhost_iotlb_map *map;
		struct vdpa_map_file *map_file;
		struct file *f = NULL;

		ret = -EFAULT;
		if (copy_from_user(&entry, argp, sizeof(entry)))
			break;

		ret = -EINVAL;
		if (entry.start > entry.last)
			break;

		mutex_lock(&dev->domain_lock);
		if (!dev->domain) {
			mutex_unlock(&dev->domain_lock);
			break;
		}
		spin_lock(&dev->domain->iotlb_lock);
		map = vhost_iotlb_itree_first(dev->domain->iotlb,
					      entry.start, entry.last);
		if (map) {
			map_file = (struct vdpa_map_file *)map->opaque;
			/* Hold our own file reference beyond the locks */
			f = get_file(map_file->file);
			entry.offset = map_file->offset;
			entry.start = map->start;
			entry.last = map->last;
			entry.perm = map->perm;
		}
		spin_unlock(&dev->domain->iotlb_lock);
		mutex_unlock(&dev->domain_lock);
		ret = -EINVAL;
		if (!f)
			break;

		ret = -EFAULT;
		if (copy_to_user(argp, &entry, sizeof(entry))) {
			fput(f);
			break;
		}
		/* The ioctl result is whatever receive_fd() returns */
		ret = receive_fd(f, NULL, perm_to_file_flags(entry.perm));
		/* Drop the reference taken by get_file() above */
		fput(f);
		break;
	}
	case VDUSE_DEV_GET_FEATURES:
		/*
		 * Just mirror what driver wrote here.
		 * The driver is expected to check FEATURE_OK later.
		 */
		ret = put_user(dev->driver_features, (u64 __user *)argp);
		break;
	/*
	 * Overwrite part of the device config space: a fixed header
	 * (everything before the "buffer" member) is followed by
	 * config.length bytes of payload.
	 */
	case VDUSE_DEV_SET_CONFIG: {
		struct vduse_config_data config;
		unsigned long size = offsetof(struct vduse_config_data,
					      buffer);

		ret = -EFAULT;
		if (copy_from_user(&config, argp, size))
			break;

		/*
		 * The offset is checked first, so the subtraction in the
		 * last condition cannot underflow.
		 */
		ret = -EINVAL;
		if (config.offset > dev->config_size ||
		    config.length == 0 ||
		    config.length > dev->config_size - config.offset)
			break;

		ret = -EFAULT;
		if (copy_from_user(dev->config + config.offset, argp + size,
				   config.length))
			break;

		ret = 0;
		break;
	}
	/* Queue the device-level (config) interrupt work, not bound to a CPU */
	case VDUSE_DEV_INJECT_CONFIG_IRQ:
		ret = vduse_dev_queue_irq_work(dev, &dev->inject, IRQ_UNBOUND);
		break;
	/* Set the maximum size (num_max) of one virtqueue */
	case VDUSE_VQ_SETUP: {
		struct vduse_vq_config config;
		u32 index;

		ret = -EFAULT;
		if (copy_from_user(&config, argp, sizeof(config)))
			break;

		ret = -EINVAL;
		if (config.index >= dev->vq_num)
			break;

		/* Reserved fields must be zero */
		if (!is_mem_zero((const char *)config.reserved,
				 sizeof(config.reserved)))
			break;

		/* Clamp the already-validated index against speculation */
		index = array_index_nospec(config.index, dev->vq_num);
		dev->vqs[index]->num_max = config.max_size;
		ret = 0;
		break;
	}
	/*
	 * Report the addresses, size, ring state and ready flag of one
	 * virtqueue.
	 */
	case VDUSE_VQ_GET_INFO: {
		struct vduse_vq_info vq_info;
		struct vduse_virtqueue *vq;
		u32 index;

		ret = -EFAULT;
		if (copy_from_user(&vq_info, argp, sizeof(vq_info)))
			break;

		ret = -EINVAL;
		if (vq_info.index >= dev->vq_num)
			break;

		index = array_index_nospec(vq_info.index, dev->vq_num);
		vq = dev->vqs[index];
		vq_info.desc_addr = vq->desc_addr;
		vq_info.driver_addr = vq->driver_addr;
		vq_info.device_addr = vq->device_addr;
		vq_info.num = vq->num;

		/* The state layout depends on the negotiated ring format */
		if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
			vq_info.packed.last_avail_counter =
				vq->state.packed.last_avail_counter;
			vq_info.packed.last_avail_idx =
				vq->state.packed.last_avail_idx;
			vq_info.packed.last_used_counter =
				vq->state.packed.last_used_counter;
			vq_info.packed.last_used_idx =
				vq->state.packed.last_used_idx;
		} else
			vq_info.split.avail_index =
				vq->state.split.avail_index;

		vq_info.ready = vq->ready;

		ret = -EFAULT;
		if (copy_to_user(argp, &vq_info, sizeof(vq_info)))
			break;

		ret = 0;
		break;
	}
	/* Forward a kick eventfd description to vduse_kickfd_setup() */
	case VDUSE_VQ_SETUP_KICKFD: {
		struct vduse_vq_eventfd eventfd;

		ret = -EFAULT;
		if (copy_from_user(&eventfd, argp, sizeof(eventfd)))
			break;

		ret = vduse_kickfd_setup(dev, &eventfd);
		break;
	}
	/*
	 * Inject an interrupt for one virtqueue: try the irqfd path first
	 * and, when vduse_vq_signal_irqfd() returns false, queue the
	 * inject work on the next effective CPU instead.
	 */
	case VDUSE_VQ_INJECT_IRQ: {
		u32 index;

		ret = -EFAULT;
		if (get_user(index, (u32 __user *)argp))
			break;

		ret = -EINVAL;
		if (index >= dev->vq_num)
			break;

		ret = 0;
		index = array_index_nospec(index, dev->vq_num);
		if (!vduse_vq_signal_irqfd(dev->vqs[index])) {
			vduse_vq_update_effective_cpu(dev->vqs[index]);
			ret = vduse_dev_queue_irq_work(dev,
						&dev->vqs[index]->inject,
						dev->vqs[index]->irq_effective_cpu);
		}
		break;
	}
	/* Register userspace memory through vduse_dev_reg_umem() */
	case VDUSE_IOTLB_REG_UMEM: {
		struct vduse_iova_umem umem;

		ret = -EFAULT;
		if (copy_from_user(&umem, argp, sizeof(umem)))
			break;

		ret = -EINVAL;
		if (!is_mem_zero((const char *)umem.reserved,
				 sizeof(umem.reserved)))
			break;

		mutex_lock(&dev->domain_lock);
		ret = vduse_dev_reg_umem(dev, umem.iova,
					 umem.uaddr, umem.size);
		mutex_unlock(&dev->domain_lock);
		break;
	}
	/* Undo a previous registration through vduse_dev_dereg_umem() */
	case VDUSE_IOTLB_DEREG_UMEM: {
		struct vduse_iova_umem umem;

		ret = -EFAULT;
		if (copy_from_user(&umem, argp, sizeof(umem)))
			break;

		ret = -EINVAL;
		if (!is_mem_zero((const char *)umem.reserved,
				 sizeof(umem.reserved)))
			break;
		mutex_lock(&dev->domain_lock);
		ret = vduse_dev_dereg_umem(dev, umem.iova,
					   umem.size);
		mutex_unlock(&dev->domain_lock);
		break;
	}
	/*
	 * Report the range and capabilities of an IOTLB mapping found for
	 * [info.start, info.last].
	 */
	case VDUSE_IOTLB_GET_INFO: {
		struct vduse_iova_info info;
		struct vhost_iotlb_map *map;

		ret = -EFAULT;
		if (copy_from_user(&info, argp, sizeof(info)))
			break;

		ret = -EINVAL;
		if (info.start > info.last)
			break;

		if (!is_mem_zero((const char *)info.reserved,
				 sizeof(info.reserved)))
			break;

		mutex_lock(&dev->domain_lock);
		if (!dev->domain) {
			mutex_unlock(&dev->domain_lock);
			break;
		}
		spin_lock(&dev->domain->iotlb_lock);
		map = vhost_iotlb_itree_first(dev->domain->iotlb,
					      info.start, info.last);
		if (map) {
			info.start = map->start;
			info.last = map->last;
			info.capability = 0;
			/*
			 * Only a mapping that spans exactly the bounce
			 * region [0, bounce_size - 1] is flagged as
			 * umem-capable.
			 */
			if (dev->domain->bounce_map && map->start == 0 &&
			    map->last == dev->domain->bounce_size - 1)
				info.capability |= VDUSE_IOVA_CAP_UMEM;
		}
		spin_unlock(&dev->domain->iotlb_lock);
		mutex_unlock(&dev->domain_lock);
		/* "map" is only tested for NULL here, never dereferenced */
		if (!map)
			break;

		ret = -EFAULT;
		if (copy_to_user(argp, &info, sizeof(info)))
			break;

		ret = 0;
		break;
	}
	default:
		ret = -ENOIOCTLCMD;
		break;
	}

	return ret;
}
1414 
1415 static int vduse_dev_release(struct inode *inode, struct file *file)
1416 {
1417 	struct vduse_dev *dev = file->private_data;
1418 
1419 	mutex_lock(&dev->domain_lock);
1420 	if (dev->domain)
1421 		vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size);
1422 	mutex_unlock(&dev->domain_lock);
1423 	spin_lock(&dev->msg_lock);
1424 	/* Make sure the inflight messages can processed after reconncection */
1425 	list_splice_init(&dev->recv_list, &dev->send_list);
1426 	spin_unlock(&dev->msg_lock);
1427 	dev->connected = false;
1428 
1429 	return 0;
1430 }
1431 
1432 static struct vduse_dev *vduse_dev_get_from_minor(int minor)
1433 {
1434 	struct vduse_dev *dev;
1435 
1436 	mutex_lock(&vduse_lock);
1437 	dev = idr_find(&vduse_idr, minor);
1438 	mutex_unlock(&vduse_lock);
1439 
1440 	return dev;
1441 }
1442 
1443 static int vduse_dev_open(struct inode *inode, struct file *file)
1444 {
1445 	int ret;
1446 	struct vduse_dev *dev = vduse_dev_get_from_minor(iminor(inode));
1447 
1448 	if (!dev)
1449 		return -ENODEV;
1450 
1451 	ret = -EBUSY;
1452 	mutex_lock(&dev->lock);
1453 	if (dev->connected)
1454 		goto unlock;
1455 
1456 	ret = 0;
1457 	dev->connected = true;
1458 	file->private_data = dev;
1459 unlock:
1460 	mutex_unlock(&dev->lock);
1461 
1462 	return ret;
1463 }
1464 
/*
 * File operations of the per-device character nodes (installed on
 * vduse_cdev by vduse_init()): messages are exchanged through
 * read_iter/write_iter/poll, device commands go through the ioctl handler.
 */
static const struct file_operations vduse_dev_fops = {
	.owner		= THIS_MODULE,
	.open		= vduse_dev_open,
	.release	= vduse_dev_release,
	.read_iter	= vduse_dev_read_iter,
	.write_iter	= vduse_dev_write_iter,
	.poll		= vduse_dev_poll,
	.unlocked_ioctl	= vduse_dev_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};
1476 
1477 static ssize_t irq_cb_affinity_show(struct vduse_virtqueue *vq, char *buf)
1478 {
1479 	return sprintf(buf, "%*pb\n", cpumask_pr_args(&vq->irq_affinity));
1480 }
1481 
1482 static ssize_t irq_cb_affinity_store(struct vduse_virtqueue *vq,
1483 				     const char *buf, size_t count)
1484 {
1485 	cpumask_var_t new_value;
1486 	int ret;
1487 
1488 	if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
1489 		return -ENOMEM;
1490 
1491 	ret = cpumask_parse(buf, new_value);
1492 	if (ret)
1493 		goto free_mask;
1494 
1495 	ret = -EINVAL;
1496 	if (!cpumask_intersects(new_value, cpu_online_mask))
1497 		goto free_mask;
1498 
1499 	cpumask_copy(&vq->irq_affinity, new_value);
1500 	ret = count;
1501 free_mask:
1502 	free_cpumask_var(new_value);
1503 	return ret;
1504 }
1505 
/*
 * A sysfs attribute of a virtqueue kobject together with its typed
 * accessors. vq_attr_show()/vq_attr_store() recover this structure from
 * the embedded @attr and dispatch to @show/@store; either callback may be
 * NULL, in which case the access fails with -EIO.
 */
struct vq_sysfs_entry {
	struct attribute attr;
	ssize_t (*show)(struct vduse_virtqueue *vq, char *buf);
	ssize_t (*store)(struct vduse_virtqueue *vq, const char *buf,
			 size_t count);
};
1512 
/* Read-write "irq_cb_affinity" attribute, backed by irq_cb_affinity_{show,store}() */
static struct vq_sysfs_entry irq_cb_affinity_attr = __ATTR_RW(irq_cb_affinity);

/* NULL-terminated list of the attributes exposed by every virtqueue kobject */
static struct attribute *vq_attrs[] = {
	&irq_cb_affinity_attr.attr,
	NULL,
};
/* Provides vq_groups, which vq_type installs as its default groups */
ATTRIBUTE_GROUPS(vq);
1520 
1521 static ssize_t vq_attr_show(struct kobject *kobj, struct attribute *attr,
1522 			    char *buf)
1523 {
1524 	struct vduse_virtqueue *vq = container_of(kobj,
1525 					struct vduse_virtqueue, kobj);
1526 	struct vq_sysfs_entry *entry = container_of(attr,
1527 					struct vq_sysfs_entry, attr);
1528 
1529 	if (!entry->show)
1530 		return -EIO;
1531 
1532 	return entry->show(vq, buf);
1533 }
1534 
1535 static ssize_t vq_attr_store(struct kobject *kobj, struct attribute *attr,
1536 			     const char *buf, size_t count)
1537 {
1538 	struct vduse_virtqueue *vq = container_of(kobj,
1539 					struct vduse_virtqueue, kobj);
1540 	struct vq_sysfs_entry *entry = container_of(attr,
1541 					struct vq_sysfs_entry, attr);
1542 
1543 	if (!entry->store)
1544 		return -EIO;
1545 
1546 	return entry->store(vq, buf, count);
1547 }
1548 
/* sysfs operations of virtqueue kobjects: route to the typed dispatchers */
static const struct sysfs_ops vq_sysfs_ops = {
	.show = vq_attr_show,
	.store = vq_attr_store,
};
1553 
1554 static void vq_release(struct kobject *kobj)
1555 {
1556 	struct vduse_virtqueue *vq = container_of(kobj,
1557 					struct vduse_virtqueue, kobj);
1558 	kfree(vq);
1559 }
1560 
/*
 * kobject type of the per-virtqueue kobjects: releasing one frees the
 * virtqueue, attribute access goes through vq_sysfs_ops and the
 * attributes in vq_groups are created by default.
 */
static const struct kobj_type vq_type = {
	.release	= vq_release,
	.sysfs_ops	= &vq_sysfs_ops,
	.default_groups	= vq_groups,
};
1566 
1567 static char *vduse_devnode(const struct device *dev, umode_t *mode)
1568 {
1569 	return kasprintf(GFP_KERNEL, "vduse/%s", dev_name(dev));
1570 }
1571 
/* Device class "vduse"; its devnode callback puts all nodes below vduse/ */
static const struct class vduse_class = {
	.name = "vduse",
	.devnode = vduse_devnode,
};
1576 
1577 static void vduse_dev_deinit_vqs(struct vduse_dev *dev)
1578 {
1579 	int i;
1580 
1581 	if (!dev->vqs)
1582 		return;
1583 
1584 	for (i = 0; i < dev->vq_num; i++)
1585 		kobject_put(&dev->vqs[i]->kobj);
1586 	kfree(dev->vqs);
1587 }
1588 
1589 static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
1590 {
1591 	int ret, i;
1592 
1593 	dev->vq_align = vq_align;
1594 	dev->vq_num = vq_num;
1595 	dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL);
1596 	if (!dev->vqs)
1597 		return -ENOMEM;
1598 
1599 	for (i = 0; i < vq_num; i++) {
1600 		dev->vqs[i] = kzalloc(sizeof(*dev->vqs[i]), GFP_KERNEL);
1601 		if (!dev->vqs[i]) {
1602 			ret = -ENOMEM;
1603 			goto err;
1604 		}
1605 
1606 		dev->vqs[i]->index = i;
1607 		dev->vqs[i]->irq_effective_cpu = IRQ_UNBOUND;
1608 		INIT_WORK(&dev->vqs[i]->inject, vduse_vq_irq_inject);
1609 		INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work);
1610 		spin_lock_init(&dev->vqs[i]->kick_lock);
1611 		spin_lock_init(&dev->vqs[i]->irq_lock);
1612 		cpumask_setall(&dev->vqs[i]->irq_affinity);
1613 
1614 		kobject_init(&dev->vqs[i]->kobj, &vq_type);
1615 		ret = kobject_add(&dev->vqs[i]->kobj,
1616 				  &dev->dev->kobj, "vq%d", i);
1617 		if (ret) {
1618 			kfree(dev->vqs[i]);
1619 			goto err;
1620 		}
1621 	}
1622 
1623 	return 0;
1624 err:
1625 	while (i--)
1626 		kobject_put(&dev->vqs[i]->kobj);
1627 	kfree(dev->vqs);
1628 	dev->vqs = NULL;
1629 	return ret;
1630 }
1631 
1632 static struct vduse_dev *vduse_dev_create(void)
1633 {
1634 	struct vduse_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);
1635 
1636 	if (!dev)
1637 		return NULL;
1638 
1639 	mutex_init(&dev->lock);
1640 	mutex_init(&dev->mem_lock);
1641 	mutex_init(&dev->domain_lock);
1642 	spin_lock_init(&dev->msg_lock);
1643 	INIT_LIST_HEAD(&dev->send_list);
1644 	INIT_LIST_HEAD(&dev->recv_list);
1645 	spin_lock_init(&dev->irq_lock);
1646 	init_rwsem(&dev->rwsem);
1647 
1648 	INIT_WORK(&dev->inject, vduse_dev_irq_inject);
1649 	init_waitqueue_head(&dev->waitq);
1650 
1651 	return dev;
1652 }
1653 
/*
 * Free the vduse_dev structure itself. Members such as the name, config
 * buffer and virtqueues are not touched here; the callers in this file
 * release them separately before calling this.
 */
static void vduse_dev_destroy(struct vduse_dev *dev)
{
	kfree(dev);
}
1658 
1659 static struct vduse_dev *vduse_find_dev(const char *name)
1660 {
1661 	struct vduse_dev *dev;
1662 	int id;
1663 
1664 	idr_for_each_entry(&vduse_idr, dev, id)
1665 		if (!strcmp(dev->name, name))
1666 			return dev;
1667 
1668 	return NULL;
1669 }
1670 
1671 static int vduse_destroy_dev(char *name)
1672 {
1673 	struct vduse_dev *dev = vduse_find_dev(name);
1674 
1675 	if (!dev)
1676 		return -EINVAL;
1677 
1678 	mutex_lock(&dev->lock);
1679 	if (dev->vdev || dev->connected) {
1680 		mutex_unlock(&dev->lock);
1681 		return -EBUSY;
1682 	}
1683 	dev->connected = true;
1684 	mutex_unlock(&dev->lock);
1685 
1686 	vduse_dev_reset(dev);
1687 	device_destroy(&vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
1688 	idr_remove(&vduse_idr, dev->minor);
1689 	kvfree(dev->config);
1690 	vduse_dev_deinit_vqs(dev);
1691 	if (dev->domain)
1692 		vduse_domain_destroy(dev->domain);
1693 	kfree(dev->name);
1694 	vduse_dev_destroy(dev);
1695 	module_put(THIS_MODULE);
1696 
1697 	return 0;
1698 }
1699 
1700 static bool device_is_allowed(u32 device_id)
1701 {
1702 	int i;
1703 
1704 	for (i = 0; i < ARRAY_SIZE(allowed_device_id); i++)
1705 		if (allowed_device_id[i] == device_id)
1706 			return true;
1707 
1708 	return false;
1709 }
1710 
1711 static bool features_is_valid(struct vduse_dev_config *config)
1712 {
1713 	if (!(config->features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
1714 		return false;
1715 
1716 	/* Now we only support read-only configuration space */
1717 	if ((config->device_id == VIRTIO_ID_BLOCK) &&
1718 			(config->features & BIT_ULL(VIRTIO_BLK_F_CONFIG_WCE)))
1719 		return false;
1720 	else if ((config->device_id == VIRTIO_ID_NET) &&
1721 			(config->features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
1722 		return false;
1723 
1724 	if ((config->device_id == VIRTIO_ID_NET) &&
1725 			!(config->features & BIT_ULL(VIRTIO_F_VERSION_1)))
1726 		return false;
1727 
1728 	return true;
1729 }
1730 
1731 static bool vduse_validate_config(struct vduse_dev_config *config)
1732 {
1733 	if (!is_mem_zero((const char *)config->reserved,
1734 			 sizeof(config->reserved)))
1735 		return false;
1736 
1737 	if (config->vq_align > PAGE_SIZE)
1738 		return false;
1739 
1740 	if (config->config_size > PAGE_SIZE)
1741 		return false;
1742 
1743 	if (config->vq_num > 0xffff)
1744 		return false;
1745 
1746 	if (!config->name[0])
1747 		return false;
1748 
1749 	if (!device_is_allowed(config->device_id))
1750 		return false;
1751 
1752 	if (!features_is_valid(config))
1753 		return false;
1754 
1755 	return true;
1756 }
1757 
1758 static ssize_t msg_timeout_show(struct device *device,
1759 				struct device_attribute *attr, char *buf)
1760 {
1761 	struct vduse_dev *dev = dev_get_drvdata(device);
1762 
1763 	return sysfs_emit(buf, "%u\n", dev->msg_timeout);
1764 }
1765 
1766 static ssize_t msg_timeout_store(struct device *device,
1767 				 struct device_attribute *attr,
1768 				 const char *buf, size_t count)
1769 {
1770 	struct vduse_dev *dev = dev_get_drvdata(device);
1771 	int ret;
1772 
1773 	ret = kstrtouint(buf, 10, &dev->msg_timeout);
1774 	if (ret < 0)
1775 		return ret;
1776 
1777 	return count;
1778 }
1779 
/* Read-write device attribute "msg_timeout" (msg_timeout_show/_store) */
static DEVICE_ATTR_RW(msg_timeout);
1781 
1782 static ssize_t bounce_size_show(struct device *device,
1783 				struct device_attribute *attr, char *buf)
1784 {
1785 	struct vduse_dev *dev = dev_get_drvdata(device);
1786 
1787 	return sysfs_emit(buf, "%u\n", dev->bounce_size);
1788 }
1789 
1790 static ssize_t bounce_size_store(struct device *device,
1791 				 struct device_attribute *attr,
1792 				 const char *buf, size_t count)
1793 {
1794 	struct vduse_dev *dev = dev_get_drvdata(device);
1795 	unsigned int bounce_size;
1796 	int ret;
1797 
1798 	ret = -EPERM;
1799 	mutex_lock(&dev->domain_lock);
1800 	if (dev->domain)
1801 		goto unlock;
1802 
1803 	ret = kstrtouint(buf, 10, &bounce_size);
1804 	if (ret < 0)
1805 		goto unlock;
1806 
1807 	ret = -EINVAL;
1808 	if (bounce_size > VDUSE_MAX_BOUNCE_SIZE ||
1809 	    bounce_size < VDUSE_MIN_BOUNCE_SIZE)
1810 		goto unlock;
1811 
1812 	dev->bounce_size = bounce_size & PAGE_MASK;
1813 	ret = count;
1814 unlock:
1815 	mutex_unlock(&dev->domain_lock);
1816 	return ret;
1817 }
1818 
/* Read-write device attribute "bounce_size" (bounce_size_show/_store) */
static DEVICE_ATTR_RW(bounce_size);

/* NULL-terminated list of the sysfs attributes of every VDUSE device */
static struct attribute *vduse_dev_attrs[] = {
	&dev_attr_msg_timeout.attr,
	&dev_attr_bounce_size.attr,
	NULL
};

/* Provides vduse_dev_groups, which vduse_create_dev() passes to device creation */
ATTRIBUTE_GROUPS(vduse_dev);
1828 
1829 static int vduse_create_dev(struct vduse_dev_config *config,
1830 			    void *config_buf, u64 api_version)
1831 {
1832 	int ret;
1833 	struct vduse_dev *dev;
1834 
1835 	ret = -EPERM;
1836 	if ((config->device_id == VIRTIO_ID_NET) && !capable(CAP_NET_ADMIN))
1837 		goto err;
1838 
1839 	ret = -EEXIST;
1840 	if (vduse_find_dev(config->name))
1841 		goto err;
1842 
1843 	ret = -ENOMEM;
1844 	dev = vduse_dev_create();
1845 	if (!dev)
1846 		goto err;
1847 
1848 	dev->api_version = api_version;
1849 	dev->device_features = config->features;
1850 	dev->device_id = config->device_id;
1851 	dev->vendor_id = config->vendor_id;
1852 	dev->name = kstrdup(config->name, GFP_KERNEL);
1853 	if (!dev->name)
1854 		goto err_str;
1855 
1856 	dev->bounce_size = VDUSE_BOUNCE_SIZE;
1857 	dev->config = config_buf;
1858 	dev->config_size = config->config_size;
1859 
1860 	ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL);
1861 	if (ret < 0)
1862 		goto err_idr;
1863 
1864 	dev->minor = ret;
1865 	dev->msg_timeout = VDUSE_MSG_DEFAULT_TIMEOUT;
1866 	dev->dev = device_create_with_groups(&vduse_class, NULL,
1867 				MKDEV(MAJOR(vduse_major), dev->minor),
1868 				dev, vduse_dev_groups, "%s", config->name);
1869 	if (IS_ERR(dev->dev)) {
1870 		ret = PTR_ERR(dev->dev);
1871 		goto err_dev;
1872 	}
1873 
1874 	ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num);
1875 	if (ret)
1876 		goto err_vqs;
1877 
1878 	__module_get(THIS_MODULE);
1879 
1880 	return 0;
1881 err_vqs:
1882 	device_destroy(&vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
1883 err_dev:
1884 	idr_remove(&vduse_idr, dev->minor);
1885 err_idr:
1886 	kfree(dev->name);
1887 err_str:
1888 	vduse_dev_destroy(dev);
1889 err:
1890 	return ret;
1891 }
1892 
1893 static long vduse_ioctl(struct file *file, unsigned int cmd,
1894 			unsigned long arg)
1895 {
1896 	int ret;
1897 	void __user *argp = (void __user *)arg;
1898 	struct vduse_control *control = file->private_data;
1899 
1900 	mutex_lock(&vduse_lock);
1901 	switch (cmd) {
1902 	case VDUSE_GET_API_VERSION:
1903 		ret = put_user(control->api_version, (u64 __user *)argp);
1904 		break;
1905 	case VDUSE_SET_API_VERSION: {
1906 		u64 api_version;
1907 
1908 		ret = -EFAULT;
1909 		if (get_user(api_version, (u64 __user *)argp))
1910 			break;
1911 
1912 		ret = -EINVAL;
1913 		if (api_version > VDUSE_API_VERSION)
1914 			break;
1915 
1916 		ret = 0;
1917 		control->api_version = api_version;
1918 		break;
1919 	}
1920 	case VDUSE_CREATE_DEV: {
1921 		struct vduse_dev_config config;
1922 		unsigned long size = offsetof(struct vduse_dev_config, config);
1923 		void *buf;
1924 
1925 		ret = -EFAULT;
1926 		if (copy_from_user(&config, argp, size))
1927 			break;
1928 
1929 		ret = -EINVAL;
1930 		if (vduse_validate_config(&config) == false)
1931 			break;
1932 
1933 		buf = vmemdup_user(argp + size, config.config_size);
1934 		if (IS_ERR(buf)) {
1935 			ret = PTR_ERR(buf);
1936 			break;
1937 		}
1938 		config.name[VDUSE_NAME_MAX - 1] = '\0';
1939 		ret = vduse_create_dev(&config, buf, control->api_version);
1940 		if (ret)
1941 			kvfree(buf);
1942 		break;
1943 	}
1944 	case VDUSE_DESTROY_DEV: {
1945 		char name[VDUSE_NAME_MAX];
1946 
1947 		ret = -EFAULT;
1948 		if (copy_from_user(name, argp, VDUSE_NAME_MAX))
1949 			break;
1950 
1951 		name[VDUSE_NAME_MAX - 1] = '\0';
1952 		ret = vduse_destroy_dev(name);
1953 		break;
1954 	}
1955 	default:
1956 		ret = -EINVAL;
1957 		break;
1958 	}
1959 	mutex_unlock(&vduse_lock);
1960 
1961 	return ret;
1962 }
1963 
1964 static int vduse_release(struct inode *inode, struct file *file)
1965 {
1966 	struct vduse_control *control = file->private_data;
1967 
1968 	kfree(control);
1969 	return 0;
1970 }
1971 
1972 static int vduse_open(struct inode *inode, struct file *file)
1973 {
1974 	struct vduse_control *control;
1975 
1976 	control = kmalloc(sizeof(struct vduse_control), GFP_KERNEL);
1977 	if (!control)
1978 		return -ENOMEM;
1979 
1980 	control->api_version = VDUSE_API_VERSION;
1981 	file->private_data = control;
1982 
1983 	return 0;
1984 }
1985 
/*
 * File operations of the control node (installed on vduse_ctrl_cdev by
 * vduse_init()); everything happens through ioctls.
 */
static const struct file_operations vduse_ctrl_fops = {
	.owner		= THIS_MODULE,
	.open		= vduse_open,
	.release	= vduse_release,
	.unlocked_ioctl	= vduse_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};
1994 
/*
 * The vDPA management device of VDUSE together with the struct device that
 * backs it. The structure is freed from the release callback of @dev.
 */
struct vduse_mgmt_dev {
	struct vdpa_mgmt_dev mgmt_dev;
	struct device dev;
};

/* The single management device, allocated by vduse_mgmtdev_init() */
static struct vduse_mgmt_dev *vduse_mgmt;
2001 
2002 static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name)
2003 {
2004 	struct vduse_vdpa *vdev;
2005 	int ret;
2006 
2007 	if (dev->vdev)
2008 		return -EEXIST;
2009 
2010 	vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev,
2011 				 &vduse_vdpa_config_ops, 1, 1, name, true);
2012 	if (IS_ERR(vdev))
2013 		return PTR_ERR(vdev);
2014 
2015 	dev->vdev = vdev;
2016 	vdev->dev = dev;
2017 	vdev->vdpa.dev.dma_mask = &vdev->vdpa.dev.coherent_dma_mask;
2018 	ret = dma_set_mask_and_coherent(&vdev->vdpa.dev, DMA_BIT_MASK(64));
2019 	if (ret) {
2020 		put_device(&vdev->vdpa.dev);
2021 		return ret;
2022 	}
2023 	set_dma_ops(&vdev->vdpa.dev, &vduse_dev_dma_ops);
2024 	vdev->vdpa.dma_dev = &vdev->vdpa.dev;
2025 	vdev->vdpa.mdev = &vduse_mgmt->mgmt_dev;
2026 
2027 	return 0;
2028 }
2029 
2030 static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
2031 			const struct vdpa_dev_set_config *config)
2032 {
2033 	struct vduse_dev *dev;
2034 	int ret;
2035 
2036 	mutex_lock(&vduse_lock);
2037 	dev = vduse_find_dev(name);
2038 	if (!dev || !vduse_dev_is_ready(dev)) {
2039 		mutex_unlock(&vduse_lock);
2040 		return -EINVAL;
2041 	}
2042 	ret = vduse_dev_init_vdpa(dev, name);
2043 	mutex_unlock(&vduse_lock);
2044 	if (ret)
2045 		return ret;
2046 
2047 	mutex_lock(&dev->domain_lock);
2048 	if (!dev->domain)
2049 		dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
2050 						  dev->bounce_size);
2051 	mutex_unlock(&dev->domain_lock);
2052 	if (!dev->domain) {
2053 		put_device(&dev->vdev->vdpa.dev);
2054 		return -ENOMEM;
2055 	}
2056 
2057 	ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num);
2058 	if (ret) {
2059 		put_device(&dev->vdev->vdpa.dev);
2060 		mutex_lock(&dev->domain_lock);
2061 		vduse_domain_destroy(dev->domain);
2062 		dev->domain = NULL;
2063 		mutex_unlock(&dev->domain_lock);
2064 		return ret;
2065 	}
2066 
2067 	return 0;
2068 }
2069 
/*
 * dev_del callback of the management device: unregisters the vDPA device
 * @dev. @mdev is not used.
 */
static void vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
{
	_vdpa_unregister_device(dev);
}
2074 
/* Operations of the VDUSE management device: add and delete vDPA devices */
static const struct vdpa_mgmtdev_ops vdpa_dev_mgmtdev_ops = {
	.dev_add = vdpa_dev_add,
	.dev_del = vdpa_dev_del,
};
2079 
/*
 * Device ID table installed on the management device: block and net
 * devices, each paired with VIRTIO_DEV_ANY_ID, terminated by a zero entry.
 */
static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};
2085 
2086 static void vduse_mgmtdev_release(struct device *dev)
2087 {
2088 	struct vduse_mgmt_dev *mgmt_dev;
2089 
2090 	mgmt_dev = container_of(dev, struct vduse_mgmt_dev, dev);
2091 	kfree(mgmt_dev);
2092 }
2093 
2094 static int vduse_mgmtdev_init(void)
2095 {
2096 	int ret;
2097 
2098 	vduse_mgmt = kzalloc(sizeof(*vduse_mgmt), GFP_KERNEL);
2099 	if (!vduse_mgmt)
2100 		return -ENOMEM;
2101 
2102 	ret = dev_set_name(&vduse_mgmt->dev, "vduse");
2103 	if (ret) {
2104 		kfree(vduse_mgmt);
2105 		return ret;
2106 	}
2107 
2108 	vduse_mgmt->dev.release = vduse_mgmtdev_release;
2109 
2110 	ret = device_register(&vduse_mgmt->dev);
2111 	if (ret)
2112 		goto dev_reg_err;
2113 
2114 	vduse_mgmt->mgmt_dev.id_table = id_table;
2115 	vduse_mgmt->mgmt_dev.ops = &vdpa_dev_mgmtdev_ops;
2116 	vduse_mgmt->mgmt_dev.device = &vduse_mgmt->dev;
2117 	ret = vdpa_mgmtdev_register(&vduse_mgmt->mgmt_dev);
2118 	if (ret)
2119 		device_unregister(&vduse_mgmt->dev);
2120 
2121 	return ret;
2122 
2123 dev_reg_err:
2124 	put_device(&vduse_mgmt->dev);
2125 	return ret;
2126 }
2127 
/*
 * Tear down the management device in reverse order of
 * vduse_mgmtdev_init(): first the vDPA management device, then the
 * struct device backing it.
 */
static void vduse_mgmtdev_exit(void)
{
	vdpa_mgmtdev_unregister(&vduse_mgmt->mgmt_dev);
	device_unregister(&vduse_mgmt->dev);
}
2133 
2134 static int vduse_init(void)
2135 {
2136 	int ret;
2137 	struct device *dev;
2138 
2139 	ret = class_register(&vduse_class);
2140 	if (ret)
2141 		return ret;
2142 
2143 	ret = alloc_chrdev_region(&vduse_major, 0, VDUSE_DEV_MAX, "vduse");
2144 	if (ret)
2145 		goto err_chardev_region;
2146 
2147 	/* /dev/vduse/control */
2148 	cdev_init(&vduse_ctrl_cdev, &vduse_ctrl_fops);
2149 	vduse_ctrl_cdev.owner = THIS_MODULE;
2150 	ret = cdev_add(&vduse_ctrl_cdev, vduse_major, 1);
2151 	if (ret)
2152 		goto err_ctrl_cdev;
2153 
2154 	dev = device_create(&vduse_class, NULL, vduse_major, NULL, "control");
2155 	if (IS_ERR(dev)) {
2156 		ret = PTR_ERR(dev);
2157 		goto err_device;
2158 	}
2159 
2160 	/* /dev/vduse/$DEVICE */
2161 	cdev_init(&vduse_cdev, &vduse_dev_fops);
2162 	vduse_cdev.owner = THIS_MODULE;
2163 	ret = cdev_add(&vduse_cdev, MKDEV(MAJOR(vduse_major), 1),
2164 		       VDUSE_DEV_MAX - 1);
2165 	if (ret)
2166 		goto err_cdev;
2167 
2168 	ret = -ENOMEM;
2169 	vduse_irq_wq = alloc_workqueue("vduse-irq",
2170 				WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0);
2171 	if (!vduse_irq_wq)
2172 		goto err_wq;
2173 
2174 	vduse_irq_bound_wq = alloc_workqueue("vduse-irq-bound", WQ_HIGHPRI, 0);
2175 	if (!vduse_irq_bound_wq)
2176 		goto err_bound_wq;
2177 
2178 	ret = vduse_domain_init();
2179 	if (ret)
2180 		goto err_domain;
2181 
2182 	ret = vduse_mgmtdev_init();
2183 	if (ret)
2184 		goto err_mgmtdev;
2185 
2186 	return 0;
2187 err_mgmtdev:
2188 	vduse_domain_exit();
2189 err_domain:
2190 	destroy_workqueue(vduse_irq_bound_wq);
2191 err_bound_wq:
2192 	destroy_workqueue(vduse_irq_wq);
2193 err_wq:
2194 	cdev_del(&vduse_cdev);
2195 err_cdev:
2196 	device_destroy(&vduse_class, vduse_major);
2197 err_device:
2198 	cdev_del(&vduse_ctrl_cdev);
2199 err_ctrl_cdev:
2200 	unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
2201 err_chardev_region:
2202 	class_unregister(&vduse_class);
2203 	return ret;
2204 }
2205 module_init(vduse_init);
2206 
/*
 * Module teardown: undoes vduse_init() step by step in reverse order -
 * management device, IOVA domain code, both interrupt workqueues, the
 * per-device cdev, the control node and its cdev, the character device
 * region and finally the class.
 */
static void vduse_exit(void)
{
	vduse_mgmtdev_exit();
	vduse_domain_exit();
	destroy_workqueue(vduse_irq_bound_wq);
	destroy_workqueue(vduse_irq_wq);
	cdev_del(&vduse_cdev);
	device_destroy(&vduse_class, vduse_major);
	cdev_del(&vduse_ctrl_cdev);
	unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
	class_unregister(&vduse_class);
}
module_exit(vduse_exit);
2220 
/* Module metadata; the strings are the DRV_* macros from the top of the file */
MODULE_LICENSE(DRV_LICENSE);
MODULE_AUTHOR(DRV_AUTHOR);
MODULE_DESCRIPTION(DRV_DESC);
2224