xref: /linux/drivers/vdpa/vdpa_user/vduse_dev.c (revision e3966940559d52aa1800a008dcfeec218dd31f88)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VDUSE: vDPA Device in Userspace
4  *
5  * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved.
6  *
7  * Author: Xie Yongji <xieyongji@bytedance.com>
8  *
9  */
10 
11 #include <linux/virtio_net.h>
12 #include <linux/init.h>
13 #include <linux/module.h>
14 #include <linux/cdev.h>
15 #include <linux/device.h>
16 #include <linux/eventfd.h>
17 #include <linux/slab.h>
18 #include <linux/wait.h>
19 #include <linux/dma-map-ops.h>
20 #include <linux/poll.h>
21 #include <linux/file.h>
22 #include <linux/uio.h>
23 #include <linux/vdpa.h>
24 #include <linux/nospec.h>
25 #include <linux/vmalloc.h>
26 #include <linux/sched/mm.h>
27 #include <uapi/linux/vduse.h>
28 #include <uapi/linux/vdpa.h>
29 #include <uapi/linux/virtio_config.h>
30 #include <uapi/linux/virtio_ids.h>
31 #include <uapi/linux/virtio_blk.h>
32 #include <uapi/linux/virtio_ring.h>
33 #include <linux/mod_devicetable.h>
34 
35 #include "iova_domain.h"
36 
37 #define DRV_AUTHOR   "Yongji Xie <xieyongji@bytedance.com>"
38 #define DRV_DESC     "vDPA Device in Userspace"
39 #define DRV_LICENSE  "GPL v2"
40 
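/*
 * Device minors, bounce-buffer sizing and the IOVA layout used by the
 * per-device IOVA domain:
 *
 *   - VDUSE_DEV_MAX bounds the number of VDUSE character devices.
 *   - The bounce buffer defaults to VDUSE_BOUNCE_SIZE (64 MB) and can be
 *     tuned between VDUSE_MIN_BOUNCE_SIZE and VDUSE_MAX_BOUNCE_SIZE via
 *     the per-device bounce_size sysfs attribute, but only before the
 *     IOVA domain has been created.
 *   - VDUSE_IOVA_SIZE is the total IOVA space handed to the domain: the
 *     maximum bounce area plus a region reserved for the virtqueues.
 */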
41 #define VDUSE_DEV_MAX (1U << MINORBITS)
42 #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
43 #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
44 #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
45 /* 128 MB reserved for virtqueue creation */
46 #define VDUSE_IOVA_SIZE (VDUSE_MAX_BOUNCE_SIZE + 128 * 1024 * 1024)
47 #define VDUSE_MSG_DEFAULT_TIMEOUT 30
48 
49 #define IRQ_UNBOUND -1
50 
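/*
 * Per-virtqueue state shared between the vDPA side (the in-kernel virtio
 * driver) and the userspace device: the ring addresses and size programmed
 * by the driver, the kick eventfd signalled towards userspace, the
 * interrupt callback/trigger injected on behalf of userspace, and the
 * affinity mask used to pick the CPU running the bound injection work.
 */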
51 struct vduse_virtqueue {
52 	u16 index;
53 	u16 num_max;
54 	u32 num;
55 	u64 desc_addr;
56 	u64 driver_addr;
57 	u64 device_addr;
58 	struct vdpa_vq_state state;
59 	bool ready;
60 	bool kicked;
61 	spinlock_t kick_lock;
62 	spinlock_t irq_lock;
63 	struct eventfd_ctx *kickfd;
64 	struct vdpa_callback cb;
65 	struct work_struct inject;
66 	struct work_struct kick;
67 	int irq_effective_cpu;
68 	struct cpumask irq_affinity;
69 	struct kobject kobj;
70 };
71 
72 struct vduse_dev;
73 
74 struct vduse_vdpa {
75 	struct vdpa_device vdpa;
76 	struct vduse_dev *dev;
77 };
78 
79 struct vduse_umem {
80 	unsigned long iova;
81 	unsigned long npages;
82 	struct page **pages;
83 	struct mm_struct *mm;
84 };
85 
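/*
 * One VDUSE device instance: the vDPA device exposed to the kernel, the
 * IOVA domain backing its mappings, the request/response message channel
 * to the userspace daemon (send_list/recv_list protected by msg_lock) and
 * the negotiated virtio state (features, status, config space).
 */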
86 struct vduse_dev {
87 	struct vduse_vdpa *vdev;
88 	struct device *dev;
89 	struct vduse_virtqueue **vqs;
90 	struct vduse_iova_domain *domain;
91 	char *name;
92 	struct mutex lock;
93 	spinlock_t msg_lock;
94 	u64 msg_unique;
95 	u32 msg_timeout;
96 	wait_queue_head_t waitq;
97 	struct list_head send_list;
98 	struct list_head recv_list;
99 	struct vdpa_callback config_cb;
100 	struct work_struct inject;
101 	spinlock_t irq_lock;
102 	struct rw_semaphore rwsem;
103 	int minor;
104 	bool broken;
105 	bool connected;
106 	u64 api_version;
107 	u64 device_features;
108 	u64 driver_features;
109 	u32 device_id;
110 	u32 vendor_id;
111 	u32 generation;
112 	u32 config_size;
113 	void *config;
114 	u8 status;
115 	u32 vq_num;
116 	u32 vq_align;
117 	struct vduse_umem *umem;
118 	struct mutex mem_lock;
119 	unsigned int bounce_size;
120 	struct mutex domain_lock;
121 };
122 
123 struct vduse_dev_msg {
124 	struct vduse_dev_request req;
125 	struct vduse_dev_response resp;
126 	struct list_head list;
127 	wait_queue_head_t waitq;
128 	bool completed;
129 };
130 
131 struct vduse_control {
132 	u64 api_version;
133 };
134 
135 static DEFINE_MUTEX(vduse_lock);
136 static DEFINE_IDR(vduse_idr);
137 
138 static dev_t vduse_major;
139 static struct cdev vduse_ctrl_cdev;
140 static struct cdev vduse_cdev;
141 static struct workqueue_struct *vduse_irq_wq;
142 static struct workqueue_struct *vduse_irq_bound_wq;
143 
144 static u32 allowed_device_id[] = {
145 	VIRTIO_ID_BLOCK,
146 	VIRTIO_ID_NET,
147 	VIRTIO_ID_FS,
148 };
149 
150 static inline struct vduse_dev *vdpa_to_vduse(struct vdpa_device *vdpa)
151 {
152 	struct vduse_vdpa *vdev = container_of(vdpa, struct vduse_vdpa, vdpa);
153 
154 	return vdev->dev;
155 }
156 
157 static inline struct vduse_dev *dev_to_vduse(struct device *dev)
158 {
159 	struct vdpa_device *vdpa = dev_to_vdpa(dev);
160 
161 	return vdpa_to_vduse(vdpa);
162 }
163 
164 static struct vduse_dev_msg *vduse_find_msg(struct list_head *head,
165 					    uint32_t request_id)
166 {
167 	struct vduse_dev_msg *msg;
168 
169 	list_for_each_entry(msg, head, list) {
170 		if (msg->req.request_id == request_id) {
171 			list_del(&msg->list);
172 			return msg;
173 		}
174 	}
175 
176 	return NULL;
177 }
178 
179 static struct vduse_dev_msg *vduse_dequeue_msg(struct list_head *head)
180 {
181 	struct vduse_dev_msg *msg = NULL;
182 
183 	if (!list_empty(head)) {
184 		msg = list_first_entry(head, struct vduse_dev_msg, list);
185 		list_del(&msg->list);
186 	}
187 
188 	return msg;
189 }
190 
191 static void vduse_enqueue_msg(struct list_head *head,
192 			      struct vduse_dev_msg *msg)
193 {
194 	list_add_tail(&msg->list, head);
195 }
196 
197 static void vduse_dev_broken(struct vduse_dev *dev)
198 {
199 	struct vduse_dev_msg *msg, *tmp;
200 
201 	if (unlikely(dev->broken))
202 		return;
203 
204 	list_splice_init(&dev->recv_list, &dev->send_list);
205 	list_for_each_entry_safe(msg, tmp, &dev->send_list, list) {
206 		list_del(&msg->list);
207 		msg->completed = 1;
208 		msg->resp.result = VDUSE_REQ_RESULT_FAILED;
209 		wake_up(&msg->waitq);
210 	}
211 	dev->broken = true;
212 	wake_up(&dev->waitq);
213 }
214 
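/*
 * Send a request to the userspace daemon and wait for its response.
 * The message is queued on send_list (picked up via read() on the device
 * fd) and completed from vduse_dev_write_iter() once the matching
 * response arrives.  A timeout, if configured, marks the device as broken
 * so that subsequent requests fail fast instead of blocking vDPA callers.
 */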
215 static int vduse_dev_msg_sync(struct vduse_dev *dev,
216 			      struct vduse_dev_msg *msg)
217 {
218 	int ret;
219 
220 	if (unlikely(dev->broken))
221 		return -EIO;
222 
223 	init_waitqueue_head(&msg->waitq);
224 	spin_lock(&dev->msg_lock);
225 	if (unlikely(dev->broken)) {
226 		spin_unlock(&dev->msg_lock);
227 		return -EIO;
228 	}
229 	msg->req.request_id = dev->msg_unique++;
230 	vduse_enqueue_msg(&dev->send_list, msg);
231 	wake_up(&dev->waitq);
232 	spin_unlock(&dev->msg_lock);
233 	if (dev->msg_timeout)
234 		ret = wait_event_killable_timeout(msg->waitq, msg->completed,
235 						  (long)dev->msg_timeout * HZ);
236 	else
237 		ret = wait_event_killable(msg->waitq, msg->completed);
238 
239 	spin_lock(&dev->msg_lock);
240 	if (!msg->completed) {
241 		list_del(&msg->list);
242 		msg->resp.result = VDUSE_REQ_RESULT_FAILED;
243 		/* Mark the device as malfunctioning when a timeout occurs */
244 		if (!ret)
245 			vduse_dev_broken(dev);
246 	}
247 	ret = (msg->resp.result == VDUSE_REQ_RESULT_OK) ? 0 : -EIO;
248 	spin_unlock(&dev->msg_lock);
249 
250 	return ret;
251 }
252 
253 static int vduse_dev_get_vq_state_packed(struct vduse_dev *dev,
254 					 struct vduse_virtqueue *vq,
255 					 struct vdpa_vq_state_packed *packed)
256 {
257 	struct vduse_dev_msg msg = { 0 };
258 	int ret;
259 
260 	msg.req.type = VDUSE_GET_VQ_STATE;
261 	msg.req.vq_state.index = vq->index;
262 
263 	ret = vduse_dev_msg_sync(dev, &msg);
264 	if (ret)
265 		return ret;
266 
267 	packed->last_avail_counter =
268 			msg.resp.vq_state.packed.last_avail_counter & 0x0001;
269 	packed->last_avail_idx =
270 			msg.resp.vq_state.packed.last_avail_idx & 0x7FFF;
271 	packed->last_used_counter =
272 			msg.resp.vq_state.packed.last_used_counter & 0x0001;
273 	packed->last_used_idx =
274 			msg.resp.vq_state.packed.last_used_idx & 0x7FFF;
275 
276 	return 0;
277 }
278 
279 static int vduse_dev_get_vq_state_split(struct vduse_dev *dev,
280 					struct vduse_virtqueue *vq,
281 					struct vdpa_vq_state_split *split)
282 {
283 	struct vduse_dev_msg msg = { 0 };
284 	int ret;
285 
286 	msg.req.type = VDUSE_GET_VQ_STATE;
287 	msg.req.vq_state.index = vq->index;
288 
289 	ret = vduse_dev_msg_sync(dev, &msg);
290 	if (ret)
291 		return ret;
292 
293 	split->avail_index = msg.resp.vq_state.split.avail_index;
294 
295 	return 0;
296 }
297 
298 static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
299 {
300 	struct vduse_dev_msg msg = { 0 };
301 
302 	msg.req.type = VDUSE_SET_STATUS;
303 	msg.req.s.status = status;
304 
305 	return vduse_dev_msg_sync(dev, &msg);
306 }
307 
308 static int vduse_dev_update_iotlb(struct vduse_dev *dev,
309 				  u64 start, u64 last)
310 {
311 	struct vduse_dev_msg msg = { 0 };
312 
313 	if (last < start)
314 		return -EINVAL;
315 
316 	msg.req.type = VDUSE_UPDATE_IOTLB;
317 	msg.req.iova.start = start;
318 	msg.req.iova.last = last;
319 
320 	return vduse_dev_msg_sync(dev, &msg);
321 }
322 
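/*
 * read() backend for /dev/vduse/$NAME: hand the oldest pending request to
 * the userspace daemon.  The message is moved to recv_list so that the
 * response written back via vduse_dev_write_iter() can be matched by its
 * request_id.
 */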
323 static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
324 {
325 	struct file *file = iocb->ki_filp;
326 	struct vduse_dev *dev = file->private_data;
327 	struct vduse_dev_msg *msg;
328 	int size = sizeof(struct vduse_dev_request);
329 	ssize_t ret;
330 
331 	if (iov_iter_count(to) < size)
332 		return -EINVAL;
333 
334 	spin_lock(&dev->msg_lock);
335 	while (1) {
336 		msg = vduse_dequeue_msg(&dev->send_list);
337 		if (msg)
338 			break;
339 
340 		ret = -EAGAIN;
341 		if (file->f_flags & O_NONBLOCK)
342 			goto unlock;
343 
344 		spin_unlock(&dev->msg_lock);
345 		ret = wait_event_interruptible_exclusive(dev->waitq,
346 					!list_empty(&dev->send_list));
347 		if (ret)
348 			return ret;
349 
350 		spin_lock(&dev->msg_lock);
351 	}
352 	spin_unlock(&dev->msg_lock);
353 	ret = copy_to_iter(&msg->req, size, to);
354 	spin_lock(&dev->msg_lock);
355 	if (ret != size) {
356 		ret = -EFAULT;
357 		vduse_enqueue_msg(&dev->send_list, msg);
358 		goto unlock;
359 	}
360 	vduse_enqueue_msg(&dev->recv_list, msg);
361 unlock:
362 	spin_unlock(&dev->msg_lock);
363 
364 	return ret;
365 }
366 
367 static bool is_mem_zero(const char *ptr, int size)
368 {
369 	int i;
370 
371 	for (i = 0; i < size; i++) {
372 		if (ptr[i])
373 			return false;
374 	}
375 	return true;
376 }
377 
378 static ssize_t vduse_dev_write_iter(struct kiocb *iocb, struct iov_iter *from)
379 {
380 	struct file *file = iocb->ki_filp;
381 	struct vduse_dev *dev = file->private_data;
382 	struct vduse_dev_response resp;
383 	struct vduse_dev_msg *msg;
384 	size_t ret;
385 
386 	ret = copy_from_iter(&resp, sizeof(resp), from);
387 	if (ret != sizeof(resp))
388 		return -EINVAL;
389 
390 	if (!is_mem_zero((const char *)resp.reserved, sizeof(resp.reserved)))
391 		return -EINVAL;
392 
393 	spin_lock(&dev->msg_lock);
394 	msg = vduse_find_msg(&dev->recv_list, resp.request_id);
395 	if (!msg) {
396 		ret = -ENOENT;
397 		goto unlock;
398 	}
399 
400 	memcpy(&msg->resp, &resp, sizeof(resp));
401 	msg->completed = 1;
402 	wake_up(&msg->waitq);
403 unlock:
404 	spin_unlock(&dev->msg_lock);
405 
406 	return ret;
407 }
408 
409 static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
410 {
411 	struct vduse_dev *dev = file->private_data;
412 	__poll_t mask = 0;
413 
414 	poll_wait(file, &dev->waitq, wait);
415 
416 	spin_lock(&dev->msg_lock);
417 
418 	if (unlikely(dev->broken))
419 		mask |= EPOLLERR;
420 	if (!list_empty(&dev->send_list))
421 		mask |= EPOLLIN | EPOLLRDNORM;
422 	if (!list_empty(&dev->recv_list))
423 		mask |= EPOLLOUT | EPOLLWRNORM;
424 
425 	spin_unlock(&dev->msg_lock);
426 
427 	return mask;
428 }
429 
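/*
 * Bring the device back to its initial state: clear the status and the
 * negotiated features, drop configuration/virtqueue callbacks and kick
 * eventfds, and flush any pending injection work.  Called on vDPA reset
 * and when a device is destroyed.
 */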
430 static void vduse_dev_reset(struct vduse_dev *dev)
431 {
432 	int i;
433 	struct vduse_iova_domain *domain = dev->domain;
434 
435 	/* The coherent mappings are handled in vduse_dev_free_coherent() */
436 	if (domain && domain->bounce_map)
437 		vduse_domain_reset_bounce_map(domain);
438 
439 	down_write(&dev->rwsem);
440 
441 	dev->status = 0;
442 	dev->driver_features = 0;
443 	dev->generation++;
444 	spin_lock(&dev->irq_lock);
445 	dev->config_cb.callback = NULL;
446 	dev->config_cb.private = NULL;
447 	spin_unlock(&dev->irq_lock);
448 	flush_work(&dev->inject);
449 
450 	for (i = 0; i < dev->vq_num; i++) {
451 		struct vduse_virtqueue *vq = dev->vqs[i];
452 
453 		vq->ready = false;
454 		vq->desc_addr = 0;
455 		vq->driver_addr = 0;
456 		vq->device_addr = 0;
457 		vq->num = 0;
458 		memset(&vq->state, 0, sizeof(vq->state));
459 
460 		spin_lock(&vq->kick_lock);
461 		vq->kicked = false;
462 		if (vq->kickfd)
463 			eventfd_ctx_put(vq->kickfd);
464 		vq->kickfd = NULL;
465 		spin_unlock(&vq->kick_lock);
466 
467 		spin_lock(&vq->irq_lock);
468 		vq->cb.callback = NULL;
469 		vq->cb.private = NULL;
470 		vq->cb.trigger = NULL;
471 		spin_unlock(&vq->irq_lock);
472 		flush_work(&vq->inject);
473 		flush_work(&vq->kick);
474 	}
475 
476 	up_write(&dev->rwsem);
477 }
478 
479 static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
480 				u64 desc_area, u64 driver_area,
481 				u64 device_area)
482 {
483 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
484 	struct vduse_virtqueue *vq = dev->vqs[idx];
485 
486 	vq->desc_addr = desc_area;
487 	vq->driver_addr = driver_area;
488 	vq->device_addr = device_area;
489 
490 	return 0;
491 }
492 
493 static void vduse_vq_kick(struct vduse_virtqueue *vq)
494 {
495 	spin_lock(&vq->kick_lock);
496 	if (!vq->ready)
497 		goto unlock;
498 
499 	if (vq->kickfd)
500 		eventfd_signal(vq->kickfd);
501 	else
502 		vq->kicked = true;
503 unlock:
504 	spin_unlock(&vq->kick_lock);
505 }
506 
507 static void vduse_vq_kick_work(struct work_struct *work)
508 {
509 	struct vduse_virtqueue *vq = container_of(work,
510 					struct vduse_virtqueue, kick);
511 
512 	vduse_vq_kick(vq);
513 }
514 
515 static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx)
516 {
517 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
518 	struct vduse_virtqueue *vq = dev->vqs[idx];
519 
520 	if (!eventfd_signal_allowed()) {
521 		schedule_work(&vq->kick);
522 		return;
523 	}
524 	vduse_vq_kick(vq);
525 }
526 
527 static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
528 			      struct vdpa_callback *cb)
529 {
530 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
531 	struct vduse_virtqueue *vq = dev->vqs[idx];
532 
533 	spin_lock(&vq->irq_lock);
534 	vq->cb.callback = cb->callback;
535 	vq->cb.private = cb->private;
536 	vq->cb.trigger = cb->trigger;
537 	spin_unlock(&vq->irq_lock);
538 }
539 
540 static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num)
541 {
542 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
543 	struct vduse_virtqueue *vq = dev->vqs[idx];
544 
545 	vq->num = num;
546 }
547 
548 static u16 vduse_vdpa_get_vq_size(struct vdpa_device *vdpa, u16 idx)
549 {
550 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
551 	struct vduse_virtqueue *vq = dev->vqs[idx];
552 
553 	if (vq->num)
554 		return vq->num;
555 	else
556 		return vq->num_max;
557 }
558 
559 static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa,
560 					u16 idx, bool ready)
561 {
562 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
563 	struct vduse_virtqueue *vq = dev->vqs[idx];
564 
565 	vq->ready = ready;
566 }
567 
568 static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx)
569 {
570 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
571 	struct vduse_virtqueue *vq = dev->vqs[idx];
572 
573 	return vq->ready;
574 }
575 
576 static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx,
577 				const struct vdpa_vq_state *state)
578 {
579 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
580 	struct vduse_virtqueue *vq = dev->vqs[idx];
581 
582 	if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
583 		vq->state.packed.last_avail_counter =
584 				state->packed.last_avail_counter;
585 		vq->state.packed.last_avail_idx = state->packed.last_avail_idx;
586 		vq->state.packed.last_used_counter =
587 				state->packed.last_used_counter;
588 		vq->state.packed.last_used_idx = state->packed.last_used_idx;
589 	} else
590 		vq->state.split.avail_index = state->split.avail_index;
591 
592 	return 0;
593 }
594 
595 static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
596 				struct vdpa_vq_state *state)
597 {
598 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
599 	struct vduse_virtqueue *vq = dev->vqs[idx];
600 
601 	if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED))
602 		return vduse_dev_get_vq_state_packed(dev, vq, &state->packed);
603 
604 	return vduse_dev_get_vq_state_split(dev, vq, &state->split);
605 }
606 
607 static u32 vduse_vdpa_get_vq_align(struct vdpa_device *vdpa)
608 {
609 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
610 
611 	return dev->vq_align;
612 }
613 
614 static u64 vduse_vdpa_get_device_features(struct vdpa_device *vdpa)
615 {
616 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
617 
618 	return dev->device_features;
619 }
620 
621 static int vduse_vdpa_set_driver_features(struct vdpa_device *vdpa, u64 features)
622 {
623 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
624 
625 	dev->driver_features = features;
626 	return 0;
627 }
628 
629 static u64 vduse_vdpa_get_driver_features(struct vdpa_device *vdpa)
630 {
631 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
632 
633 	return dev->driver_features;
634 }
635 
636 static void vduse_vdpa_set_config_cb(struct vdpa_device *vdpa,
637 				  struct vdpa_callback *cb)
638 {
639 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
640 
641 	spin_lock(&dev->irq_lock);
642 	dev->config_cb.callback = cb->callback;
643 	dev->config_cb.private = cb->private;
644 	spin_unlock(&dev->irq_lock);
645 }
646 
647 static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
648 {
649 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
650 	u16 num_max = 0;
651 	int i;
652 
653 	for (i = 0; i < dev->vq_num; i++)
654 		if (num_max < dev->vqs[i]->num_max)
655 			num_max = dev->vqs[i]->num_max;
656 
657 	return num_max;
658 }
659 
660 static u32 vduse_vdpa_get_device_id(struct vdpa_device *vdpa)
661 {
662 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
663 
664 	return dev->device_id;
665 }
666 
667 static u32 vduse_vdpa_get_vendor_id(struct vdpa_device *vdpa)
668 {
669 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
670 
671 	return dev->vendor_id;
672 }
673 
674 static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa)
675 {
676 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
677 
678 	return dev->status;
679 }
680 
681 static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status)
682 {
683 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
684 
685 	if (vduse_dev_set_status(dev, status))
686 		return;
687 
688 	dev->status = status;
689 }
690 
691 static size_t vduse_vdpa_get_config_size(struct vdpa_device *vdpa)
692 {
693 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
694 
695 	return dev->config_size;
696 }
697 
698 static void vduse_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset,
699 				  void *buf, unsigned int len)
700 {
701 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
702 
703 	/* Initialize the buffer in case of partial copy. */
704 	memset(buf, 0, len);
705 
706 	if (offset > dev->config_size)
707 		return;
708 
709 	if (len > dev->config_size - offset)
710 		len = dev->config_size - offset;
711 
712 	memcpy(buf, dev->config + offset, len);
713 }
714 
715 static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset,
716 			const void *buf, unsigned int len)
717 {
718 	/* We only support a read-only configuration space for now */
719 }
720 
721 static int vduse_vdpa_reset(struct vdpa_device *vdpa)
722 {
723 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
724 	int ret = vduse_dev_set_status(dev, 0);
725 
726 	vduse_dev_reset(dev);
727 
728 	return ret;
729 }
730 
731 static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
732 {
733 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
734 
735 	return dev->generation;
736 }
737 
738 static int vduse_vdpa_set_vq_affinity(struct vdpa_device *vdpa, u16 idx,
739 				      const struct cpumask *cpu_mask)
740 {
741 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
742 
743 	if (cpu_mask)
744 		cpumask_copy(&dev->vqs[idx]->irq_affinity, cpu_mask);
745 	else
746 		cpumask_setall(&dev->vqs[idx]->irq_affinity);
747 
748 	return 0;
749 }
750 
751 static const struct cpumask *
752 vduse_vdpa_get_vq_affinity(struct vdpa_device *vdpa, u16 idx)
753 {
754 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
755 
756 	return &dev->vqs[idx]->irq_affinity;
757 }
758 
759 static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
760 				unsigned int asid,
761 				struct vhost_iotlb *iotlb)
762 {
763 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
764 	int ret;
765 
766 	ret = vduse_domain_set_map(dev->domain, iotlb);
767 	if (ret)
768 		return ret;
769 
770 	ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
771 	if (ret) {
772 		vduse_domain_clear_map(dev->domain, iotlb);
773 		return ret;
774 	}
775 
776 	return 0;
777 }
778 
779 static void vduse_vdpa_free(struct vdpa_device *vdpa)
780 {
781 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
782 
783 	dev->vdev = NULL;
784 }
785 
786 static const struct vdpa_config_ops vduse_vdpa_config_ops = {
787 	.set_vq_address		= vduse_vdpa_set_vq_address,
788 	.kick_vq		= vduse_vdpa_kick_vq,
789 	.set_vq_cb		= vduse_vdpa_set_vq_cb,
790 	.set_vq_num             = vduse_vdpa_set_vq_num,
791 	.get_vq_size		= vduse_vdpa_get_vq_size,
792 	.set_vq_ready		= vduse_vdpa_set_vq_ready,
793 	.get_vq_ready		= vduse_vdpa_get_vq_ready,
794 	.set_vq_state		= vduse_vdpa_set_vq_state,
795 	.get_vq_state		= vduse_vdpa_get_vq_state,
796 	.get_vq_align		= vduse_vdpa_get_vq_align,
797 	.get_device_features	= vduse_vdpa_get_device_features,
798 	.set_driver_features	= vduse_vdpa_set_driver_features,
799 	.get_driver_features	= vduse_vdpa_get_driver_features,
800 	.set_config_cb		= vduse_vdpa_set_config_cb,
801 	.get_vq_num_max		= vduse_vdpa_get_vq_num_max,
802 	.get_device_id		= vduse_vdpa_get_device_id,
803 	.get_vendor_id		= vduse_vdpa_get_vendor_id,
804 	.get_status		= vduse_vdpa_get_status,
805 	.set_status		= vduse_vdpa_set_status,
806 	.get_config_size	= vduse_vdpa_get_config_size,
807 	.get_config		= vduse_vdpa_get_config,
808 	.set_config		= vduse_vdpa_set_config,
809 	.get_generation		= vduse_vdpa_get_generation,
810 	.set_vq_affinity	= vduse_vdpa_set_vq_affinity,
811 	.get_vq_affinity	= vduse_vdpa_get_vq_affinity,
812 	.reset			= vduse_vdpa_reset,
813 	.set_map		= vduse_vdpa_set_map,
814 	.free			= vduse_vdpa_free,
815 };
816 
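/*
 * virtio mapping ops backed by the VDUSE IOVA domain.  Streaming DMA below
 * bounce_size is bounced through kernel (or user-registered) pages, and
 * coherent allocations are served by the domain as well.  The
 * vduse_iova_domain pointer travels in the virtio_map token that is set
 * up when the vdpa device is registered.
 */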
817 static void vduse_dev_sync_single_for_device(union virtio_map token,
818 					     dma_addr_t dma_addr, size_t size,
819 					     enum dma_data_direction dir)
820 {
821 	struct vduse_iova_domain *domain = token.iova_domain;
822 
823 	vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
824 }
825 
826 static void vduse_dev_sync_single_for_cpu(union virtio_map token,
827 					     dma_addr_t dma_addr, size_t size,
828 					     enum dma_data_direction dir)
829 {
830 	struct vduse_iova_domain *domain = token.iova_domain;
831 
832 	vduse_domain_sync_single_for_cpu(domain, dma_addr, size, dir);
833 }
834 
835 static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page,
836 				     unsigned long offset, size_t size,
837 				     enum dma_data_direction dir,
838 				     unsigned long attrs)
839 {
840 	struct vduse_iova_domain *domain = token.iova_domain;
841 
842 	return vduse_domain_map_page(domain, page, offset, size, dir, attrs);
843 }
844 
845 static void vduse_dev_unmap_page(union virtio_map token, dma_addr_t dma_addr,
846 				 size_t size, enum dma_data_direction dir,
847 				 unsigned long attrs)
848 {
849 	struct vduse_iova_domain *domain = token.iova_domain;
850 
851 	return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
852 }
853 
854 static void *vduse_dev_alloc_coherent(union virtio_map token, size_t size,
855 				      dma_addr_t *dma_addr, gfp_t flag)
856 {
857 	struct vduse_iova_domain *domain = token.iova_domain;
858 	unsigned long iova;
859 	void *addr;
860 
861 	*dma_addr = DMA_MAPPING_ERROR;
862 	addr = vduse_domain_alloc_coherent(domain, size,
863 					   (dma_addr_t *)&iova, flag);
864 	if (!addr)
865 		return NULL;
866 
867 	*dma_addr = (dma_addr_t)iova;
868 
869 	return addr;
870 }
871 
872 static void vduse_dev_free_coherent(union virtio_map token, size_t size,
873 				    void *vaddr, dma_addr_t dma_addr,
874 				    unsigned long attrs)
875 {
876 	struct vduse_iova_domain *domain = token.iova_domain;
877 
878 	vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs);
879 }
880 
881 static bool vduse_dev_need_sync(union virtio_map token, dma_addr_t dma_addr)
882 {
883 	struct vduse_iova_domain *domain = token.iova_domain;
884 
885 	return dma_addr < domain->bounce_size;
886 }
887 
888 static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr)
889 {
890 	if (unlikely(dma_addr == DMA_MAPPING_ERROR))
891 		return -ENOMEM;
892 	return 0;
893 }
894 
895 static size_t vduse_dev_max_mapping_size(union virtio_map token)
896 {
897 	struct vduse_iova_domain *domain = token.iova_domain;
898 
899 	return domain->bounce_size;
900 }
901 
902 static const struct virtio_map_ops vduse_map_ops = {
903 	.sync_single_for_device = vduse_dev_sync_single_for_device,
904 	.sync_single_for_cpu = vduse_dev_sync_single_for_cpu,
905 	.map_page = vduse_dev_map_page,
906 	.unmap_page = vduse_dev_unmap_page,
907 	.alloc = vduse_dev_alloc_coherent,
908 	.free = vduse_dev_free_coherent,
909 	.need_sync = vduse_dev_need_sync,
910 	.mapping_error = vduse_dev_mapping_error,
911 	.max_mapping_size = vduse_dev_max_mapping_size,
912 };
913 
914 static unsigned int perm_to_file_flags(u8 perm)
915 {
916 	unsigned int flags = 0;
917 
918 	switch (perm) {
919 	case VDUSE_ACCESS_WO:
920 		flags |= O_WRONLY;
921 		break;
922 	case VDUSE_ACCESS_RO:
923 		flags |= O_RDONLY;
924 		break;
925 	case VDUSE_ACCESS_RW:
926 		flags |= O_RDWR;
927 		break;
928 	default:
929 		WARN(1, "invalid vhost IOTLB permission\n");
930 		break;
931 	}
932 
933 	return flags;
934 }
935 
936 static int vduse_kickfd_setup(struct vduse_dev *dev,
937 			struct vduse_vq_eventfd *eventfd)
938 {
939 	struct eventfd_ctx *ctx = NULL;
940 	struct vduse_virtqueue *vq;
941 	u32 index;
942 
943 	if (eventfd->index >= dev->vq_num)
944 		return -EINVAL;
945 
946 	index = array_index_nospec(eventfd->index, dev->vq_num);
947 	vq = dev->vqs[index];
948 	if (eventfd->fd >= 0) {
949 		ctx = eventfd_ctx_fdget(eventfd->fd);
950 		if (IS_ERR(ctx))
951 			return PTR_ERR(ctx);
952 	} else if (eventfd->fd != VDUSE_EVENTFD_DEASSIGN)
953 		return 0;
954 
955 	spin_lock(&vq->kick_lock);
956 	if (vq->kickfd)
957 		eventfd_ctx_put(vq->kickfd);
958 	vq->kickfd = ctx;
959 	if (vq->ready && vq->kicked && vq->kickfd) {
960 		eventfd_signal(vq->kickfd);
961 		vq->kicked = false;
962 	}
963 	spin_unlock(&vq->kick_lock);
964 
965 	return 0;
966 }
967 
968 static bool vduse_dev_is_ready(struct vduse_dev *dev)
969 {
970 	int i;
971 
972 	for (i = 0; i < dev->vq_num; i++)
973 		if (!dev->vqs[i]->num_max)
974 			return false;
975 
976 	return true;
977 }
978 
979 static void vduse_dev_irq_inject(struct work_struct *work)
980 {
981 	struct vduse_dev *dev = container_of(work, struct vduse_dev, inject);
982 
983 	spin_lock_bh(&dev->irq_lock);
984 	if (dev->config_cb.callback)
985 		dev->config_cb.callback(dev->config_cb.private);
986 	spin_unlock_bh(&dev->irq_lock);
987 }
988 
989 static void vduse_vq_irq_inject(struct work_struct *work)
990 {
991 	struct vduse_virtqueue *vq = container_of(work,
992 					struct vduse_virtqueue, inject);
993 
994 	spin_lock_bh(&vq->irq_lock);
995 	if (vq->ready && vq->cb.callback)
996 		vq->cb.callback(vq->cb.private);
997 	spin_unlock_bh(&vq->irq_lock);
998 }
999 
1000 static bool vduse_vq_signal_irqfd(struct vduse_virtqueue *vq)
1001 {
1002 	bool signal = false;
1003 
1004 	if (!vq->cb.trigger)
1005 		return false;
1006 
1007 	spin_lock_irq(&vq->irq_lock);
1008 	if (vq->ready && vq->cb.trigger) {
1009 		eventfd_signal(vq->cb.trigger);
1010 		signal = true;
1011 	}
1012 	spin_unlock_irq(&vq->irq_lock);
1013 
1014 	return signal;
1015 }
1016 
1017 static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
1018 				    struct work_struct *irq_work,
1019 				    int irq_effective_cpu)
1020 {
1021 	int ret = -EINVAL;
1022 
1023 	down_read(&dev->rwsem);
1024 	if (!(dev->status & VIRTIO_CONFIG_S_DRIVER_OK))
1025 		goto unlock;
1026 
1027 	ret = 0;
1028 	if (irq_effective_cpu == IRQ_UNBOUND)
1029 		queue_work(vduse_irq_wq, irq_work);
1030 	else
1031 		queue_work_on(irq_effective_cpu,
1032 			      vduse_irq_bound_wq, irq_work);
1033 unlock:
1034 	up_read(&dev->rwsem);
1035 
1036 	return ret;
1037 }
1038 
1039 static int vduse_dev_dereg_umem(struct vduse_dev *dev,
1040 				u64 iova, u64 size)
1041 {
1042 	int ret;
1043 
1044 	mutex_lock(&dev->mem_lock);
1045 	ret = -ENOENT;
1046 	if (!dev->umem)
1047 		goto unlock;
1048 
1049 	ret = -EINVAL;
1050 	if (!dev->domain)
1051 		goto unlock;
1052 
1053 	if (dev->umem->iova != iova || size != dev->domain->bounce_size)
1054 		goto unlock;
1055 
1056 	vduse_domain_remove_user_bounce_pages(dev->domain);
1057 	unpin_user_pages_dirty_lock(dev->umem->pages,
1058 				    dev->umem->npages, true);
1059 	atomic64_sub(dev->umem->npages, &dev->umem->mm->pinned_vm);
1060 	mmdrop(dev->umem->mm);
1061 	vfree(dev->umem->pages);
1062 	kfree(dev->umem);
1063 	dev->umem = NULL;
1064 	ret = 0;
1065 unlock:
1066 	mutex_unlock(&dev->mem_lock);
1067 	return ret;
1068 }
1069 
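/*
 * Register userspace memory as the bounce buffer: the pages are pinned
 * (accounted against RLIMIT_MEMLOCK) and handed to the IOVA domain so
 * that bouncing happens directly into the daemon's memory instead of
 * kernel pages.  Only a mapping that covers the whole bounce area at
 * IOVA 0 is accepted.
 */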
1070 static int vduse_dev_reg_umem(struct vduse_dev *dev,
1071 			      u64 iova, u64 uaddr, u64 size)
1072 {
1073 	struct page **page_list = NULL;
1074 	struct vduse_umem *umem = NULL;
1075 	long pinned = 0;
1076 	unsigned long npages, lock_limit;
1077 	int ret;
1078 
1079 	if (!dev->domain || !dev->domain->bounce_map ||
1080 	    size != dev->domain->bounce_size ||
1081 	    iova != 0 || uaddr & ~PAGE_MASK)
1082 		return -EINVAL;
1083 
1084 	mutex_lock(&dev->mem_lock);
1085 	ret = -EEXIST;
1086 	if (dev->umem)
1087 		goto unlock;
1088 
1089 	ret = -ENOMEM;
1090 	npages = size >> PAGE_SHIFT;
1091 	page_list = __vmalloc(array_size(npages, sizeof(struct page *)),
1092 			      GFP_KERNEL_ACCOUNT);
1093 	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
1094 	if (!page_list || !umem)
1095 		goto unlock;
1096 
1097 	mmap_read_lock(current->mm);
1098 
1099 	lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
1100 	if (npages + atomic64_read(&current->mm->pinned_vm) > lock_limit)
1101 		goto out;
1102 
1103 	pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE,
1104 				page_list);
1105 	if (pinned != npages) {
1106 		ret = pinned < 0 ? pinned : -ENOMEM;
1107 		goto out;
1108 	}
1109 
1110 	ret = vduse_domain_add_user_bounce_pages(dev->domain,
1111 						 page_list, pinned);
1112 	if (ret)
1113 		goto out;
1114 
1115 	atomic64_add(npages, &current->mm->pinned_vm);
1116 
1117 	umem->pages = page_list;
1118 	umem->npages = pinned;
1119 	umem->iova = iova;
1120 	umem->mm = current->mm;
1121 	mmgrab(current->mm);
1122 
1123 	dev->umem = umem;
1124 out:
1125 	if (ret && pinned > 0)
1126 		unpin_user_pages(page_list, pinned);
1127 
1128 	mmap_read_unlock(current->mm);
1129 unlock:
1130 	if (ret) {
1131 		vfree(page_list);
1132 		kfree(umem);
1133 	}
1134 	mutex_unlock(&dev->mem_lock);
1135 	return ret;
1136 }
1137 
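/*
 * Advance to the next online CPU in the virtqueue's IRQ affinity mask,
 * wrapping around, so bound interrupt injection work is spread
 * round-robin across the allowed CPUs.
 */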
1138 static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)
1139 {
1140 	int curr_cpu = vq->irq_effective_cpu;
1141 
1142 	while (true) {
1143 		curr_cpu = cpumask_next(curr_cpu, &vq->irq_affinity);
1144 		if (cpu_online(curr_cpu))
1145 			break;
1146 
1147 		if (curr_cpu >= nr_cpu_ids)
1148 			curr_cpu = IRQ_UNBOUND;
1149 	}
1150 
1151 	vq->irq_effective_cpu = curr_cpu;
1152 }
1153 
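/*
 * ioctl interface of /dev/vduse/$NAME used by the userspace daemon to set
 * up virtqueues and kick eventfds, inject config/vq interrupts, fetch
 * IOTLB file descriptors and register userspace bounce memory.
 */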
1154 static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
1155 			    unsigned long arg)
1156 {
1157 	struct vduse_dev *dev = file->private_data;
1158 	void __user *argp = (void __user *)arg;
1159 	int ret;
1160 
1161 	if (unlikely(dev->broken))
1162 		return -EPERM;
1163 
1164 	switch (cmd) {
1165 	case VDUSE_IOTLB_GET_FD: {
1166 		struct vduse_iotlb_entry entry;
1167 		struct vhost_iotlb_map *map;
1168 		struct vdpa_map_file *map_file;
1169 		struct file *f = NULL;
1170 
1171 		ret = -EFAULT;
1172 		if (copy_from_user(&entry, argp, sizeof(entry)))
1173 			break;
1174 
1175 		ret = -EINVAL;
1176 		if (entry.start > entry.last)
1177 			break;
1178 
1179 		mutex_lock(&dev->domain_lock);
1180 		if (!dev->domain) {
1181 			mutex_unlock(&dev->domain_lock);
1182 			break;
1183 		}
1184 		spin_lock(&dev->domain->iotlb_lock);
1185 		map = vhost_iotlb_itree_first(dev->domain->iotlb,
1186 					      entry.start, entry.last);
1187 		if (map) {
1188 			map_file = (struct vdpa_map_file *)map->opaque;
1189 			f = get_file(map_file->file);
1190 			entry.offset = map_file->offset;
1191 			entry.start = map->start;
1192 			entry.last = map->last;
1193 			entry.perm = map->perm;
1194 		}
1195 		spin_unlock(&dev->domain->iotlb_lock);
1196 		mutex_unlock(&dev->domain_lock);
1197 		ret = -EINVAL;
1198 		if (!f)
1199 			break;
1200 
1201 		ret = -EFAULT;
1202 		if (copy_to_user(argp, &entry, sizeof(entry))) {
1203 			fput(f);
1204 			break;
1205 		}
1206 		ret = receive_fd(f, NULL, perm_to_file_flags(entry.perm));
1207 		fput(f);
1208 		break;
1209 	}
1210 	case VDUSE_DEV_GET_FEATURES:
1211 		/*
1212 		 * Just mirror what the driver wrote here.
1213 		 * The driver is expected to check FEATURES_OK later.
1214 		 */
1215 		ret = put_user(dev->driver_features, (u64 __user *)argp);
1216 		break;
1217 	case VDUSE_DEV_SET_CONFIG: {
1218 		struct vduse_config_data config;
1219 		unsigned long size = offsetof(struct vduse_config_data,
1220 					      buffer);
1221 
1222 		ret = -EFAULT;
1223 		if (copy_from_user(&config, argp, size))
1224 			break;
1225 
1226 		ret = -EINVAL;
1227 		if (config.offset > dev->config_size ||
1228 		    config.length == 0 ||
1229 		    config.length > dev->config_size - config.offset)
1230 			break;
1231 
1232 		ret = -EFAULT;
1233 		if (copy_from_user(dev->config + config.offset, argp + size,
1234 				   config.length))
1235 			break;
1236 
1237 		ret = 0;
1238 		break;
1239 	}
1240 	case VDUSE_DEV_INJECT_CONFIG_IRQ:
1241 		ret = vduse_dev_queue_irq_work(dev, &dev->inject, IRQ_UNBOUND);
1242 		break;
1243 	case VDUSE_VQ_SETUP: {
1244 		struct vduse_vq_config config;
1245 		u32 index;
1246 
1247 		ret = -EFAULT;
1248 		if (copy_from_user(&config, argp, sizeof(config)))
1249 			break;
1250 
1251 		ret = -EINVAL;
1252 		if (config.index >= dev->vq_num)
1253 			break;
1254 
1255 		if (!is_mem_zero((const char *)config.reserved,
1256 				 sizeof(config.reserved)))
1257 			break;
1258 
1259 		index = array_index_nospec(config.index, dev->vq_num);
1260 		dev->vqs[index]->num_max = config.max_size;
1261 		ret = 0;
1262 		break;
1263 	}
1264 	case VDUSE_VQ_GET_INFO: {
1265 		struct vduse_vq_info vq_info;
1266 		struct vduse_virtqueue *vq;
1267 		u32 index;
1268 
1269 		ret = -EFAULT;
1270 		if (copy_from_user(&vq_info, argp, sizeof(vq_info)))
1271 			break;
1272 
1273 		ret = -EINVAL;
1274 		if (vq_info.index >= dev->vq_num)
1275 			break;
1276 
1277 		index = array_index_nospec(vq_info.index, dev->vq_num);
1278 		vq = dev->vqs[index];
1279 		vq_info.desc_addr = vq->desc_addr;
1280 		vq_info.driver_addr = vq->driver_addr;
1281 		vq_info.device_addr = vq->device_addr;
1282 		vq_info.num = vq->num;
1283 
1284 		if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
1285 			vq_info.packed.last_avail_counter =
1286 				vq->state.packed.last_avail_counter;
1287 			vq_info.packed.last_avail_idx =
1288 				vq->state.packed.last_avail_idx;
1289 			vq_info.packed.last_used_counter =
1290 				vq->state.packed.last_used_counter;
1291 			vq_info.packed.last_used_idx =
1292 				vq->state.packed.last_used_idx;
1293 		} else
1294 			vq_info.split.avail_index =
1295 				vq->state.split.avail_index;
1296 
1297 		vq_info.ready = vq->ready;
1298 
1299 		ret = -EFAULT;
1300 		if (copy_to_user(argp, &vq_info, sizeof(vq_info)))
1301 			break;
1302 
1303 		ret = 0;
1304 		break;
1305 	}
1306 	case VDUSE_VQ_SETUP_KICKFD: {
1307 		struct vduse_vq_eventfd eventfd;
1308 
1309 		ret = -EFAULT;
1310 		if (copy_from_user(&eventfd, argp, sizeof(eventfd)))
1311 			break;
1312 
1313 		ret = vduse_kickfd_setup(dev, &eventfd);
1314 		break;
1315 	}
1316 	case VDUSE_VQ_INJECT_IRQ: {
1317 		u32 index;
1318 
1319 		ret = -EFAULT;
1320 		if (get_user(index, (u32 __user *)argp))
1321 			break;
1322 
1323 		ret = -EINVAL;
1324 		if (index >= dev->vq_num)
1325 			break;
1326 
1327 		ret = 0;
1328 		index = array_index_nospec(index, dev->vq_num);
1329 		if (!vduse_vq_signal_irqfd(dev->vqs[index])) {
1330 			vduse_vq_update_effective_cpu(dev->vqs[index]);
1331 			ret = vduse_dev_queue_irq_work(dev,
1332 						&dev->vqs[index]->inject,
1333 						dev->vqs[index]->irq_effective_cpu);
1334 		}
1335 		break;
1336 	}
1337 	case VDUSE_IOTLB_REG_UMEM: {
1338 		struct vduse_iova_umem umem;
1339 
1340 		ret = -EFAULT;
1341 		if (copy_from_user(&umem, argp, sizeof(umem)))
1342 			break;
1343 
1344 		ret = -EINVAL;
1345 		if (!is_mem_zero((const char *)umem.reserved,
1346 				 sizeof(umem.reserved)))
1347 			break;
1348 
1349 		mutex_lock(&dev->domain_lock);
1350 		ret = vduse_dev_reg_umem(dev, umem.iova,
1351 					 umem.uaddr, umem.size);
1352 		mutex_unlock(&dev->domain_lock);
1353 		break;
1354 	}
1355 	case VDUSE_IOTLB_DEREG_UMEM: {
1356 		struct vduse_iova_umem umem;
1357 
1358 		ret = -EFAULT;
1359 		if (copy_from_user(&umem, argp, sizeof(umem)))
1360 			break;
1361 
1362 		ret = -EINVAL;
1363 		if (!is_mem_zero((const char *)umem.reserved,
1364 				 sizeof(umem.reserved)))
1365 			break;
1366 		mutex_lock(&dev->domain_lock);
1367 		ret = vduse_dev_dereg_umem(dev, umem.iova,
1368 					   umem.size);
1369 		mutex_unlock(&dev->domain_lock);
1370 		break;
1371 	}
1372 	case VDUSE_IOTLB_GET_INFO: {
1373 		struct vduse_iova_info info;
1374 		struct vhost_iotlb_map *map;
1375 
1376 		ret = -EFAULT;
1377 		if (copy_from_user(&info, argp, sizeof(info)))
1378 			break;
1379 
1380 		ret = -EINVAL;
1381 		if (info.start > info.last)
1382 			break;
1383 
1384 		if (!is_mem_zero((const char *)info.reserved,
1385 				 sizeof(info.reserved)))
1386 			break;
1387 
1388 		mutex_lock(&dev->domain_lock);
1389 		if (!dev->domain) {
1390 			mutex_unlock(&dev->domain_lock);
1391 			break;
1392 		}
1393 		spin_lock(&dev->domain->iotlb_lock);
1394 		map = vhost_iotlb_itree_first(dev->domain->iotlb,
1395 					      info.start, info.last);
1396 		if (map) {
1397 			info.start = map->start;
1398 			info.last = map->last;
1399 			info.capability = 0;
1400 			if (dev->domain->bounce_map && map->start == 0 &&
1401 			    map->last == dev->domain->bounce_size - 1)
1402 				info.capability |= VDUSE_IOVA_CAP_UMEM;
1403 		}
1404 		spin_unlock(&dev->domain->iotlb_lock);
1405 		mutex_unlock(&dev->domain_lock);
1406 		if (!map)
1407 			break;
1408 
1409 		ret = -EFAULT;
1410 		if (copy_to_user(argp, &info, sizeof(info)))
1411 			break;
1412 
1413 		ret = 0;
1414 		break;
1415 	}
1416 	default:
1417 		ret = -ENOIOCTLCMD;
1418 		break;
1419 	}
1420 
1421 	return ret;
1422 }
1423 
1424 static int vduse_dev_release(struct inode *inode, struct file *file)
1425 {
1426 	struct vduse_dev *dev = file->private_data;
1427 
1428 	mutex_lock(&dev->domain_lock);
1429 	if (dev->domain)
1430 		vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size);
1431 	mutex_unlock(&dev->domain_lock);
1432 	spin_lock(&dev->msg_lock);
1433 	/* Make sure the in-flight messages can be processed after reconnection */
1434 	list_splice_init(&dev->recv_list, &dev->send_list);
1435 	spin_unlock(&dev->msg_lock);
1436 	dev->connected = false;
1437 
1438 	return 0;
1439 }
1440 
1441 static struct vduse_dev *vduse_dev_get_from_minor(int minor)
1442 {
1443 	struct vduse_dev *dev;
1444 
1445 	mutex_lock(&vduse_lock);
1446 	dev = idr_find(&vduse_idr, minor);
1447 	mutex_unlock(&vduse_lock);
1448 
1449 	return dev;
1450 }
1451 
1452 static int vduse_dev_open(struct inode *inode, struct file *file)
1453 {
1454 	int ret;
1455 	struct vduse_dev *dev = vduse_dev_get_from_minor(iminor(inode));
1456 
1457 	if (!dev)
1458 		return -ENODEV;
1459 
1460 	ret = -EBUSY;
1461 	mutex_lock(&dev->lock);
1462 	if (dev->connected)
1463 		goto unlock;
1464 
1465 	ret = 0;
1466 	dev->connected = true;
1467 	file->private_data = dev;
1468 unlock:
1469 	mutex_unlock(&dev->lock);
1470 
1471 	return ret;
1472 }
1473 
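/*
 * A rough, illustrative sketch of the userspace side of this message
 * channel, inferred from the read/write/poll handlers above (not a
 * complete daemon; the "..." parts are device specific and the field
 * names come from <uapi/linux/vduse.h>):
 *
 *	struct vduse_dev_request req;
 *	struct vduse_dev_response resp;
 *
 *	while (read(dev_fd, &req, sizeof(req)) == sizeof(req)) {
 *		memset(&resp, 0, sizeof(resp));	// reserved[] must stay zero
 *		resp.request_id = req.request_id;
 *		resp.result = VDUSE_REQ_RESULT_OK;
 *
 *		switch (req.type) {
 *		case VDUSE_GET_VQ_STATE:
 *			resp.vq_state.split.avail_index = ...;
 *			break;
 *		case VDUSE_SET_STATUS:
 *		case VDUSE_UPDATE_IOTLB:
 *			... apply req.s.status or refresh mappings ...
 *			break;
 *		default:
 *			resp.result = VDUSE_REQ_RESULT_FAILED;
 *		}
 *
 *		write(dev_fd, &resp, sizeof(resp));
 *	}
 */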
1474 static const struct file_operations vduse_dev_fops = {
1475 	.owner		= THIS_MODULE,
1476 	.open		= vduse_dev_open,
1477 	.release	= vduse_dev_release,
1478 	.read_iter	= vduse_dev_read_iter,
1479 	.write_iter	= vduse_dev_write_iter,
1480 	.poll		= vduse_dev_poll,
1481 	.unlocked_ioctl	= vduse_dev_ioctl,
1482 	.compat_ioctl	= compat_ptr_ioctl,
1483 	.llseek		= noop_llseek,
1484 };
1485 
1486 static ssize_t irq_cb_affinity_show(struct vduse_virtqueue *vq, char *buf)
1487 {
1488 	return sysfs_emit(buf, "%*pb\n", cpumask_pr_args(&vq->irq_affinity));
1489 }
1490 
1491 static ssize_t irq_cb_affinity_store(struct vduse_virtqueue *vq,
1492 				     const char *buf, size_t count)
1493 {
1494 	cpumask_var_t new_value;
1495 	int ret;
1496 
1497 	if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
1498 		return -ENOMEM;
1499 
1500 	ret = cpumask_parse(buf, new_value);
1501 	if (ret)
1502 		goto free_mask;
1503 
1504 	ret = -EINVAL;
1505 	if (!cpumask_intersects(new_value, cpu_online_mask))
1506 		goto free_mask;
1507 
1508 	cpumask_copy(&vq->irq_affinity, new_value);
1509 	ret = count;
1510 free_mask:
1511 	free_cpumask_var(new_value);
1512 	return ret;
1513 }
1514 
1515 struct vq_sysfs_entry {
1516 	struct attribute attr;
1517 	ssize_t (*show)(struct vduse_virtqueue *vq, char *buf);
1518 	ssize_t (*store)(struct vduse_virtqueue *vq, const char *buf,
1519 			 size_t count);
1520 };
1521 
1522 static struct vq_sysfs_entry irq_cb_affinity_attr = __ATTR_RW(irq_cb_affinity);
1523 
1524 static struct attribute *vq_attrs[] = {
1525 	&irq_cb_affinity_attr.attr,
1526 	NULL,
1527 };
1528 ATTRIBUTE_GROUPS(vq);
1529 
1530 static ssize_t vq_attr_show(struct kobject *kobj, struct attribute *attr,
1531 			    char *buf)
1532 {
1533 	struct vduse_virtqueue *vq = container_of(kobj,
1534 					struct vduse_virtqueue, kobj);
1535 	struct vq_sysfs_entry *entry = container_of(attr,
1536 					struct vq_sysfs_entry, attr);
1537 
1538 	if (!entry->show)
1539 		return -EIO;
1540 
1541 	return entry->show(vq, buf);
1542 }
1543 
1544 static ssize_t vq_attr_store(struct kobject *kobj, struct attribute *attr,
1545 			     const char *buf, size_t count)
1546 {
1547 	struct vduse_virtqueue *vq = container_of(kobj,
1548 					struct vduse_virtqueue, kobj);
1549 	struct vq_sysfs_entry *entry = container_of(attr,
1550 					struct vq_sysfs_entry, attr);
1551 
1552 	if (!entry->store)
1553 		return -EIO;
1554 
1555 	return entry->store(vq, buf, count);
1556 }
1557 
1558 static const struct sysfs_ops vq_sysfs_ops = {
1559 	.show = vq_attr_show,
1560 	.store = vq_attr_store,
1561 };
1562 
1563 static void vq_release(struct kobject *kobj)
1564 {
1565 	struct vduse_virtqueue *vq = container_of(kobj,
1566 					struct vduse_virtqueue, kobj);
1567 	kfree(vq);
1568 }
1569 
1570 static const struct kobj_type vq_type = {
1571 	.release	= vq_release,
1572 	.sysfs_ops	= &vq_sysfs_ops,
1573 	.default_groups	= vq_groups,
1574 };
1575 
1576 static char *vduse_devnode(const struct device *dev, umode_t *mode)
1577 {
1578 	return kasprintf(GFP_KERNEL, "vduse/%s", dev_name(dev));
1579 }
1580 
1581 static const struct class vduse_class = {
1582 	.name = "vduse",
1583 	.devnode = vduse_devnode,
1584 };
1585 
1586 static void vduse_dev_deinit_vqs(struct vduse_dev *dev)
1587 {
1588 	int i;
1589 
1590 	if (!dev->vqs)
1591 		return;
1592 
1593 	for (i = 0; i < dev->vq_num; i++)
1594 		kobject_put(&dev->vqs[i]->kobj);
1595 	kfree(dev->vqs);
1596 }
1597 
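/*
 * Allocate the virtqueue array and expose one "vqN" kobject per queue
 * under the device so that the irq_cb_affinity attribute can be tuned
 * from sysfs.  The queues are torn down again in vduse_dev_deinit_vqs().
 */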
1598 static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
1599 {
1600 	int ret, i;
1601 
1602 	dev->vq_align = vq_align;
1603 	dev->vq_num = vq_num;
1604 	dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL);
1605 	if (!dev->vqs)
1606 		return -ENOMEM;
1607 
1608 	for (i = 0; i < vq_num; i++) {
1609 		dev->vqs[i] = kzalloc(sizeof(*dev->vqs[i]), GFP_KERNEL);
1610 		if (!dev->vqs[i]) {
1611 			ret = -ENOMEM;
1612 			goto err;
1613 		}
1614 
1615 		dev->vqs[i]->index = i;
1616 		dev->vqs[i]->irq_effective_cpu = IRQ_UNBOUND;
1617 		INIT_WORK(&dev->vqs[i]->inject, vduse_vq_irq_inject);
1618 		INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work);
1619 		spin_lock_init(&dev->vqs[i]->kick_lock);
1620 		spin_lock_init(&dev->vqs[i]->irq_lock);
1621 		cpumask_setall(&dev->vqs[i]->irq_affinity);
1622 
1623 		kobject_init(&dev->vqs[i]->kobj, &vq_type);
1624 		ret = kobject_add(&dev->vqs[i]->kobj,
1625 				  &dev->dev->kobj, "vq%d", i);
1626 		if (ret) {
1627 			kfree(dev->vqs[i]);
1628 			goto err;
1629 		}
1630 	}
1631 
1632 	return 0;
1633 err:
1634 	while (i--)
1635 		kobject_put(&dev->vqs[i]->kobj);
1636 	kfree(dev->vqs);
1637 	dev->vqs = NULL;
1638 	return ret;
1639 }
1640 
1641 static struct vduse_dev *vduse_dev_create(void)
1642 {
1643 	struct vduse_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);
1644 
1645 	if (!dev)
1646 		return NULL;
1647 
1648 	mutex_init(&dev->lock);
1649 	mutex_init(&dev->mem_lock);
1650 	mutex_init(&dev->domain_lock);
1651 	spin_lock_init(&dev->msg_lock);
1652 	INIT_LIST_HEAD(&dev->send_list);
1653 	INIT_LIST_HEAD(&dev->recv_list);
1654 	spin_lock_init(&dev->irq_lock);
1655 	init_rwsem(&dev->rwsem);
1656 
1657 	INIT_WORK(&dev->inject, vduse_dev_irq_inject);
1658 	init_waitqueue_head(&dev->waitq);
1659 
1660 	return dev;
1661 }
1662 
1663 static void vduse_dev_destroy(struct vduse_dev *dev)
1664 {
1665 	kfree(dev);
1666 }
1667 
1668 static struct vduse_dev *vduse_find_dev(const char *name)
1669 {
1670 	struct vduse_dev *dev;
1671 	int id;
1672 
1673 	idr_for_each_entry(&vduse_idr, dev, id)
1674 		if (!strcmp(dev->name, name))
1675 			return dev;
1676 
1677 	return NULL;
1678 }
1679 
1680 static int vduse_destroy_dev(char *name)
1681 {
1682 	struct vduse_dev *dev = vduse_find_dev(name);
1683 
1684 	if (!dev)
1685 		return -EINVAL;
1686 
1687 	mutex_lock(&dev->lock);
1688 	if (dev->vdev || dev->connected) {
1689 		mutex_unlock(&dev->lock);
1690 		return -EBUSY;
1691 	}
1692 	dev->connected = true;
1693 	mutex_unlock(&dev->lock);
1694 
1695 	vduse_dev_reset(dev);
1696 	device_destroy(&vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
1697 	idr_remove(&vduse_idr, dev->minor);
1698 	kvfree(dev->config);
1699 	vduse_dev_deinit_vqs(dev);
1700 	if (dev->domain)
1701 		vduse_domain_destroy(dev->domain);
1702 	kfree(dev->name);
1703 	vduse_dev_destroy(dev);
1704 	module_put(THIS_MODULE);
1705 
1706 	return 0;
1707 }
1708 
1709 static bool device_is_allowed(u32 device_id)
1710 {
1711 	int i;
1712 
1713 	for (i = 0; i < ARRAY_SIZE(allowed_device_id); i++)
1714 		if (allowed_device_id[i] == device_id)
1715 			return true;
1716 
1717 	return false;
1718 }
1719 
1720 static bool features_is_valid(struct vduse_dev_config *config)
1721 {
1722 	if (!(config->features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
1723 		return false;
1724 
1725 	/* We only support a read-only configuration space for now */
1726 	if ((config->device_id == VIRTIO_ID_BLOCK) &&
1727 			(config->features & BIT_ULL(VIRTIO_BLK_F_CONFIG_WCE)))
1728 		return false;
1729 	else if ((config->device_id == VIRTIO_ID_NET) &&
1730 			(config->features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
1731 		return false;
1732 
1733 	if ((config->device_id == VIRTIO_ID_NET) &&
1734 			!(config->features & BIT_ULL(VIRTIO_F_VERSION_1)))
1735 		return false;
1736 
1737 	return true;
1738 }
1739 
1740 static bool vduse_validate_config(struct vduse_dev_config *config)
1741 {
1742 	if (!is_mem_zero((const char *)config->reserved,
1743 			 sizeof(config->reserved)))
1744 		return false;
1745 
1746 	if (config->vq_align > PAGE_SIZE)
1747 		return false;
1748 
1749 	if (config->config_size > PAGE_SIZE)
1750 		return false;
1751 
1752 	if (config->vq_num > 0xffff)
1753 		return false;
1754 
1755 	if (!config->name[0])
1756 		return false;
1757 
1758 	if (!device_is_allowed(config->device_id))
1759 		return false;
1760 
1761 	if (!features_is_valid(config))
1762 		return false;
1763 
1764 	return true;
1765 }
1766 
1767 static ssize_t msg_timeout_show(struct device *device,
1768 				struct device_attribute *attr, char *buf)
1769 {
1770 	struct vduse_dev *dev = dev_get_drvdata(device);
1771 
1772 	return sysfs_emit(buf, "%u\n", dev->msg_timeout);
1773 }
1774 
1775 static ssize_t msg_timeout_store(struct device *device,
1776 				 struct device_attribute *attr,
1777 				 const char *buf, size_t count)
1778 {
1779 	struct vduse_dev *dev = dev_get_drvdata(device);
1780 	int ret;
1781 
1782 	ret = kstrtouint(buf, 10, &dev->msg_timeout);
1783 	if (ret < 0)
1784 		return ret;
1785 
1786 	return count;
1787 }
1788 
1789 static DEVICE_ATTR_RW(msg_timeout);
1790 
1791 static ssize_t bounce_size_show(struct device *device,
1792 				struct device_attribute *attr, char *buf)
1793 {
1794 	struct vduse_dev *dev = dev_get_drvdata(device);
1795 
1796 	return sysfs_emit(buf, "%u\n", dev->bounce_size);
1797 }
1798 
1799 static ssize_t bounce_size_store(struct device *device,
1800 				 struct device_attribute *attr,
1801 				 const char *buf, size_t count)
1802 {
1803 	struct vduse_dev *dev = dev_get_drvdata(device);
1804 	unsigned int bounce_size;
1805 	int ret;
1806 
1807 	ret = -EPERM;
1808 	mutex_lock(&dev->domain_lock);
1809 	if (dev->domain)
1810 		goto unlock;
1811 
1812 	ret = kstrtouint(buf, 10, &bounce_size);
1813 	if (ret < 0)
1814 		goto unlock;
1815 
1816 	ret = -EINVAL;
1817 	if (bounce_size > VDUSE_MAX_BOUNCE_SIZE ||
1818 	    bounce_size < VDUSE_MIN_BOUNCE_SIZE)
1819 		goto unlock;
1820 
1821 	dev->bounce_size = bounce_size & PAGE_MASK;
1822 	ret = count;
1823 unlock:
1824 	mutex_unlock(&dev->domain_lock);
1825 	return ret;
1826 }
1827 
1828 static DEVICE_ATTR_RW(bounce_size);
1829 
1830 static struct attribute *vduse_dev_attrs[] = {
1831 	&dev_attr_msg_timeout.attr,
1832 	&dev_attr_bounce_size.attr,
1833 	NULL
1834 };
1835 
1836 ATTRIBUTE_GROUPS(vduse_dev);
1837 
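/*
 * Create a VDUSE device from a VDUSE_CREATE_DEV request: allocate the
 * bookkeeping structure, reserve an idr slot/minor, create the character
 * device node under /dev/vduse/ and initialize the virtqueues.  The vdpa
 * device itself is only instantiated later through the management API
 * (vdpa_dev_add()).
 */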
1838 static int vduse_create_dev(struct vduse_dev_config *config,
1839 			    void *config_buf, u64 api_version)
1840 {
1841 	int ret;
1842 	struct vduse_dev *dev;
1843 
1844 	ret = -EPERM;
1845 	if ((config->device_id == VIRTIO_ID_NET) && !capable(CAP_NET_ADMIN))
1846 		goto err;
1847 
1848 	ret = -EEXIST;
1849 	if (vduse_find_dev(config->name))
1850 		goto err;
1851 
1852 	ret = -ENOMEM;
1853 	dev = vduse_dev_create();
1854 	if (!dev)
1855 		goto err;
1856 
1857 	dev->api_version = api_version;
1858 	dev->device_features = config->features;
1859 	dev->device_id = config->device_id;
1860 	dev->vendor_id = config->vendor_id;
1861 	dev->name = kstrdup(config->name, GFP_KERNEL);
1862 	if (!dev->name)
1863 		goto err_str;
1864 
1865 	dev->bounce_size = VDUSE_BOUNCE_SIZE;
1866 	dev->config = config_buf;
1867 	dev->config_size = config->config_size;
1868 
1869 	ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL);
1870 	if (ret < 0)
1871 		goto err_idr;
1872 
1873 	dev->minor = ret;
1874 	dev->msg_timeout = VDUSE_MSG_DEFAULT_TIMEOUT;
1875 	dev->dev = device_create_with_groups(&vduse_class, NULL,
1876 				MKDEV(MAJOR(vduse_major), dev->minor),
1877 				dev, vduse_dev_groups, "%s", config->name);
1878 	if (IS_ERR(dev->dev)) {
1879 		ret = PTR_ERR(dev->dev);
1880 		goto err_dev;
1881 	}
1882 
1883 	ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num);
1884 	if (ret)
1885 		goto err_vqs;
1886 
1887 	__module_get(THIS_MODULE);
1888 
1889 	return 0;
1890 err_vqs:
1891 	device_destroy(&vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
1892 err_dev:
1893 	idr_remove(&vduse_idr, dev->minor);
1894 err_idr:
1895 	kfree(dev->name);
1896 err_str:
1897 	vduse_dev_destroy(dev);
1898 err:
1899 	return ret;
1900 }
1901 
1902 static long vduse_ioctl(struct file *file, unsigned int cmd,
1903 			unsigned long arg)
1904 {
1905 	int ret;
1906 	void __user *argp = (void __user *)arg;
1907 	struct vduse_control *control = file->private_data;
1908 
1909 	mutex_lock(&vduse_lock);
1910 	switch (cmd) {
1911 	case VDUSE_GET_API_VERSION:
1912 		ret = put_user(control->api_version, (u64 __user *)argp);
1913 		break;
1914 	case VDUSE_SET_API_VERSION: {
1915 		u64 api_version;
1916 
1917 		ret = -EFAULT;
1918 		if (get_user(api_version, (u64 __user *)argp))
1919 			break;
1920 
1921 		ret = -EINVAL;
1922 		if (api_version > VDUSE_API_VERSION)
1923 			break;
1924 
1925 		ret = 0;
1926 		control->api_version = api_version;
1927 		break;
1928 	}
1929 	case VDUSE_CREATE_DEV: {
1930 		struct vduse_dev_config config;
1931 		unsigned long size = offsetof(struct vduse_dev_config, config);
1932 		void *buf;
1933 
1934 		ret = -EFAULT;
1935 		if (copy_from_user(&config, argp, size))
1936 			break;
1937 
1938 		ret = -EINVAL;
1939 		if (!vduse_validate_config(&config))
1940 			break;
1941 
1942 		buf = vmemdup_user(argp + size, config.config_size);
1943 		if (IS_ERR(buf)) {
1944 			ret = PTR_ERR(buf);
1945 			break;
1946 		}
1947 		config.name[VDUSE_NAME_MAX - 1] = '\0';
1948 		ret = vduse_create_dev(&config, buf, control->api_version);
1949 		if (ret)
1950 			kvfree(buf);
1951 		break;
1952 	}
1953 	case VDUSE_DESTROY_DEV: {
1954 		char name[VDUSE_NAME_MAX];
1955 
1956 		ret = -EFAULT;
1957 		if (copy_from_user(name, argp, VDUSE_NAME_MAX))
1958 			break;
1959 
1960 		name[VDUSE_NAME_MAX - 1] = '\0';
1961 		ret = vduse_destroy_dev(name);
1962 		break;
1963 	}
1964 	default:
1965 		ret = -EINVAL;
1966 		break;
1967 	}
1968 	mutex_unlock(&vduse_lock);
1969 
1970 	return ret;
1971 }
1972 
1973 static int vduse_release(struct inode *inode, struct file *file)
1974 {
1975 	struct vduse_control *control = file->private_data;
1976 
1977 	kfree(control);
1978 	return 0;
1979 }
1980 
1981 static int vduse_open(struct inode *inode, struct file *file)
1982 {
1983 	struct vduse_control *control;
1984 
1985 	control = kmalloc(sizeof(*control), GFP_KERNEL);
1986 	if (!control)
1987 		return -ENOMEM;
1988 
1989 	control->api_version = VDUSE_API_VERSION;
1990 	file->private_data = control;
1991 
1992 	return 0;
1993 }
1994 
1995 static const struct file_operations vduse_ctrl_fops = {
1996 	.owner		= THIS_MODULE,
1997 	.open		= vduse_open,
1998 	.release	= vduse_release,
1999 	.unlocked_ioctl	= vduse_ioctl,
2000 	.compat_ioctl	= compat_ptr_ioctl,
2001 	.llseek		= noop_llseek,
2002 };
2003 
2004 struct vduse_mgmt_dev {
2005 	struct vdpa_mgmt_dev mgmt_dev;
2006 	struct device dev;
2007 };
2008 
2009 static struct vduse_mgmt_dev *vduse_mgmt;
2010 
2011 static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name)
2012 {
2013 	struct vduse_vdpa *vdev;
2014 
2015 	if (dev->vdev)
2016 		return -EEXIST;
2017 
2018 	vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev,
2019 				 &vduse_vdpa_config_ops, &vduse_map_ops,
2020 				 1, 1, name, true);
2021 	if (IS_ERR(vdev))
2022 		return PTR_ERR(vdev);
2023 
2024 	dev->vdev = vdev;
2025 	vdev->dev = dev;
2026 	vdev->vdpa.mdev = &vduse_mgmt->mgmt_dev;
2027 
2028 	return 0;
2029 }
2030 
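/*
 * "vdpa dev add" handler: bind an existing, fully configured VDUSE device
 * to a new vdpa device.  The IOVA domain is created lazily here (using
 * the current bounce_size) before the vdpa device is registered on the
 * bus.
 */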
2031 static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
2032 			const struct vdpa_dev_set_config *config)
2033 {
2034 	struct vduse_dev *dev;
2035 	int ret;
2036 
2037 	mutex_lock(&vduse_lock);
2038 	dev = vduse_find_dev(name);
2039 	if (!dev || !vduse_dev_is_ready(dev)) {
2040 		mutex_unlock(&vduse_lock);
2041 		return -EINVAL;
2042 	}
2043 	ret = vduse_dev_init_vdpa(dev, name);
2044 	mutex_unlock(&vduse_lock);
2045 	if (ret)
2046 		return ret;
2047 
2048 	mutex_lock(&dev->domain_lock);
2049 	if (!dev->domain)
2050 		dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
2051 						  dev->bounce_size);
2052 	mutex_unlock(&dev->domain_lock);
2053 	if (!dev->domain) {
2054 		put_device(&dev->vdev->vdpa.dev);
2055 		return -ENOMEM;
2056 	}
2057 
2058 	dev->vdev->vdpa.vmap.iova_domain = dev->domain;
2059 	ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num);
2060 	if (ret) {
2061 		put_device(&dev->vdev->vdpa.dev);
2062 		mutex_lock(&dev->domain_lock);
2063 		vduse_domain_destroy(dev->domain);
2064 		dev->domain = NULL;
2065 		mutex_unlock(&dev->domain_lock);
2066 		return ret;
2067 	}
2068 
2069 	return 0;
2070 }
2071 
2072 static void vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
2073 {
2074 	_vdpa_unregister_device(dev);
2075 }
2076 
2077 static const struct vdpa_mgmtdev_ops vdpa_dev_mgmtdev_ops = {
2078 	.dev_add = vdpa_dev_add,
2079 	.dev_del = vdpa_dev_del,
2080 };
2081 
2082 static struct virtio_device_id id_table[] = {
2083 	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
2084 	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
2085 	{ 0 },
2086 };
2087 
2088 static void vduse_mgmtdev_release(struct device *dev)
2089 {
2090 	struct vduse_mgmt_dev *mgmt_dev;
2091 
2092 	mgmt_dev = container_of(dev, struct vduse_mgmt_dev, dev);
2093 	kfree(mgmt_dev);
2094 }
2095 
2096 static int vduse_mgmtdev_init(void)
2097 {
2098 	int ret;
2099 
2100 	vduse_mgmt = kzalloc(sizeof(*vduse_mgmt), GFP_KERNEL);
2101 	if (!vduse_mgmt)
2102 		return -ENOMEM;
2103 
2104 	ret = dev_set_name(&vduse_mgmt->dev, "vduse");
2105 	if (ret) {
2106 		kfree(vduse_mgmt);
2107 		return ret;
2108 	}
2109 
2110 	vduse_mgmt->dev.release = vduse_mgmtdev_release;
2111 
2112 	ret = device_register(&vduse_mgmt->dev);
2113 	if (ret)
2114 		goto dev_reg_err;
2115 
2116 	vduse_mgmt->mgmt_dev.id_table = id_table;
2117 	vduse_mgmt->mgmt_dev.ops = &vdpa_dev_mgmtdev_ops;
2118 	vduse_mgmt->mgmt_dev.device = &vduse_mgmt->dev;
2119 	ret = vdpa_mgmtdev_register(&vduse_mgmt->mgmt_dev);
2120 	if (ret)
2121 		device_unregister(&vduse_mgmt->dev);
2122 
2123 	return ret;
2124 
2125 dev_reg_err:
2126 	put_device(&vduse_mgmt->dev);
2127 	return ret;
2128 }
2129 
2130 static void vduse_mgmtdev_exit(void)
2131 {
2132 	vdpa_mgmtdev_unregister(&vduse_mgmt->mgmt_dev);
2133 	device_unregister(&vduse_mgmt->dev);
2134 }
2135 
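/*
 * Module init: register the "vduse" class and chardev region, create
 * /dev/vduse/control plus the per-device chardev range, the interrupt
 * injection workqueues and the IOVA domain infrastructure, then register
 * the vdpa management device.
 */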
2136 static int vduse_init(void)
2137 {
2138 	int ret;
2139 	struct device *dev;
2140 
2141 	ret = class_register(&vduse_class);
2142 	if (ret)
2143 		return ret;
2144 
2145 	ret = alloc_chrdev_region(&vduse_major, 0, VDUSE_DEV_MAX, "vduse");
2146 	if (ret)
2147 		goto err_chardev_region;
2148 
2149 	/* /dev/vduse/control */
2150 	cdev_init(&vduse_ctrl_cdev, &vduse_ctrl_fops);
2151 	vduse_ctrl_cdev.owner = THIS_MODULE;
2152 	ret = cdev_add(&vduse_ctrl_cdev, vduse_major, 1);
2153 	if (ret)
2154 		goto err_ctrl_cdev;
2155 
2156 	dev = device_create(&vduse_class, NULL, vduse_major, NULL, "control");
2157 	if (IS_ERR(dev)) {
2158 		ret = PTR_ERR(dev);
2159 		goto err_device;
2160 	}
2161 
2162 	/* /dev/vduse/$DEVICE */
2163 	cdev_init(&vduse_cdev, &vduse_dev_fops);
2164 	vduse_cdev.owner = THIS_MODULE;
2165 	ret = cdev_add(&vduse_cdev, MKDEV(MAJOR(vduse_major), 1),
2166 		       VDUSE_DEV_MAX - 1);
2167 	if (ret)
2168 		goto err_cdev;
2169 
2170 	ret = -ENOMEM;
2171 	vduse_irq_wq = alloc_workqueue("vduse-irq",
2172 				WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0);
2173 	if (!vduse_irq_wq)
2174 		goto err_wq;
2175 
2176 	vduse_irq_bound_wq = alloc_workqueue("vduse-irq-bound", WQ_HIGHPRI, 0);
2177 	if (!vduse_irq_bound_wq)
2178 		goto err_bound_wq;
2179 
2180 	ret = vduse_domain_init();
2181 	if (ret)
2182 		goto err_domain;
2183 
2184 	ret = vduse_mgmtdev_init();
2185 	if (ret)
2186 		goto err_mgmtdev;
2187 
2188 	return 0;
2189 err_mgmtdev:
2190 	vduse_domain_exit();
2191 err_domain:
2192 	destroy_workqueue(vduse_irq_bound_wq);
2193 err_bound_wq:
2194 	destroy_workqueue(vduse_irq_wq);
2195 err_wq:
2196 	cdev_del(&vduse_cdev);
2197 err_cdev:
2198 	device_destroy(&vduse_class, vduse_major);
2199 err_device:
2200 	cdev_del(&vduse_ctrl_cdev);
2201 err_ctrl_cdev:
2202 	unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
2203 err_chardev_region:
2204 	class_unregister(&vduse_class);
2205 	return ret;
2206 }
2207 module_init(vduse_init);
2208 
2209 static void vduse_exit(void)
2210 {
2211 	vduse_mgmtdev_exit();
2212 	vduse_domain_exit();
2213 	destroy_workqueue(vduse_irq_bound_wq);
2214 	destroy_workqueue(vduse_irq_wq);
2215 	cdev_del(&vduse_cdev);
2216 	device_destroy(&vduse_class, vduse_major);
2217 	cdev_del(&vduse_ctrl_cdev);
2218 	unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
2219 	class_unregister(&vduse_class);
2220 	idr_destroy(&vduse_idr);
2221 }
2222 module_exit(vduse_exit);
2223 
2224 MODULE_LICENSE(DRV_LICENSE);
2225 MODULE_AUTHOR(DRV_AUTHOR);
2226 MODULE_DESCRIPTION(DRV_DESC);
2227