xref: /linux/drivers/vdpa/vdpa_user/vduse_dev.c (revision ebcff9dacaf2c1418f8bc927388186d7d3674603)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VDUSE: vDPA Device in Userspace
4  *
5  * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved.
6  *
7  * Author: Xie Yongji <xieyongji@bytedance.com>
8  *
9  */
10 
11 #include <linux/virtio_net.h>
12 #include <linux/cleanup.h>
13 #include <linux/init.h>
14 #include <linux/module.h>
15 #include <linux/cdev.h>
16 #include <linux/device.h>
17 #include <linux/eventfd.h>
18 #include <linux/slab.h>
19 #include <linux/wait.h>
20 #include <linux/dma-map-ops.h>
21 #include <linux/poll.h>
22 #include <linux/file.h>
23 #include <linux/uio.h>
24 #include <linux/vdpa.h>
25 #include <linux/nospec.h>
26 #include <linux/virtio.h>
27 #include <linux/vmalloc.h>
28 #include <linux/sched/mm.h>
29 #include <uapi/linux/vduse.h>
30 #include <uapi/linux/vdpa.h>
31 #include <uapi/linux/virtio_config.h>
32 #include <uapi/linux/virtio_ids.h>
33 #include <uapi/linux/virtio_blk.h>
34 #include <uapi/linux/virtio_ring.h>
35 #include <linux/mod_devicetable.h>
36 
37 #include "iova_domain.h"
38 
39 #define DRV_AUTHOR   "Yongji Xie <xieyongji@bytedance.com>"
40 #define DRV_DESC     "vDPA Device in Userspace"
41 #define DRV_LICENSE  "GPL v2"
42 
43 #define VDUSE_DEV_MAX (1U << MINORBITS)
44 #define VDUSE_DEV_MAX_GROUPS 0xffff
45 #define VDUSE_DEV_MAX_AS 0xffff
46 #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
47 #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
48 #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
49 /* 128 MB reserved for virtqueue creation */
50 #define VDUSE_IOVA_SIZE (VDUSE_MAX_BOUNCE_SIZE + 128 * 1024 * 1024)
51 #define VDUSE_MSG_DEFAULT_TIMEOUT 30
52 
53 #define IRQ_UNBOUND -1
54 
55 /*
56  * The VDUSE instance has not asked for the VDUSE API version, so assume 0.
57  *
58  * Old userspace may never ask for the API version and assumes it is 0.  Keep
59  * this value for those instances.  Once the VDUSE instance asks for the
60  * version, convert to the latest supported one and continue the regular flow.
61  */
62 #define VDUSE_API_VERSION_NOT_ASKED U64_MAX
63 
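/*
 * Per-virtqueue state: the addresses and size programmed by the virtio
 * driver, the kick eventfd used to notify the userspace daemon and the
 * callback/trigger used to inject interrupts back into the driver.
 */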
64 struct vduse_virtqueue {
65 	u16 index;
66 	u16 num_max;
67 	u32 num;
68 	u64 desc_addr;
69 	u64 driver_addr;
70 	u64 device_addr;
71 	struct vdpa_vq_state state;
72 	bool ready;
73 	bool kicked;
74 	u32 group;
75 	spinlock_t kick_lock;
76 	spinlock_t irq_lock;
77 	struct eventfd_ctx *kickfd;
78 	struct vdpa_callback cb;
79 	struct work_struct inject;
80 	struct work_struct kick;
81 	int irq_effective_cpu;
82 	struct cpumask irq_affinity;
83 	struct kobject kobj;
84 };
85 
86 struct vduse_dev;
87 
88 struct vduse_vdpa {
89 	struct vdpa_device vdpa;
90 	struct vduse_dev *dev;
91 };
92 
93 struct vduse_umem {
94 	unsigned long iova;
95 	unsigned long npages;
96 	struct page **pages;
97 	struct mm_struct *mm;
98 };
99 
100 struct vduse_as {
101 	struct vduse_iova_domain *domain;
102 	struct vduse_umem *umem;
103 	struct mutex mem_lock;
104 };
105 
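/*
 * A virtqueue group and the address space it is currently bound to.  The
 * binding is updated by the .set_group_asid vDPA op and, when the device has
 * more than one address space, is protected by as_lock.
 */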
106 struct vduse_vq_group {
107 	rwlock_t as_lock;
108 	struct vduse_as *as; /* Protected by as_lock */
109 	struct vduse_dev *dev;
110 };
111 
112 struct vduse_dev {
113 	struct vduse_vdpa *vdev;
114 	struct device *dev;
115 	struct vduse_virtqueue **vqs;
116 	struct vduse_as *as;
117 	char *name;
118 	struct mutex lock;
119 	spinlock_t msg_lock;
120 	u64 msg_unique;
121 	u32 msg_timeout;
122 	wait_queue_head_t waitq;
123 	struct list_head send_list;
124 	struct list_head recv_list;
125 	struct vdpa_callback config_cb;
126 	struct work_struct inject;
127 	spinlock_t irq_lock;
128 	struct rw_semaphore rwsem;
129 	int minor;
130 	bool broken;
131 	bool connected;
132 	u64 api_version;
133 	u64 device_features;
134 	u64 driver_features;
135 	u32 device_id;
136 	u32 vendor_id;
137 	u32 generation;
138 	u32 config_size;
139 	void *config;
140 	u8 status;
141 	u32 vq_num;
142 	u32 vq_align;
143 	u32 ngroups;
144 	u32 nas;
145 	struct vduse_vq_group *groups;
146 	unsigned int bounce_size;
147 	struct mutex domain_lock;
148 };
149 
150 struct vduse_dev_msg {
151 	struct vduse_dev_request req;
152 	struct vduse_dev_response resp;
153 	struct list_head list;
154 	wait_queue_head_t waitq;
155 	bool completed;
156 };
157 
158 struct vduse_control {
159 	u64 api_version;
160 };
161 
162 static DEFINE_MUTEX(vduse_lock);
163 static DEFINE_IDR(vduse_idr);
164 
165 static dev_t vduse_major;
166 static struct cdev vduse_ctrl_cdev;
167 static struct cdev vduse_cdev;
168 static struct workqueue_struct *vduse_irq_wq;
169 static struct workqueue_struct *vduse_irq_bound_wq;
170 
171 static u32 allowed_device_id[] = {
172 	VIRTIO_ID_BLOCK,
173 	VIRTIO_ID_NET,
174 	VIRTIO_ID_FS,
175 };
176 
177 static inline struct vduse_dev *vdpa_to_vduse(struct vdpa_device *vdpa)
178 {
179 	struct vduse_vdpa *vdev = container_of(vdpa, struct vduse_vdpa, vdpa);
180 
181 	return vdev->dev;
182 }
183 
184 static inline struct vduse_dev *dev_to_vduse(struct device *dev)
185 {
186 	struct vdpa_device *vdpa = dev_to_vdpa(dev);
187 
188 	return vdpa_to_vduse(vdpa);
189 }
190 
191 static struct vduse_dev_msg *vduse_find_msg(struct list_head *head,
192 					    uint32_t request_id)
193 {
194 	struct vduse_dev_msg *msg;
195 
196 	list_for_each_entry(msg, head, list) {
197 		if (msg->req.request_id == request_id) {
198 			list_del(&msg->list);
199 			return msg;
200 		}
201 	}
202 
203 	return NULL;
204 }
205 
206 static struct vduse_dev_msg *vduse_dequeue_msg(struct list_head *head)
207 {
208 	struct vduse_dev_msg *msg = NULL;
209 
210 	if (!list_empty(head)) {
211 		msg = list_first_entry(head, struct vduse_dev_msg, list);
212 		list_del(&msg->list);
213 	}
214 
215 	return msg;
216 }
217 
218 static void vduse_enqueue_msg(struct list_head *head,
219 			      struct vduse_dev_msg *msg)
220 {
221 	list_add_tail(&msg->list, head);
222 }
223 
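/*
 * Fail all in-flight messages and mark the device as broken.  Called with
 * dev->msg_lock held.
 */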
224 static void vduse_dev_broken(struct vduse_dev *dev)
225 {
226 	struct vduse_dev_msg *msg, *tmp;
227 
228 	if (unlikely(dev->broken))
229 		return;
230 
231 	list_splice_init(&dev->recv_list, &dev->send_list);
232 	list_for_each_entry_safe(msg, tmp, &dev->send_list, list) {
233 		list_del(&msg->list);
234 		msg->completed = 1;
235 		msg->resp.result = VDUSE_REQ_RESULT_FAILED;
236 		wake_up(&msg->waitq);
237 	}
238 	dev->broken = true;
239 	wake_up(&dev->waitq);
240 }
241 
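/*
 * Queue a request for the userspace daemon and wait for its response.  When
 * msg_timeout is non-zero and expires, the device is marked as broken.
 */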
242 static int vduse_dev_msg_sync(struct vduse_dev *dev,
243 			      struct vduse_dev_msg *msg)
244 {
245 	int ret;
246 
247 	if (unlikely(dev->broken))
248 		return -EIO;
249 
250 	init_waitqueue_head(&msg->waitq);
251 	spin_lock(&dev->msg_lock);
252 	if (unlikely(dev->broken)) {
253 		spin_unlock(&dev->msg_lock);
254 		return -EIO;
255 	}
256 	msg->req.request_id = dev->msg_unique++;
257 	vduse_enqueue_msg(&dev->send_list, msg);
258 	wake_up(&dev->waitq);
259 	spin_unlock(&dev->msg_lock);
260 	if (dev->msg_timeout)
261 		ret = wait_event_killable_timeout(msg->waitq, msg->completed,
262 						  (long)dev->msg_timeout * HZ);
263 	else
264 		ret = wait_event_killable(msg->waitq, msg->completed);
265 
266 	spin_lock(&dev->msg_lock);
267 	if (!msg->completed) {
268 		list_del(&msg->list);
269 		msg->resp.result = VDUSE_REQ_RESULT_FAILED;
270 		/* Mark the device as broken when the request times out */
271 		if (!ret)
272 			vduse_dev_broken(dev);
273 	}
274 	ret = (msg->resp.result == VDUSE_REQ_RESULT_OK) ? 0 : -EIO;
275 	spin_unlock(&dev->msg_lock);
276 
277 	return ret;
278 }
279 
280 static int vduse_dev_get_vq_state_packed(struct vduse_dev *dev,
281 					 struct vduse_virtqueue *vq,
282 					 struct vdpa_vq_state_packed *packed)
283 {
284 	struct vduse_dev_msg msg = { 0 };
285 	int ret;
286 
287 	msg.req.type = VDUSE_GET_VQ_STATE;
288 	msg.req.vq_state.index = vq->index;
289 
290 	ret = vduse_dev_msg_sync(dev, &msg);
291 	if (ret)
292 		return ret;
293 
294 	packed->last_avail_counter =
295 			msg.resp.vq_state.packed.last_avail_counter & 0x0001;
296 	packed->last_avail_idx =
297 			msg.resp.vq_state.packed.last_avail_idx & 0x7FFF;
298 	packed->last_used_counter =
299 			msg.resp.vq_state.packed.last_used_counter & 0x0001;
300 	packed->last_used_idx =
301 			msg.resp.vq_state.packed.last_used_idx & 0x7FFF;
302 
303 	return 0;
304 }
305 
306 static int vduse_dev_get_vq_state_split(struct vduse_dev *dev,
307 					struct vduse_virtqueue *vq,
308 					struct vdpa_vq_state_split *split)
309 {
310 	struct vduse_dev_msg msg = { 0 };
311 	int ret;
312 
313 	msg.req.type = VDUSE_GET_VQ_STATE;
314 	msg.req.vq_state.index = vq->index;
315 
316 	ret = vduse_dev_msg_sync(dev, &msg);
317 	if (ret)
318 		return ret;
319 
320 	split->avail_index = msg.resp.vq_state.split.avail_index;
321 
322 	return 0;
323 }
324 
325 static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
326 {
327 	struct vduse_dev_msg msg = { 0 };
328 
329 	msg.req.type = VDUSE_SET_STATUS;
330 	msg.req.s.status = status;
331 
332 	return vduse_dev_msg_sync(dev, &msg);
333 }
334 
335 static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid,
336 				  u64 start, u64 last)
337 {
338 	struct vduse_dev_msg msg = { 0 };
339 
340 	if (last < start)
341 		return -EINVAL;
342 
343 	msg.req.type = VDUSE_UPDATE_IOTLB;
344 	if (dev->api_version < VDUSE_API_VERSION_1) {
345 		msg.req.iova.start = start;
346 		msg.req.iova.last = last;
347 	} else {
348 		msg.req.iova_v2.start = start;
349 		msg.req.iova_v2.last = last;
350 		msg.req.iova_v2.asid = asid;
351 	}
352 
353 	return vduse_dev_msg_sync(dev, &msg);
354 }
355 
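/*
 * read() on the device fd: hand the next pending request on send_list over to
 * userspace and park it on recv_list until the matching response is written
 * back.
 */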
356 static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
357 {
358 	struct file *file = iocb->ki_filp;
359 	struct vduse_dev *dev = file->private_data;
360 	struct vduse_dev_msg *msg;
361 	int size = sizeof(struct vduse_dev_request);
362 	ssize_t ret;
363 
364 	if (iov_iter_count(to) < size)
365 		return -EINVAL;
366 
367 	spin_lock(&dev->msg_lock);
368 	while (1) {
369 		msg = vduse_dequeue_msg(&dev->send_list);
370 		if (msg)
371 			break;
372 
373 		ret = -EAGAIN;
374 		if (file->f_flags & O_NONBLOCK)
375 			goto unlock;
376 
377 		spin_unlock(&dev->msg_lock);
378 		ret = wait_event_interruptible_exclusive(dev->waitq,
379 					!list_empty(&dev->send_list));
380 		if (ret)
381 			return ret;
382 
383 		spin_lock(&dev->msg_lock);
384 	}
385 	spin_unlock(&dev->msg_lock);
386 	ret = copy_to_iter(&msg->req, size, to);
387 	spin_lock(&dev->msg_lock);
388 	if (ret != size) {
389 		ret = -EFAULT;
390 		vduse_enqueue_msg(&dev->send_list, msg);
391 		goto unlock;
392 	}
393 	vduse_enqueue_msg(&dev->recv_list, msg);
394 unlock:
395 	spin_unlock(&dev->msg_lock);
396 
397 	return ret;
398 }
399 
400 static bool is_mem_zero(const char *ptr, int size)
401 {
402 	int i;
403 
404 	for (i = 0; i < size; i++) {
405 		if (ptr[i])
406 			return false;
407 	}
408 	return true;
409 }
410 
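/*
 * write() on the device fd: match the response against an outstanding request
 * on recv_list by request_id and wake up the waiter.
 */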
411 static ssize_t vduse_dev_write_iter(struct kiocb *iocb, struct iov_iter *from)
412 {
413 	struct file *file = iocb->ki_filp;
414 	struct vduse_dev *dev = file->private_data;
415 	struct vduse_dev_response resp;
416 	struct vduse_dev_msg *msg;
417 	size_t ret;
418 
419 	ret = copy_from_iter(&resp, sizeof(resp), from);
420 	if (ret != sizeof(resp))
421 		return -EINVAL;
422 
423 	if (!is_mem_zero((const char *)resp.reserved, sizeof(resp.reserved)))
424 		return -EINVAL;
425 
426 	spin_lock(&dev->msg_lock);
427 	msg = vduse_find_msg(&dev->recv_list, resp.request_id);
428 	if (!msg) {
429 		ret = -ENOENT;
430 		goto unlock;
431 	}
432 
433 	memcpy(&msg->resp, &resp, sizeof(resp));
434 	msg->completed = 1;
435 	wake_up(&msg->waitq);
436 unlock:
437 	spin_unlock(&dev->msg_lock);
438 
439 	return ret;
440 }
441 
442 static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
443 {
444 	struct vduse_dev *dev = file->private_data;
445 	__poll_t mask = 0;
446 
447 	poll_wait(file, &dev->waitq, wait);
448 
449 	spin_lock(&dev->msg_lock);
450 
451 	if (unlikely(dev->broken))
452 		mask |= EPOLLERR;
453 	if (!list_empty(&dev->send_list))
454 		mask |= EPOLLIN | EPOLLRDNORM;
455 	if (!list_empty(&dev->recv_list))
456 		mask |= EPOLLOUT | EPOLLWRNORM;
457 
458 	spin_unlock(&dev->msg_lock);
459 
460 	return mask;
461 }
462 
463 static void vduse_dev_reset(struct vduse_dev *dev)
464 {
465 	int i;
466 
467 	/* The coherent mappings are handled in vduse_dev_free_coherent() */
468 	for (i = 0; i < dev->nas; i++) {
469 		struct vduse_iova_domain *domain = dev->as[i].domain;
470 
471 		if (domain && domain->bounce_map)
472 			vduse_domain_reset_bounce_map(domain);
473 	}
474 
475 	down_write(&dev->rwsem);
476 
477 	dev->status = 0;
478 	dev->driver_features = 0;
479 	dev->generation++;
480 	spin_lock(&dev->irq_lock);
481 	dev->config_cb.callback = NULL;
482 	dev->config_cb.private = NULL;
483 	spin_unlock(&dev->irq_lock);
484 	flush_work(&dev->inject);
485 
486 	for (i = 0; i < dev->vq_num; i++) {
487 		struct vduse_virtqueue *vq = dev->vqs[i];
488 
489 		vq->ready = false;
490 		vq->desc_addr = 0;
491 		vq->driver_addr = 0;
492 		vq->device_addr = 0;
493 		vq->num = 0;
494 		memset(&vq->state, 0, sizeof(vq->state));
495 
496 		spin_lock(&vq->kick_lock);
497 		vq->kicked = false;
498 		if (vq->kickfd)
499 			eventfd_ctx_put(vq->kickfd);
500 		vq->kickfd = NULL;
501 		spin_unlock(&vq->kick_lock);
502 
503 		spin_lock(&vq->irq_lock);
504 		vq->cb.callback = NULL;
505 		vq->cb.private = NULL;
506 		vq->cb.trigger = NULL;
507 		spin_unlock(&vq->irq_lock);
508 		flush_work(&vq->inject);
509 		flush_work(&vq->kick);
510 	}
511 
512 	up_write(&dev->rwsem);
513 }
514 
515 static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
516 				u64 desc_area, u64 driver_area,
517 				u64 device_area)
518 {
519 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
520 	struct vduse_virtqueue *vq = dev->vqs[idx];
521 
522 	vq->desc_addr = desc_area;
523 	vq->driver_addr = driver_area;
524 	vq->device_addr = device_area;
525 
526 	return 0;
527 }
528 
529 static void vduse_vq_kick(struct vduse_virtqueue *vq)
530 {
531 	spin_lock(&vq->kick_lock);
532 	if (!vq->ready)
533 		goto unlock;
534 
535 	if (vq->kickfd)
536 		eventfd_signal(vq->kickfd);
537 	else
538 		vq->kicked = true;
539 unlock:
540 	spin_unlock(&vq->kick_lock);
541 }
542 
543 static void vduse_vq_kick_work(struct work_struct *work)
544 {
545 	struct vduse_virtqueue *vq = container_of(work,
546 					struct vduse_virtqueue, kick);
547 
548 	vduse_vq_kick(vq);
549 }
550 
551 static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx)
552 {
553 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
554 	struct vduse_virtqueue *vq = dev->vqs[idx];
555 
556 	if (!eventfd_signal_allowed()) {
557 		schedule_work(&vq->kick);
558 		return;
559 	}
560 	vduse_vq_kick(vq);
561 }
562 
563 static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
564 			      struct vdpa_callback *cb)
565 {
566 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
567 	struct vduse_virtqueue *vq = dev->vqs[idx];
568 
569 	spin_lock(&vq->irq_lock);
570 	vq->cb.callback = cb->callback;
571 	vq->cb.private = cb->private;
572 	vq->cb.trigger = cb->trigger;
573 	spin_unlock(&vq->irq_lock);
574 }
575 
576 static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num)
577 {
578 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
579 	struct vduse_virtqueue *vq = dev->vqs[idx];
580 
581 	vq->num = num;
582 }
583 
584 static u16 vduse_vdpa_get_vq_size(struct vdpa_device *vdpa, u16 idx)
585 {
586 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
587 	struct vduse_virtqueue *vq = dev->vqs[idx];
588 
589 	if (vq->num)
590 		return vq->num;
591 	else
592 		return vq->num_max;
593 }
594 
595 static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa,
596 					u16 idx, bool ready)
597 {
598 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
599 	struct vduse_virtqueue *vq = dev->vqs[idx];
600 
601 	vq->ready = ready;
602 }
603 
604 static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx)
605 {
606 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
607 	struct vduse_virtqueue *vq = dev->vqs[idx];
608 
609 	return vq->ready;
610 }
611 
612 static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx,
613 				const struct vdpa_vq_state *state)
614 {
615 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
616 	struct vduse_virtqueue *vq = dev->vqs[idx];
617 
618 	if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
619 		vq->state.packed.last_avail_counter =
620 				state->packed.last_avail_counter;
621 		vq->state.packed.last_avail_idx = state->packed.last_avail_idx;
622 		vq->state.packed.last_used_counter =
623 				state->packed.last_used_counter;
624 		vq->state.packed.last_used_idx = state->packed.last_used_idx;
625 	} else
626 		vq->state.split.avail_index = state->split.avail_index;
627 
628 	return 0;
629 }
630 
631 static u32 vduse_get_vq_group(struct vdpa_device *vdpa, u16 idx)
632 {
633 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
634 
635 	if (dev->api_version < VDUSE_API_VERSION_1)
636 		return 0;
637 
638 	return dev->vqs[idx]->group;
639 }
640 
641 static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx)
642 {
643 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
644 	u32 vq_group = vduse_get_vq_group(vdpa, idx);
645 	union virtio_map ret = {
646 		.group = &dev->groups[vq_group],
647 	};
648 
649 	return ret;
650 }
651 
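/*
 * Scoped guards for a group's as_lock.  The lock only needs to be taken when
 * the device has more than one address space; with a single address space the
 * group-to-AS binding cannot change, so locking is skipped.
 */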
652 DEFINE_GUARD(vq_group_as_read_lock, struct vduse_vq_group *,
653 	if (_T->dev->nas > 1)
654 		read_lock(&_T->as_lock),
655 	if (_T->dev->nas > 1)
656 		read_unlock(&_T->as_lock))
657 
658 DEFINE_GUARD(vq_group_as_write_lock, struct vduse_vq_group *,
659 	if (_T->dev->nas > 1)
660 		write_lock(&_T->as_lock),
661 	if (_T->dev->nas > 1)
662 		write_unlock(&_T->as_lock))
663 
664 static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
665 				unsigned int asid)
666 {
667 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
668 	struct vduse_dev_msg msg = { 0 };
669 	int r;
670 
671 	if (dev->api_version < VDUSE_API_VERSION_1)
672 		return -EINVAL;
673 
674 	msg.req.type = VDUSE_SET_VQ_GROUP_ASID;
675 	msg.req.vq_group_asid.group = group;
676 	msg.req.vq_group_asid.asid = asid;
677 
678 	r = vduse_dev_msg_sync(dev, &msg);
679 	if (r < 0)
680 		return r;
681 
682 	guard(vq_group_as_write_lock)(&dev->groups[group]);
683 	dev->groups[group].as = &dev->as[asid];
684 
685 	return 0;
686 }
687 
688 static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
689 				struct vdpa_vq_state *state)
690 {
691 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
692 	struct vduse_virtqueue *vq = dev->vqs[idx];
693 
694 	if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED))
695 		return vduse_dev_get_vq_state_packed(dev, vq, &state->packed);
696 
697 	return vduse_dev_get_vq_state_split(dev, vq, &state->split);
698 }
699 
700 static u32 vduse_vdpa_get_vq_align(struct vdpa_device *vdpa)
701 {
702 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
703 
704 	return dev->vq_align;
705 }
706 
707 static u64 vduse_vdpa_get_device_features(struct vdpa_device *vdpa)
708 {
709 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
710 
711 	return dev->device_features;
712 }
713 
714 static int vduse_vdpa_set_driver_features(struct vdpa_device *vdpa, u64 features)
715 {
716 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
717 
718 	dev->driver_features = features;
719 	return 0;
720 }
721 
722 static u64 vduse_vdpa_get_driver_features(struct vdpa_device *vdpa)
723 {
724 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
725 
726 	return dev->driver_features;
727 }
728 
729 static void vduse_vdpa_set_config_cb(struct vdpa_device *vdpa,
730 				  struct vdpa_callback *cb)
731 {
732 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
733 
734 	spin_lock(&dev->irq_lock);
735 	dev->config_cb.callback = cb->callback;
736 	dev->config_cb.private = cb->private;
737 	spin_unlock(&dev->irq_lock);
738 }
739 
740 static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
741 {
742 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
743 	u16 num_max = 0;
744 	int i;
745 
746 	for (i = 0; i < dev->vq_num; i++)
747 		if (num_max < dev->vqs[i]->num_max)
748 			num_max = dev->vqs[i]->num_max;
749 
750 	return num_max;
751 }
752 
753 static u32 vduse_vdpa_get_device_id(struct vdpa_device *vdpa)
754 {
755 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
756 
757 	return dev->device_id;
758 }
759 
760 static u32 vduse_vdpa_get_vendor_id(struct vdpa_device *vdpa)
761 {
762 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
763 
764 	return dev->vendor_id;
765 }
766 
767 static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa)
768 {
769 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
770 
771 	return dev->status;
772 }
773 
774 static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status)
775 {
776 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
777 
778 	if (vduse_dev_set_status(dev, status))
779 		return;
780 
781 	dev->status = status;
782 }
783 
784 static size_t vduse_vdpa_get_config_size(struct vdpa_device *vdpa)
785 {
786 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
787 
788 	return dev->config_size;
789 }
790 
791 static void vduse_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset,
792 				  void *buf, unsigned int len)
793 {
794 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
795 
796 	/* Initialize the buffer in case of partial copy. */
797 	memset(buf, 0, len);
798 
799 	if (offset > dev->config_size)
800 		return;
801 
802 	if (len > dev->config_size - offset)
803 		len = dev->config_size - offset;
804 
805 	memcpy(buf, dev->config + offset, len);
806 }
807 
808 static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset,
809 			const void *buf, unsigned int len)
810 {
811 	/* Now we only support read-only configuration space */
812 }
813 
814 static int vduse_vdpa_reset(struct vdpa_device *vdpa)
815 {
816 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
817 	int ret = vduse_dev_set_status(dev, 0);
818 
819 	vduse_dev_reset(dev);
820 
821 	return ret;
822 }
823 
824 static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
825 {
826 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
827 
828 	return dev->generation;
829 }
830 
831 static int vduse_vdpa_set_vq_affinity(struct vdpa_device *vdpa, u16 idx,
832 				      const struct cpumask *cpu_mask)
833 {
834 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
835 
836 	if (cpu_mask)
837 		cpumask_copy(&dev->vqs[idx]->irq_affinity, cpu_mask);
838 	else
839 		cpumask_setall(&dev->vqs[idx]->irq_affinity);
840 
841 	return 0;
842 }
843 
844 static const struct cpumask *
845 vduse_vdpa_get_vq_affinity(struct vdpa_device *vdpa, u16 idx)
846 {
847 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
848 
849 	return &dev->vqs[idx]->irq_affinity;
850 }
851 
852 static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
853 				unsigned int asid,
854 				struct vhost_iotlb *iotlb)
855 {
856 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
857 	int ret;
858 
859 	ret = vduse_domain_set_map(dev->as[asid].domain, iotlb);
860 	if (ret)
861 		return ret;
862 
863 	ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX);
864 	if (ret) {
865 		vduse_domain_clear_map(dev->as[asid].domain, iotlb);
866 		return ret;
867 	}
868 
869 	return 0;
870 }
871 
872 static void vduse_vdpa_free(struct vdpa_device *vdpa)
873 {
874 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
875 
876 	dev->vdev = NULL;
877 }
878 
879 static const struct vdpa_config_ops vduse_vdpa_config_ops = {
880 	.set_vq_address		= vduse_vdpa_set_vq_address,
881 	.kick_vq		= vduse_vdpa_kick_vq,
882 	.set_vq_cb		= vduse_vdpa_set_vq_cb,
883 	.set_vq_num             = vduse_vdpa_set_vq_num,
884 	.get_vq_size		= vduse_vdpa_get_vq_size,
885 	.get_vq_group		= vduse_get_vq_group,
886 	.set_vq_ready		= vduse_vdpa_set_vq_ready,
887 	.get_vq_ready		= vduse_vdpa_get_vq_ready,
888 	.set_vq_state		= vduse_vdpa_set_vq_state,
889 	.get_vq_state		= vduse_vdpa_get_vq_state,
890 	.get_vq_align		= vduse_vdpa_get_vq_align,
891 	.get_device_features	= vduse_vdpa_get_device_features,
892 	.set_driver_features	= vduse_vdpa_set_driver_features,
893 	.get_driver_features	= vduse_vdpa_get_driver_features,
894 	.set_config_cb		= vduse_vdpa_set_config_cb,
895 	.get_vq_num_max		= vduse_vdpa_get_vq_num_max,
896 	.get_device_id		= vduse_vdpa_get_device_id,
897 	.get_vendor_id		= vduse_vdpa_get_vendor_id,
898 	.get_status		= vduse_vdpa_get_status,
899 	.set_status		= vduse_vdpa_set_status,
900 	.get_config_size	= vduse_vdpa_get_config_size,
901 	.get_config		= vduse_vdpa_get_config,
902 	.set_config		= vduse_vdpa_set_config,
903 	.get_generation		= vduse_vdpa_get_generation,
904 	.set_vq_affinity	= vduse_vdpa_set_vq_affinity,
905 	.get_vq_affinity	= vduse_vdpa_get_vq_affinity,
906 	.reset			= vduse_vdpa_reset,
907 	.set_map		= vduse_vdpa_set_map,
908 	.set_group_asid		= vduse_set_group_asid,
909 	.get_vq_map		= vduse_get_vq_map,
910 	.free			= vduse_vdpa_free,
911 };
912 
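/*
 * vduse_map_ops implementation: the virtio_map token carries the virtqueue
 * group, and every operation resolves the group's current address space
 * under the as_lock read guard before acting on its IOVA domain.
 */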
913 static void vduse_dev_sync_single_for_device(union virtio_map token,
914 					     dma_addr_t dma_addr, size_t size,
915 					     enum dma_data_direction dir)
916 {
917 	struct vduse_iova_domain *domain;
918 
919 	if (!token.group)
920 		return;
921 
922 	guard(vq_group_as_read_lock)(token.group);
923 	domain = token.group->as->domain;
924 	vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
925 }
926 
927 static void vduse_dev_sync_single_for_cpu(union virtio_map token,
928 					     dma_addr_t dma_addr, size_t size,
929 					     enum dma_data_direction dir)
930 {
931 	struct vduse_iova_domain *domain;
932 
933 	if (!token.group)
934 		return;
935 
936 	guard(vq_group_as_read_lock)(token.group);
937 	domain = token.group->as->domain;
938 	vduse_domain_sync_single_for_cpu(domain, dma_addr, size, dir);
939 }
940 
941 static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page,
942 				     unsigned long offset, size_t size,
943 				     enum dma_data_direction dir,
944 				     unsigned long attrs)
945 {
946 	struct vduse_iova_domain *domain;
947 
948 	if (!token.group)
949 		return DMA_MAPPING_ERROR;
950 
951 	guard(vq_group_as_read_lock)(token.group);
952 	domain = token.group->as->domain;
953 	return vduse_domain_map_page(domain, page, offset, size, dir, attrs);
954 }
955 
956 static void vduse_dev_unmap_page(union virtio_map token, dma_addr_t dma_addr,
957 				 size_t size, enum dma_data_direction dir,
958 				 unsigned long attrs)
959 {
960 	struct vduse_iova_domain *domain;
961 
962 	if (!token.group)
963 		return;
964 
965 	guard(vq_group_as_read_lock)(token.group);
966 	domain = token.group->as->domain;
967 	vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
968 }
969 
970 static void *vduse_dev_alloc_coherent(union virtio_map token, size_t size,
971 				      dma_addr_t *dma_addr, gfp_t flag)
972 {
973 	void *addr;
974 
975 	*dma_addr = DMA_MAPPING_ERROR;
976 	if (!token.group)
977 		return NULL;
978 
979 	addr = alloc_pages_exact(size, flag);
980 	if (!addr)
981 		return NULL;
982 
983 	{
984 		struct vduse_iova_domain *domain;
985 
986 		guard(vq_group_as_read_lock)(token.group);
987 		domain = token.group->as->domain;
988 		*dma_addr = vduse_domain_alloc_coherent(domain, size, addr);
989 		if (*dma_addr == DMA_MAPPING_ERROR)
990 			goto err;
991 	}
992 
993 	return addr;
994 
995 err:
996 	free_pages_exact(addr, size);
997 	return NULL;
998 }
999 
1000 static void vduse_dev_free_coherent(union virtio_map token, size_t size,
1001 				    void *vaddr, dma_addr_t dma_addr,
1002 				    unsigned long attrs)
1003 {
1004 	if (!token.group)
1005 		return;
1006 
1007 	{
1008 		struct vduse_iova_domain *domain;
1009 
1010 		guard(vq_group_as_read_lock)(token.group);
1011 		domain = token.group->as->domain;
1012 		vduse_domain_free_coherent(domain, size, dma_addr, attrs);
1013 	}
1014 
1015 	free_pages_exact(vaddr, size);
1016 }
1017 
1018 static bool vduse_dev_need_sync(union virtio_map token, dma_addr_t dma_addr)
1019 {
1020 	if (!token.group)
1021 		return false;
1022 
1023 	guard(vq_group_as_read_lock)(token.group);
1024 	return dma_addr < token.group->as->domain->bounce_size;
1025 }
1026 
1027 static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr)
1028 {
1029 	if (unlikely(dma_addr == DMA_MAPPING_ERROR))
1030 		return -ENOMEM;
1031 	return 0;
1032 }
1033 
1034 static size_t vduse_dev_max_mapping_size(union virtio_map token)
1035 {
1036 	if (!token.group)
1037 		return 0;
1038 
1039 	guard(vq_group_as_read_lock)(token.group);
1040 	return token.group->as->domain->bounce_size;
1041 }
1042 
1043 static const struct virtio_map_ops vduse_map_ops = {
1044 	.sync_single_for_device = vduse_dev_sync_single_for_device,
1045 	.sync_single_for_cpu = vduse_dev_sync_single_for_cpu,
1046 	.map_page = vduse_dev_map_page,
1047 	.unmap_page = vduse_dev_unmap_page,
1048 	.alloc = vduse_dev_alloc_coherent,
1049 	.free = vduse_dev_free_coherent,
1050 	.need_sync = vduse_dev_need_sync,
1051 	.mapping_error = vduse_dev_mapping_error,
1052 	.max_mapping_size = vduse_dev_max_mapping_size,
1053 };
1054 
1055 static unsigned int perm_to_file_flags(u8 perm)
1056 {
1057 	unsigned int flags = 0;
1058 
1059 	switch (perm) {
1060 	case VDUSE_ACCESS_WO:
1061 		flags |= O_WRONLY;
1062 		break;
1063 	case VDUSE_ACCESS_RO:
1064 		flags |= O_RDONLY;
1065 		break;
1066 	case VDUSE_ACCESS_RW:
1067 		flags |= O_RDWR;
1068 		break;
1069 	default:
1070 		WARN(1, "invalid vhost IOTLB permission\n");
1071 		break;
1072 	}
1073 
1074 	return flags;
1075 }
1076 
1077 static int vduse_kickfd_setup(struct vduse_dev *dev,
1078 			struct vduse_vq_eventfd *eventfd)
1079 {
1080 	struct eventfd_ctx *ctx = NULL;
1081 	struct vduse_virtqueue *vq;
1082 	u32 index;
1083 
1084 	if (eventfd->index >= dev->vq_num)
1085 		return -EINVAL;
1086 
1087 	index = array_index_nospec(eventfd->index, dev->vq_num);
1088 	vq = dev->vqs[index];
1089 	if (eventfd->fd >= 0) {
1090 		ctx = eventfd_ctx_fdget(eventfd->fd);
1091 		if (IS_ERR(ctx))
1092 			return PTR_ERR(ctx);
1093 	} else if (eventfd->fd != VDUSE_EVENTFD_DEASSIGN)
1094 		return 0;
1095 
1096 	spin_lock(&vq->kick_lock);
1097 	if (vq->kickfd)
1098 		eventfd_ctx_put(vq->kickfd);
1099 	vq->kickfd = ctx;
1100 	if (vq->ready && vq->kicked && vq->kickfd) {
1101 		eventfd_signal(vq->kickfd);
1102 		vq->kicked = false;
1103 	}
1104 	spin_unlock(&vq->kick_lock);
1105 
1106 	return 0;
1107 }
1108 
1109 static bool vduse_dev_is_ready(struct vduse_dev *dev)
1110 {
1111 	int i;
1112 
1113 	for (i = 0; i < dev->vq_num; i++)
1114 		if (!dev->vqs[i]->num_max)
1115 			return false;
1116 
1117 	return true;
1118 }
1119 
1120 static void vduse_dev_irq_inject(struct work_struct *work)
1121 {
1122 	struct vduse_dev *dev = container_of(work, struct vduse_dev, inject);
1123 
1124 	spin_lock_bh(&dev->irq_lock);
1125 	if (dev->config_cb.callback)
1126 		dev->config_cb.callback(dev->config_cb.private);
1127 	spin_unlock_bh(&dev->irq_lock);
1128 }
1129 
1130 static void vduse_vq_irq_inject(struct work_struct *work)
1131 {
1132 	struct vduse_virtqueue *vq = container_of(work,
1133 					struct vduse_virtqueue, inject);
1134 
1135 	spin_lock_bh(&vq->irq_lock);
1136 	if (vq->ready && vq->cb.callback)
1137 		vq->cb.callback(vq->cb.private);
1138 	spin_unlock_bh(&vq->irq_lock);
1139 }
1140 
1141 static bool vduse_vq_signal_irqfd(struct vduse_virtqueue *vq)
1142 {
1143 	bool signal = false;
1144 
1145 	if (!vq->cb.trigger)
1146 		return false;
1147 
1148 	spin_lock_irq(&vq->irq_lock);
1149 	if (vq->ready && vq->cb.trigger) {
1150 		eventfd_signal(vq->cb.trigger);
1151 		signal = true;
1152 	}
1153 	spin_unlock_irq(&vq->irq_lock);
1154 
1155 	return signal;
1156 }
1157 
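/*
 * Queue an interrupt injection work item, either on the unbound workqueue or
 * on the workqueue bound to a specific CPU.  Injection is rejected until the
 * driver has set DRIVER_OK.
 */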
1158 static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
1159 				    struct work_struct *irq_work,
1160 				    int irq_effective_cpu)
1161 {
1162 	int ret = -EINVAL;
1163 
1164 	down_read(&dev->rwsem);
1165 	if (!(dev->status & VIRTIO_CONFIG_S_DRIVER_OK))
1166 		goto unlock;
1167 
1168 	ret = 0;
1169 	if (irq_effective_cpu == IRQ_UNBOUND)
1170 		queue_work(vduse_irq_wq, irq_work);
1171 	else
1172 		queue_work_on(irq_effective_cpu,
1173 			      vduse_irq_bound_wq, irq_work);
1174 unlock:
1175 	up_read(&dev->rwsem);
1176 
1177 	return ret;
1178 }
1179 
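/*
 * Tear down a previously registered userspace bounce region: restore the
 * kernel bounce pages, unpin and dirty the user pages and drop the mm
 * reference taken at registration time.
 */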
1180 static int vduse_dev_dereg_umem(struct vduse_dev *dev, u32 asid,
1181 				u64 iova, u64 size)
1182 {
1183 	int ret;
1184 
1185 	mutex_lock(&dev->as[asid].mem_lock);
1186 	ret = -ENOENT;
1187 	if (!dev->as[asid].umem)
1188 		goto unlock;
1189 
1190 	ret = -EINVAL;
1191 	if (!dev->as[asid].domain)
1192 		goto unlock;
1193 
1194 	if (dev->as[asid].umem->iova != iova ||
1195 	    size != dev->as[asid].domain->bounce_size)
1196 		goto unlock;
1197 
1198 	vduse_domain_remove_user_bounce_pages(dev->as[asid].domain);
1199 	unpin_user_pages_dirty_lock(dev->as[asid].umem->pages,
1200 				    dev->as[asid].umem->npages, true);
1201 	atomic64_sub(dev->as[asid].umem->npages, &dev->as[asid].umem->mm->pinned_vm);
1202 	mmdrop(dev->as[asid].umem->mm);
1203 	vfree(dev->as[asid].umem->pages);
1204 	kfree(dev->as[asid].umem);
1205 	dev->as[asid].umem = NULL;
1206 	ret = 0;
1207 unlock:
1208 	mutex_unlock(&dev->as[asid].mem_lock);
1209 	return ret;
1210 }
1211 
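/*
 * Register userspace memory as the bounce buffer of an address space: the
 * pages are pinned (accounted against RLIMIT_MEMLOCK) and then substituted
 * for the kernel bounce pages of the IOVA domain.
 */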
1212 static int vduse_dev_reg_umem(struct vduse_dev *dev,
1213 			      u32 asid, u64 iova, u64 uaddr, u64 size)
1214 {
1215 	struct page **page_list = NULL;
1216 	struct vduse_umem *umem = NULL;
1217 	long pinned = 0;
1218 	unsigned long npages, lock_limit;
1219 	int ret;
1220 
1221 	if (!dev->as[asid].domain || !dev->as[asid].domain->bounce_map ||
1222 	    size != dev->as[asid].domain->bounce_size ||
1223 	    iova != 0 || uaddr & ~PAGE_MASK)
1224 		return -EINVAL;
1225 
1226 	mutex_lock(&dev->as[asid].mem_lock);
1227 	ret = -EEXIST;
1228 	if (dev->as[asid].umem)
1229 		goto unlock;
1230 
1231 	ret = -ENOMEM;
1232 	npages = size >> PAGE_SHIFT;
1233 	page_list = __vmalloc(array_size(npages, sizeof(struct page *)),
1234 			      GFP_KERNEL_ACCOUNT);
1235 	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
1236 	if (!page_list || !umem)
1237 		goto unlock;
1238 
1239 	mmap_read_lock(current->mm);
1240 
1241 	lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
1242 	if (npages + atomic64_read(&current->mm->pinned_vm) > lock_limit)
1243 		goto out;
1244 
1245 	pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE,
1246 				page_list);
1247 	if (pinned != npages) {
1248 		ret = pinned < 0 ? pinned : -ENOMEM;
1249 		goto out;
1250 	}
1251 
1252 	ret = vduse_domain_add_user_bounce_pages(dev->as[asid].domain,
1253 						 page_list, pinned);
1254 	if (ret)
1255 		goto out;
1256 
1257 	atomic64_add(npages, &current->mm->pinned_vm);
1258 
1259 	umem->pages = page_list;
1260 	umem->npages = pinned;
1261 	umem->iova = iova;
1262 	umem->mm = current->mm;
1263 	mmgrab(current->mm);
1264 
1265 	dev->as[asid].umem = umem;
1266 out:
1267 	if (ret && pinned > 0)
1268 		unpin_user_pages(page_list, pinned);
1269 
1270 	mmap_read_unlock(current->mm);
1271 unlock:
1272 	if (ret) {
1273 		vfree(page_list);
1274 		kfree(umem);
1275 	}
1276 	mutex_unlock(&dev->as[asid].mem_lock);
1277 	return ret;
1278 }
1279 
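/*
 * Advance irq_effective_cpu to the next online CPU in the virtqueue's
 * affinity mask, wrapping around (via IRQ_UNBOUND) when the end of the mask
 * is reached.
 */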
1280 static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)
1281 {
1282 	int curr_cpu = vq->irq_effective_cpu;
1283 
1284 	while (true) {
1285 		curr_cpu = cpumask_next(curr_cpu, &vq->irq_affinity);
1286 		if (cpu_online(curr_cpu))
1287 			break;
1288 
1289 		if (curr_cpu >= nr_cpu_ids)
1290 			curr_cpu = IRQ_UNBOUND;
1291 	}
1292 
1293 	vq->irq_effective_cpu = curr_cpu;
1294 }
1295 
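/*
 * Find the IOTLB mapping that overlaps [start, last] in the requested address
 * space and return its bounds and permissions.  Optionally take a reference
 * on the backing file and report whether the region can back a userspace
 * bounce buffer (VDUSE_IOVA_CAP_UMEM).
 */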
1296 static int vduse_dev_iotlb_entry(struct vduse_dev *dev,
1297 				 struct vduse_iotlb_entry_v2 *entry,
1298 				 struct file **f, uint64_t *capability)
1299 {
1300 	u32 asid;
1301 	int r = -EINVAL;
1302 	struct vhost_iotlb_map *map;
1303 
1304 	if (entry->start > entry->last || entry->asid >= dev->nas)
1305 		return -EINVAL;
1306 
1307 	asid = array_index_nospec(entry->asid, dev->nas);
1308 	mutex_lock(&dev->domain_lock);
1309 
1310 	if (!dev->as[asid].domain)
1311 		goto out;
1312 
1313 	spin_lock(&dev->as[asid].domain->iotlb_lock);
1314 	map = vhost_iotlb_itree_first(dev->as[asid].domain->iotlb,
1315 				      entry->start, entry->last);
1316 	if (map) {
1317 		if (f) {
1318 			const struct vdpa_map_file *map_file;
1319 
1320 			map_file = (struct vdpa_map_file *)map->opaque;
1321 			entry->offset = map_file->offset;
1322 			*f = get_file(map_file->file);
1323 		}
1324 		entry->start = map->start;
1325 		entry->last = map->last;
1326 		entry->perm = map->perm;
1327 		if (capability) {
1328 			*capability = 0;
1329 
1330 			if (dev->as[asid].domain->bounce_map && map->start == 0 &&
1331 			    map->last == dev->as[asid].domain->bounce_size - 1)
1332 				*capability |= VDUSE_IOVA_CAP_UMEM;
1333 		}
1334 
1335 		r = 0;
1336 	}
1337 	spin_unlock(&dev->as[asid].domain->iotlb_lock);
1338 
1339 out:
1340 	mutex_unlock(&dev->domain_lock);
1341 	return r;
1342 }
1343 
1344 static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
1345 			    unsigned long arg)
1346 {
1347 	struct vduse_dev *dev = file->private_data;
1348 	void __user *argp = (void __user *)arg;
1349 	int ret;
1350 
1351 	if (unlikely(dev->broken))
1352 		return -EPERM;
1353 
1354 	switch (cmd) {
1355 	case VDUSE_IOTLB_GET_FD:
1356 	case VDUSE_IOTLB_GET_FD2: {
1357 		struct vduse_iotlb_entry_v2 entry = {0};
1358 		struct file *f = NULL;
1359 
1360 		ret = -ENOIOCTLCMD;
1361 		if (dev->api_version < VDUSE_API_VERSION_1 &&
1362 		    cmd == VDUSE_IOTLB_GET_FD2)
1363 			break;
1364 
1365 		ret = -EFAULT;
1366 		if (copy_from_user(&entry, argp, _IOC_SIZE(cmd)))
1367 			break;
1368 
1369 		ret = -EINVAL;
1370 		if (!is_mem_zero((const char *)entry.reserved,
1371 				 sizeof(entry.reserved)))
1372 			break;
1373 
1374 		ret = vduse_dev_iotlb_entry(dev, &entry, &f, NULL);
1375 		if (ret)
1376 			break;
1377 
1378 		ret = -EINVAL;
1379 		if (!f)
1380 			break;
1381 
1382 		ret = copy_to_user(argp, &entry, _IOC_SIZE(cmd));
1383 		if (ret) {
1384 			ret = -EFAULT;
1385 			fput(f);
1386 			break;
1387 		}
1388 		ret = receive_fd(f, NULL, perm_to_file_flags(entry.perm));
1389 		fput(f);
1390 		break;
1391 	}
1392 	case VDUSE_DEV_GET_FEATURES:
1393 		/*
1394 		 * Just mirror what the driver wrote here.
1395 		 * The driver is expected to check FEATURES_OK later.
1396 		 */
1397 		ret = put_user(dev->driver_features, (u64 __user *)argp);
1398 		break;
1399 	case VDUSE_DEV_SET_CONFIG: {
1400 		struct vduse_config_data config;
1401 		unsigned long size = offsetof(struct vduse_config_data,
1402 					      buffer);
1403 
1404 		ret = -EFAULT;
1405 		if (copy_from_user(&config, argp, size))
1406 			break;
1407 
1408 		ret = -EINVAL;
1409 		if (config.offset > dev->config_size ||
1410 		    config.length == 0 ||
1411 		    config.length > dev->config_size - config.offset)
1412 			break;
1413 
1414 		ret = -EFAULT;
1415 		if (copy_from_user(dev->config + config.offset, argp + size,
1416 				   config.length))
1417 			break;
1418 
1419 		ret = 0;
1420 		break;
1421 	}
1422 	case VDUSE_DEV_INJECT_CONFIG_IRQ:
1423 		ret = vduse_dev_queue_irq_work(dev, &dev->inject, IRQ_UNBOUND);
1424 		break;
1425 	case VDUSE_VQ_SETUP: {
1426 		struct vduse_vq_config config;
1427 		u32 index;
1428 
1429 		ret = -EFAULT;
1430 		if (copy_from_user(&config, argp, sizeof(config)))
1431 			break;
1432 
1433 		ret = -EINVAL;
1434 		if (config.index >= dev->vq_num)
1435 			break;
1436 
1437 		if (dev->api_version < VDUSE_API_VERSION_1) {
1438 			if (config.group)
1439 				break;
1440 		} else {
1441 			if (config.group >= dev->ngroups)
1442 				break;
1443 			if (dev->status & VIRTIO_CONFIG_S_DRIVER_OK)
1444 				break;
1445 		}
1446 
1447 		if (config.reserved1 ||
1448 		    !is_mem_zero((const char *)config.reserved2,
1449 				 sizeof(config.reserved2)))
1450 			break;
1451 
1452 		index = array_index_nospec(config.index, dev->vq_num);
1453 		dev->vqs[index]->num_max = config.max_size;
1454 		dev->vqs[index]->group = config.group;
1455 		ret = 0;
1456 		break;
1457 	}
1458 	case VDUSE_VQ_GET_INFO: {
1459 		struct vduse_vq_info vq_info;
1460 		struct vduse_virtqueue *vq;
1461 		u32 index;
1462 
1463 		ret = -EFAULT;
1464 		if (copy_from_user(&vq_info, argp, sizeof(vq_info)))
1465 			break;
1466 
1467 		ret = -EINVAL;
1468 		if (vq_info.index >= dev->vq_num)
1469 			break;
1470 
1471 		index = array_index_nospec(vq_info.index, dev->vq_num);
1472 		vq = dev->vqs[index];
1473 		vq_info.desc_addr = vq->desc_addr;
1474 		vq_info.driver_addr = vq->driver_addr;
1475 		vq_info.device_addr = vq->device_addr;
1476 		vq_info.num = vq->num;
1477 
1478 		if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
1479 			vq_info.packed.last_avail_counter =
1480 				vq->state.packed.last_avail_counter;
1481 			vq_info.packed.last_avail_idx =
1482 				vq->state.packed.last_avail_idx;
1483 			vq_info.packed.last_used_counter =
1484 				vq->state.packed.last_used_counter;
1485 			vq_info.packed.last_used_idx =
1486 				vq->state.packed.last_used_idx;
1487 		} else
1488 			vq_info.split.avail_index =
1489 				vq->state.split.avail_index;
1490 
1491 		vq_info.ready = vq->ready;
1492 
1493 		ret = -EFAULT;
1494 		if (copy_to_user(argp, &vq_info, sizeof(vq_info)))
1495 			break;
1496 
1497 		ret = 0;
1498 		break;
1499 	}
1500 	case VDUSE_VQ_SETUP_KICKFD: {
1501 		struct vduse_vq_eventfd eventfd;
1502 
1503 		ret = -EFAULT;
1504 		if (copy_from_user(&eventfd, argp, sizeof(eventfd)))
1505 			break;
1506 
1507 		ret = vduse_kickfd_setup(dev, &eventfd);
1508 		break;
1509 	}
1510 	case VDUSE_VQ_INJECT_IRQ: {
1511 		u32 index;
1512 
1513 		ret = -EFAULT;
1514 		if (get_user(index, (u32 __user *)argp))
1515 			break;
1516 
1517 		ret = -EINVAL;
1518 		if (index >= dev->vq_num)
1519 			break;
1520 
1521 		ret = 0;
1522 		index = array_index_nospec(index, dev->vq_num);
1523 		if (!vduse_vq_signal_irqfd(dev->vqs[index])) {
1524 			vduse_vq_update_effective_cpu(dev->vqs[index]);
1525 			ret = vduse_dev_queue_irq_work(dev,
1526 						&dev->vqs[index]->inject,
1527 						dev->vqs[index]->irq_effective_cpu);
1528 		}
1529 		break;
1530 	}
1531 	case VDUSE_IOTLB_REG_UMEM: {
1532 		struct vduse_iova_umem umem;
1533 		u32 asid;
1534 
1535 		ret = -EFAULT;
1536 		if (copy_from_user(&umem, argp, sizeof(umem)))
1537 			break;
1538 
1539 		ret = -EINVAL;
1540 		if (!is_mem_zero((const char *)umem.reserved,
1541 				 sizeof(umem.reserved)) ||
1542 		    (dev->api_version < VDUSE_API_VERSION_1 &&
1543 		     umem.asid != 0) || umem.asid >= dev->nas)
1544 			break;
1545 
1546 		mutex_lock(&dev->domain_lock);
1547 		asid = array_index_nospec(umem.asid, dev->nas);
1548 		ret = vduse_dev_reg_umem(dev, asid, umem.iova,
1549 					 umem.uaddr, umem.size);
1550 		mutex_unlock(&dev->domain_lock);
1551 		break;
1552 	}
1553 	case VDUSE_IOTLB_DEREG_UMEM: {
1554 		struct vduse_iova_umem umem;
1555 		u32 asid;
1556 
1557 		ret = -EFAULT;
1558 		if (copy_from_user(&umem, argp, sizeof(umem)))
1559 			break;
1560 
1561 		ret = -EINVAL;
1562 		if (!is_mem_zero((const char *)umem.reserved,
1563 				 sizeof(umem.reserved)) ||
1564 		    (dev->api_version < VDUSE_API_VERSION_1 &&
1565 		     umem.asid != 0) ||
1566 		     umem.asid >= dev->nas)
1567 			break;
1568 
1569 		mutex_lock(&dev->domain_lock);
1570 		asid = array_index_nospec(umem.asid, dev->nas);
1571 		ret = vduse_dev_dereg_umem(dev, asid, umem.iova,
1572 					   umem.size);
1573 		mutex_unlock(&dev->domain_lock);
1574 		break;
1575 	}
1576 	case VDUSE_IOTLB_GET_INFO: {
1577 		struct vduse_iova_info info;
1578 		struct vduse_iotlb_entry_v2 entry;
1579 
1580 		ret = -EFAULT;
1581 		if (copy_from_user(&info, argp, sizeof(info)))
1582 			break;
1583 
1584 		if (!is_mem_zero((const char *)info.reserved,
1585 				 sizeof(info.reserved)))
1586 			break;
1587 
1588 		if (dev->api_version < VDUSE_API_VERSION_1) {
1589 			if (info.asid)
1590 				break;
1591 		} else if (info.asid >= dev->nas)
1592 			break;
1593 
1594 		entry.start = info.start;
1595 		entry.last = info.last;
1596 		entry.asid = info.asid;
1597 		ret = vduse_dev_iotlb_entry(dev, &entry, NULL,
1598 					    &info.capability);
1599 		if (ret < 0)
1600 			break;
1601 
1602 		info.start = entry.start;
1603 		info.last = entry.last;
1604 		info.asid = entry.asid;
1605 
1606 		ret = -EFAULT;
1607 		if (copy_to_user(argp, &info, sizeof(info)))
1608 			break;
1609 
1610 		ret = 0;
1611 		break;
1612 	}
1613 	default:
1614 		ret = -ENOIOCTLCMD;
1615 		break;
1616 	}
1617 
1618 	return ret;
1619 }
1620 
1621 static int vduse_dev_release(struct inode *inode, struct file *file)
1622 {
1623 	struct vduse_dev *dev = file->private_data;
1624 
1625 	mutex_lock(&dev->domain_lock);
1626 	for (int i = 0; i < dev->nas; i++)
1627 		if (dev->as[i].domain)
1628 			vduse_dev_dereg_umem(dev, i, 0,
1629 					     dev->as[i].domain->bounce_size);
1630 	mutex_unlock(&dev->domain_lock);
1631 	spin_lock(&dev->msg_lock);
1632 	/* Make sure the inflight messages can be processed after reconnection */
1633 	list_splice_init(&dev->recv_list, &dev->send_list);
1634 	spin_unlock(&dev->msg_lock);
1635 	dev->connected = false;
1636 
1637 	return 0;
1638 }
1639 
1640 static struct vduse_dev *vduse_dev_get_from_minor(int minor)
1641 {
1642 	struct vduse_dev *dev;
1643 
1644 	mutex_lock(&vduse_lock);
1645 	dev = idr_find(&vduse_idr, minor);
1646 	mutex_unlock(&vduse_lock);
1647 
1648 	return dev;
1649 }
1650 
1651 static int vduse_dev_open(struct inode *inode, struct file *file)
1652 {
1653 	int ret;
1654 	struct vduse_dev *dev = vduse_dev_get_from_minor(iminor(inode));
1655 
1656 	if (!dev)
1657 		return -ENODEV;
1658 
1659 	ret = -EBUSY;
1660 	mutex_lock(&dev->lock);
1661 	if (dev->connected)
1662 		goto unlock;
1663 
1664 	ret = 0;
1665 	dev->connected = true;
1666 	file->private_data = dev;
1667 unlock:
1668 	mutex_unlock(&dev->lock);
1669 
1670 	return ret;
1671 }
1672 
1673 static const struct file_operations vduse_dev_fops = {
1674 	.owner		= THIS_MODULE,
1675 	.open		= vduse_dev_open,
1676 	.release	= vduse_dev_release,
1677 	.read_iter	= vduse_dev_read_iter,
1678 	.write_iter	= vduse_dev_write_iter,
1679 	.poll		= vduse_dev_poll,
1680 	.unlocked_ioctl	= vduse_dev_ioctl,
1681 	.compat_ioctl	= compat_ptr_ioctl,
1682 	.llseek		= noop_llseek,
1683 };
1684 
1685 static ssize_t irq_cb_affinity_show(struct vduse_virtqueue *vq, char *buf)
1686 {
1687 	return sprintf(buf, "%*pb\n", cpumask_pr_args(&vq->irq_affinity));
1688 }
1689 
1690 static ssize_t irq_cb_affinity_store(struct vduse_virtqueue *vq,
1691 				     const char *buf, size_t count)
1692 {
1693 	cpumask_var_t new_value;
1694 	int ret;
1695 
1696 	if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
1697 		return -ENOMEM;
1698 
1699 	ret = cpumask_parse(buf, new_value);
1700 	if (ret)
1701 		goto free_mask;
1702 
1703 	ret = -EINVAL;
1704 	if (!cpumask_intersects(new_value, cpu_online_mask))
1705 		goto free_mask;
1706 
1707 	cpumask_copy(&vq->irq_affinity, new_value);
1708 	ret = count;
1709 free_mask:
1710 	free_cpumask_var(new_value);
1711 	return ret;
1712 }
1713 
1714 struct vq_sysfs_entry {
1715 	struct attribute attr;
1716 	ssize_t (*show)(struct vduse_virtqueue *vq, char *buf);
1717 	ssize_t (*store)(struct vduse_virtqueue *vq, const char *buf,
1718 			 size_t count);
1719 };
1720 
1721 static struct vq_sysfs_entry irq_cb_affinity_attr = __ATTR_RW(irq_cb_affinity);
1722 
1723 static struct attribute *vq_attrs[] = {
1724 	&irq_cb_affinity_attr.attr,
1725 	NULL,
1726 };
1727 ATTRIBUTE_GROUPS(vq);
1728 
1729 static ssize_t vq_attr_show(struct kobject *kobj, struct attribute *attr,
1730 			    char *buf)
1731 {
1732 	struct vduse_virtqueue *vq = container_of(kobj,
1733 					struct vduse_virtqueue, kobj);
1734 	struct vq_sysfs_entry *entry = container_of(attr,
1735 					struct vq_sysfs_entry, attr);
1736 
1737 	if (!entry->show)
1738 		return -EIO;
1739 
1740 	return entry->show(vq, buf);
1741 }
1742 
1743 static ssize_t vq_attr_store(struct kobject *kobj, struct attribute *attr,
1744 			     const char *buf, size_t count)
1745 {
1746 	struct vduse_virtqueue *vq = container_of(kobj,
1747 					struct vduse_virtqueue, kobj);
1748 	struct vq_sysfs_entry *entry = container_of(attr,
1749 					struct vq_sysfs_entry, attr);
1750 
1751 	if (!entry->store)
1752 		return -EIO;
1753 
1754 	return entry->store(vq, buf, count);
1755 }
1756 
1757 static const struct sysfs_ops vq_sysfs_ops = {
1758 	.show = vq_attr_show,
1759 	.store = vq_attr_store,
1760 };
1761 
1762 static void vq_release(struct kobject *kobj)
1763 {
1764 	struct vduse_virtqueue *vq = container_of(kobj,
1765 					struct vduse_virtqueue, kobj);
1766 	kfree(vq);
1767 }
1768 
1769 static const struct kobj_type vq_type = {
1770 	.release	= vq_release,
1771 	.sysfs_ops	= &vq_sysfs_ops,
1772 	.default_groups	= vq_groups,
1773 };
1774 
1775 static char *vduse_devnode(const struct device *dev, umode_t *mode)
1776 {
1777 	return kasprintf(GFP_KERNEL, "vduse/%s", dev_name(dev));
1778 }
1779 
1780 static const struct class vduse_class = {
1781 	.name = "vduse",
1782 	.devnode = vduse_devnode,
1783 };
1784 
1785 static void vduse_dev_deinit_vqs(struct vduse_dev *dev)
1786 {
1787 	int i;
1788 
1789 	if (!dev->vqs)
1790 		return;
1791 
1792 	for (i = 0; i < dev->vq_num; i++)
1793 		kobject_put(&dev->vqs[i]->kobj);
1794 	kfree(dev->vqs);
1795 }
1796 
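/*
 * Allocate the virtqueue array and register a "vqN" kobject for each queue
 * under the device so that the irq_cb_affinity attribute is available in
 * sysfs.
 */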
1797 static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
1798 {
1799 	int ret, i;
1800 
1801 	dev->vq_align = vq_align;
1802 	dev->vq_num = vq_num;
1803 	dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL);
1804 	if (!dev->vqs)
1805 		return -ENOMEM;
1806 
1807 	for (i = 0; i < vq_num; i++) {
1808 		dev->vqs[i] = kzalloc(sizeof(*dev->vqs[i]), GFP_KERNEL);
1809 		if (!dev->vqs[i]) {
1810 			ret = -ENOMEM;
1811 			goto err;
1812 		}
1813 
1814 		dev->vqs[i]->index = i;
1815 		dev->vqs[i]->irq_effective_cpu = IRQ_UNBOUND;
1816 		INIT_WORK(&dev->vqs[i]->inject, vduse_vq_irq_inject);
1817 		INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work);
1818 		spin_lock_init(&dev->vqs[i]->kick_lock);
1819 		spin_lock_init(&dev->vqs[i]->irq_lock);
1820 		cpumask_setall(&dev->vqs[i]->irq_affinity);
1821 
1822 		kobject_init(&dev->vqs[i]->kobj, &vq_type);
1823 		ret = kobject_add(&dev->vqs[i]->kobj,
1824 				  &dev->dev->kobj, "vq%d", i);
1825 		if (ret) {
1826 			kfree(dev->vqs[i]);
1827 			goto err;
1828 		}
1829 	}
1830 
1831 	return 0;
1832 err:
1833 	while (i--)
1834 		kobject_put(&dev->vqs[i]->kobj);
1835 	kfree(dev->vqs);
1836 	dev->vqs = NULL;
1837 	return ret;
1838 }
1839 
1840 static struct vduse_dev *vduse_dev_create(void)
1841 {
1842 	struct vduse_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);
1843 
1844 	if (!dev)
1845 		return NULL;
1846 
1847 	mutex_init(&dev->lock);
1848 	mutex_init(&dev->domain_lock);
1849 	spin_lock_init(&dev->msg_lock);
1850 	INIT_LIST_HEAD(&dev->send_list);
1851 	INIT_LIST_HEAD(&dev->recv_list);
1852 	spin_lock_init(&dev->irq_lock);
1853 	init_rwsem(&dev->rwsem);
1854 
1855 	INIT_WORK(&dev->inject, vduse_dev_irq_inject);
1856 	init_waitqueue_head(&dev->waitq);
1857 
1858 	return dev;
1859 }
1860 
1861 static void vduse_dev_destroy(struct vduse_dev *dev)
1862 {
1863 	kfree(dev);
1864 }
1865 
1866 static struct vduse_dev *vduse_find_dev(const char *name)
1867 {
1868 	struct vduse_dev *dev;
1869 	int id;
1870 
1871 	idr_for_each_entry(&vduse_idr, dev, id)
1872 		if (!strcmp(dev->name, name))
1873 			return dev;
1874 
1875 	return NULL;
1876 }
1877 
1878 static int vduse_destroy_dev(char *name)
1879 {
1880 	struct vduse_dev *dev = vduse_find_dev(name);
1881 
1882 	if (!dev)
1883 		return -EINVAL;
1884 
1885 	mutex_lock(&dev->lock);
1886 	if (dev->vdev || dev->connected) {
1887 		mutex_unlock(&dev->lock);
1888 		return -EBUSY;
1889 	}
1890 	dev->connected = true;
1891 	mutex_unlock(&dev->lock);
1892 
1893 	vduse_dev_reset(dev);
1894 	device_destroy(&vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
1895 	idr_remove(&vduse_idr, dev->minor);
1896 	kvfree(dev->config);
1897 	vduse_dev_deinit_vqs(dev);
1898 	for (int i = 0; i < dev->nas; i++) {
1899 		if (dev->as[i].domain)
1900 			vduse_domain_destroy(dev->as[i].domain);
1901 	}
1902 	kfree(dev->as);
1903 	kfree(dev->name);
1904 	kfree(dev->groups);
1905 	vduse_dev_destroy(dev);
1906 	module_put(THIS_MODULE);
1907 
1908 	return 0;
1909 }
1910 
1911 static bool device_is_allowed(u32 device_id)
1912 {
1913 	int i;
1914 
1915 	for (i = 0; i < ARRAY_SIZE(allowed_device_id); i++)
1916 		if (allowed_device_id[i] == device_id)
1917 			return true;
1918 
1919 	return false;
1920 }
1921 
1922 static bool features_is_valid(struct vduse_dev_config *config)
1923 {
1924 	if (!(config->features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
1925 		return false;
1926 
1927 	/* Now we only support read-only configuration space */
1928 	if ((config->device_id == VIRTIO_ID_BLOCK) &&
1929 			(config->features & BIT_ULL(VIRTIO_BLK_F_CONFIG_WCE)))
1930 		return false;
1931 	else if ((config->device_id == VIRTIO_ID_NET) &&
1932 			(config->features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
1933 		return false;
1934 
1935 	if ((config->device_id == VIRTIO_ID_NET) &&
1936 			!(config->features & BIT_ULL(VIRTIO_F_VERSION_1)))
1937 		return false;
1938 
1939 	return true;
1940 }
1941 
1942 static bool vduse_validate_config(struct vduse_dev_config *config,
1943 				  u64 api_version)
1944 {
1945 	if (!is_mem_zero((const char *)config->reserved,
1946 			 sizeof(config->reserved)))
1947 		return false;
1948 
1949 	if (api_version < VDUSE_API_VERSION_1 &&
1950 	    (config->ngroups || config->nas))
1951 		return false;
1952 
1953 	if (api_version >= VDUSE_API_VERSION_1) {
1954 		if (!config->ngroups || config->ngroups > VDUSE_DEV_MAX_GROUPS)
1955 			return false;
1956 
1957 		if (!config->nas || config->nas > VDUSE_DEV_MAX_AS)
1958 			return false;
1959 	}
1960 
1961 	if (config->vq_align > PAGE_SIZE)
1962 		return false;
1963 
1964 	if (config->config_size > PAGE_SIZE)
1965 		return false;
1966 
1967 	if (config->vq_num > 0xffff)
1968 		return false;
1969 
1970 	if (!config->name[0])
1971 		return false;
1972 
1973 	if (!device_is_allowed(config->device_id))
1974 		return false;
1975 
1976 	if (!features_is_valid(config))
1977 		return false;
1978 
1979 	return true;
1980 }
1981 
1982 static ssize_t msg_timeout_show(struct device *device,
1983 				struct device_attribute *attr, char *buf)
1984 {
1985 	struct vduse_dev *dev = dev_get_drvdata(device);
1986 
1987 	return sysfs_emit(buf, "%u\n", dev->msg_timeout);
1988 }
1989 
1990 static ssize_t msg_timeout_store(struct device *device,
1991 				 struct device_attribute *attr,
1992 				 const char *buf, size_t count)
1993 {
1994 	struct vduse_dev *dev = dev_get_drvdata(device);
1995 	int ret;
1996 
1997 	ret = kstrtouint(buf, 10, &dev->msg_timeout);
1998 	if (ret < 0)
1999 		return ret;
2000 
2001 	return count;
2002 }
2003 
2004 static DEVICE_ATTR_RW(msg_timeout);
2005 
2006 static ssize_t bounce_size_show(struct device *device,
2007 				struct device_attribute *attr, char *buf)
2008 {
2009 	struct vduse_dev *dev = dev_get_drvdata(device);
2010 
2011 	return sysfs_emit(buf, "%u\n", dev->bounce_size);
2012 }
2013 
2014 static ssize_t bounce_size_store(struct device *device,
2015 				 struct device_attribute *attr,
2016 				 const char *buf, size_t count)
2017 {
2018 	struct vduse_dev *dev = dev_get_drvdata(device);
2019 	unsigned int bounce_size;
2020 	int ret;
2021 
2022 	ret = -EPERM;
2023 	mutex_lock(&dev->domain_lock);
2024 	/* Assuming that if the first domain is allocated, all are allocated */
2025 	if (dev->as[0].domain)
2026 		goto unlock;
2027 
2028 	ret = kstrtouint(buf, 10, &bounce_size);
2029 	if (ret < 0)
2030 		goto unlock;
2031 
2032 	ret = -EINVAL;
2033 	if (bounce_size > VDUSE_MAX_BOUNCE_SIZE ||
2034 	    bounce_size < VDUSE_MIN_BOUNCE_SIZE)
2035 		goto unlock;
2036 
2037 	dev->bounce_size = bounce_size & PAGE_MASK;
2038 	ret = count;
2039 unlock:
2040 	mutex_unlock(&dev->domain_lock);
2041 	return ret;
2042 }
2043 
2044 static DEVICE_ATTR_RW(bounce_size);
2045 
2046 static struct attribute *vduse_dev_attrs[] = {
2047 	&dev_attr_msg_timeout.attr,
2048 	&dev_attr_bounce_size.attr,
2049 	NULL
2050 };
2051 
2052 ATTRIBUTE_GROUPS(vduse_dev);
2053 
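/*
 * Create a VDUSE device instance: allocate the per-address-space and
 * per-group state, reserve a minor in the IDR and create the character
 * device node that the userspace daemon will open.
 */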
2054 static int vduse_create_dev(struct vduse_dev_config *config,
2055 			    void *config_buf, u64 api_version)
2056 {
2057 	int ret;
2058 	struct vduse_dev *dev;
2059 
2060 	ret = -EPERM;
2061 	if ((config->device_id == VIRTIO_ID_NET) && !capable(CAP_NET_ADMIN))
2062 		goto err;
2063 
2064 	ret = -EEXIST;
2065 	if (vduse_find_dev(config->name))
2066 		goto err;
2067 
2068 	ret = -ENOMEM;
2069 	dev = vduse_dev_create();
2070 	if (!dev)
2071 		goto err;
2072 
2073 	dev->api_version = api_version;
2074 	dev->device_features = config->features;
2075 	dev->device_id = config->device_id;
2076 	dev->vendor_id = config->vendor_id;
2077 
2078 	dev->nas = (dev->api_version < VDUSE_API_VERSION_1) ? 1 : config->nas;
2079 	dev->as = kcalloc(dev->nas, sizeof(dev->as[0]), GFP_KERNEL);
2080 	if (!dev->as)
2081 		goto err_as;
2082 	for (int i = 0; i < dev->nas; i++)
2083 		mutex_init(&dev->as[i].mem_lock);
2084 
2085 	dev->ngroups = (dev->api_version < VDUSE_API_VERSION_1)
2086 		       ? 1
2087 		       : config->ngroups;
2088 	dev->groups = kcalloc(dev->ngroups, sizeof(dev->groups[0]),
2089 			      GFP_KERNEL);
2090 	if (!dev->groups)
2091 		goto err_vq_groups;
2092 	for (u32 i = 0; i < dev->ngroups; ++i) {
2093 		dev->groups[i].dev = dev;
2094 		rwlock_init(&dev->groups[i].as_lock);
2095 		dev->groups[i].as = &dev->as[0];
2096 	}
2097 
2098 	dev->name = kstrdup(config->name, GFP_KERNEL);
2099 	if (!dev->name)
2100 		goto err_str;
2101 
2102 	dev->bounce_size = VDUSE_BOUNCE_SIZE;
2103 	dev->config = config_buf;
2104 	dev->config_size = config->config_size;
2105 
2106 	ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL);
2107 	if (ret < 0)
2108 		goto err_idr;
2109 
2110 	dev->minor = ret;
2111 	dev->msg_timeout = VDUSE_MSG_DEFAULT_TIMEOUT;
2112 	dev->dev = device_create_with_groups(&vduse_class, NULL,
2113 				MKDEV(MAJOR(vduse_major), dev->minor),
2114 				dev, vduse_dev_groups, "%s", config->name);
2115 	if (IS_ERR(dev->dev)) {
2116 		ret = PTR_ERR(dev->dev);
2117 		goto err_dev;
2118 	}
2119 
2120 	ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num);
2121 	if (ret)
2122 		goto err_vqs;
2123 
2124 	__module_get(THIS_MODULE);
2125 
2126 	return 0;
2127 err_vqs:
2128 	device_destroy(&vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
2129 err_dev:
2130 	idr_remove(&vduse_idr, dev->minor);
2131 err_idr:
2132 	kfree(dev->name);
2133 err_str:
2134 	kfree(dev->groups);
2135 err_vq_groups:
2136 	kfree(dev->as);
2137 err_as:
2138 	vduse_dev_destroy(dev);
2139 err:
2140 	return ret;
2141 }
2142 
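/*
 * ioctls on /dev/vduse/control.  VDUSE_GET_API_VERSION returns the version
 * negotiated on this file descriptor, defaulting to VDUSE_API_VERSION_1 if
 * the application queries before setting one; VDUSE_SET_API_VERSION accepts
 * anything up to VDUSE_API_VERSION_1; VDUSE_CREATE_DEV and VDUSE_DESTROY_DEV
 * manage device instances.  An application that creates a device without
 * negotiating first keeps the original VDUSE_API_VERSION semantics.
 *
 * Rough userspace sketch (illustrative only; cfg_size stands for the size of
 * the device config space, error handling omitted):
 *
 *	int fd = open("/dev/vduse/control", O_RDWR);
 *	uint64_t version = VDUSE_API_VERSION_1;
 *
 *	ioctl(fd, VDUSE_SET_API_VERSION, &version);
 *
 *	struct vduse_dev_config *cfg = calloc(1, sizeof(*cfg) + cfg_size);
 *	strcpy(cfg->name, "vduse-blk0");
 *	cfg->device_id = VIRTIO_ID_BLOCK;
 *	cfg->config_size = cfg_size;
 *	... fill features, vq_num, vq_align and the config space ...
 *	ioctl(fd, VDUSE_CREATE_DEV, cfg);
 */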
2143 static long vduse_ioctl(struct file *file, unsigned int cmd,
2144 			unsigned long arg)
2145 {
2146 	int ret;
2147 	void __user *argp = (void __user *)arg;
2148 	struct vduse_control *control = file->private_data;
2149 
2150 	mutex_lock(&vduse_lock);
2151 	switch (cmd) {
2152 	case VDUSE_GET_API_VERSION:
2153 		if (control->api_version == VDUSE_API_VERSION_NOT_ASKED)
2154 			control->api_version = VDUSE_API_VERSION_1;
2155 		ret = put_user(control->api_version, (u64 __user *)argp);
2156 		break;
2157 	case VDUSE_SET_API_VERSION: {
2158 		u64 api_version;
2159 
2160 		ret = -EFAULT;
2161 		if (get_user(api_version, (u64 __user *)argp))
2162 			break;
2163 
2164 		ret = -EINVAL;
2165 		if (api_version > VDUSE_API_VERSION_1)
2166 			break;
2167 
2168 		ret = 0;
2169 		control->api_version = api_version;
2170 		break;
2171 	}
2172 	case VDUSE_CREATE_DEV: {
2173 		struct vduse_dev_config config;
2174 		unsigned long size = offsetof(struct vduse_dev_config, config);
2175 		void *buf;
2176 
2177 		ret = -EFAULT;
2178 		if (copy_from_user(&config, argp, size))
2179 			break;
2180 
2181 		ret = -EINVAL;
2182 		if (control->api_version == VDUSE_API_VERSION_NOT_ASKED)
2183 			control->api_version = VDUSE_API_VERSION;
2184 		if (!vduse_validate_config(&config, control->api_version))
2185 			break;
2186 
2187 		buf = vmemdup_user(argp + size, config.config_size);
2188 		if (IS_ERR(buf)) {
2189 			ret = PTR_ERR(buf);
2190 			break;
2191 		}
2192 		config.name[VDUSE_NAME_MAX - 1] = '\0';
2193 		ret = vduse_create_dev(&config, buf, control->api_version);
2194 		if (ret)
2195 			kvfree(buf);
2196 		break;
2197 	}
2198 	case VDUSE_DESTROY_DEV: {
2199 		char name[VDUSE_NAME_MAX];
2200 
2201 		ret = -EFAULT;
2202 		if (copy_from_user(name, argp, VDUSE_NAME_MAX))
2203 			break;
2204 
2205 		name[VDUSE_NAME_MAX - 1] = '\0';
2206 		ret = vduse_destroy_dev(name);
2207 		break;
2208 	}
2209 	default:
2210 		ret = -EINVAL;
2211 		break;
2212 	}
2213 	mutex_unlock(&vduse_lock);
2214 
2215 	return ret;
2216 }
2217 
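/*
 * Each open of /dev/vduse/control gets its own vduse_control tracking the API
 * version negotiated on that file descriptor; it starts out as
 * VDUSE_API_VERSION_NOT_ASKED until the application queries or sets it.
 */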
2218 static int vduse_release(struct inode *inode, struct file *file)
2219 {
2220 	struct vduse_control *control = file->private_data;
2221 
2222 	kfree(control);
2223 	return 0;
2224 }
2225 
2226 static int vduse_open(struct inode *inode, struct file *file)
2227 {
2228 	struct vduse_control *control;
2229 
2230 	control = kmalloc(sizeof(struct vduse_control), GFP_KERNEL);
2231 	if (!control)
2232 		return -ENOMEM;
2233 
2234 	control->api_version = VDUSE_API_VERSION_NOT_ASKED;
2235 	file->private_data = control;
2236 
2237 	return 0;
2238 }
2239 
2240 static const struct file_operations vduse_ctrl_fops = {
2241 	.owner		= THIS_MODULE,
2242 	.open		= vduse_open,
2243 	.release	= vduse_release,
2244 	.unlocked_ioctl	= vduse_ioctl,
2245 	.compat_ioctl	= compat_ptr_ioctl,
2246 	.llseek		= noop_llseek,
2247 };
2248 
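/* The single "vduse" management device registered with the vDPA core. */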
2249 struct vduse_mgmt_dev {
2250 	struct vdpa_mgmt_dev mgmt_dev;
2251 	struct device dev;
2252 };
2253 
2254 static struct vduse_mgmt_dev *vduse_mgmt;
2255 
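/*
 * Bind a vDPA device to this VDUSE instance (at most one per instance).  The
 * vDPA device inherits the number of virtqueue groups and address spaces
 * chosen when the instance was created.
 */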
2256 static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name)
2257 {
2258 	struct vduse_vdpa *vdev;
2259 
2260 	if (dev->vdev)
2261 		return -EEXIST;
2262 
2263 	vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev,
2264 				 &vduse_vdpa_config_ops, &vduse_map_ops,
2265 				 dev->ngroups, dev->nas, name, true);
2266 	if (IS_ERR(vdev))
2267 		return PTR_ERR(vdev);
2268 
2269 	dev->vdev = vdev;
2270 	vdev->dev = dev;
2271 	vdev->vdpa.mdev = &vduse_mgmt->mgmt_dev;
2272 
2273 	return 0;
2274 }
2275 
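/*
 * .dev_add callback of the vduse management device, invoked when userspace
 * adds a vDPA device on top of a VDUSE instance, e.g. with the iproute2 vdpa
 * tool:
 *
 *	$ vdpa dev add name vduse-blk0 mgmtdev vduse
 *
 * The named VDUSE instance must already exist and be ready.  The configured
 * bounce buffer is split evenly across the instance's address spaces, one
 * IOVA domain is created per address space, and the vDPA device is then
 * registered with the core.
 */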
2276 static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
2277 			const struct vdpa_dev_set_config *config)
2278 {
2279 	struct vduse_dev *dev;
2280 	size_t domain_bounce_size;
2281 	int ret, i;
2282 
2283 	mutex_lock(&vduse_lock);
2284 	dev = vduse_find_dev(name);
2285 	if (!dev || !vduse_dev_is_ready(dev)) {
2286 		mutex_unlock(&vduse_lock);
2287 		return -EINVAL;
2288 	}
2289 	ret = vduse_dev_init_vdpa(dev, name);
2290 	mutex_unlock(&vduse_lock);
2291 	if (ret)
2292 		return ret;
2293 
2294 	mutex_lock(&dev->domain_lock);
2295 	ret = 0;
2296 
2297 	domain_bounce_size = dev->bounce_size / dev->nas;
2298 	for (i = 0; i < dev->nas; ++i) {
2299 		dev->as[i].domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
2300 							domain_bounce_size);
2301 		if (!dev->as[i].domain) {
2302 			ret = -ENOMEM;
2303 			goto err;
2304 		}
2305 	}
2306 
2307 	mutex_unlock(&dev->domain_lock);
2308 
2309 	ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num);
2310 	if (ret)
2311 		goto err_register;
2312 
2313 	return 0;
2314 
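/*
 * err_register is reached with domain_lock dropped and all dev->nas domains
 * created (i.e. i == dev->nas); err is reached from inside the loop above
 * with the lock still held and only domains [0, i) created.
 */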
2315 err_register:
2316 	mutex_lock(&dev->domain_lock);
2317 
2318 err:
2319 	for (int j = 0; j < i; j++) {
2320 		if (dev->as[j].domain) {
2321 			vduse_domain_destroy(dev->as[j].domain);
2322 			dev->as[j].domain = NULL;
2323 		}
2324 	}
2325 	mutex_unlock(&dev->domain_lock);
2326 
2327 	put_device(&dev->vdev->vdpa.dev);
2328 
2329 	return ret;
2330 }
2331 
2332 static void vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
2333 {
2334 	_vdpa_unregister_device(dev);
2335 }
2336 
2337 static const struct vdpa_mgmtdev_ops vdpa_dev_mgmtdev_ops = {
2338 	.dev_add = vdpa_dev_add,
2339 	.dev_del = vdpa_dev_del,
2340 };
2341 
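/* Device types advertised through the vduse management device. */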
2342 static struct virtio_device_id id_table[] = {
2343 	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
2344 	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
2345 	{ 0 },
2346 };
2347 
2348 static void vduse_mgmtdev_release(struct device *dev)
2349 {
2350 	struct vduse_mgmt_dev *mgmt_dev;
2351 
2352 	mgmt_dev = container_of(dev, struct vduse_mgmt_dev, dev);
2353 	kfree(mgmt_dev);
2354 }
2355 
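/*
 * Allocate and register the "vduse" management device with the vDPA core.
 * Its memory is freed from the ->release() callback once the embedded
 * struct device drops its last reference.
 */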
2356 static int vduse_mgmtdev_init(void)
2357 {
2358 	int ret;
2359 
2360 	vduse_mgmt = kzalloc(sizeof(*vduse_mgmt), GFP_KERNEL);
2361 	if (!vduse_mgmt)
2362 		return -ENOMEM;
2363 
2364 	ret = dev_set_name(&vduse_mgmt->dev, "vduse");
2365 	if (ret) {
2366 		kfree(vduse_mgmt);
2367 		return ret;
2368 	}
2369 
2370 	vduse_mgmt->dev.release = vduse_mgmtdev_release;
2371 
2372 	ret = device_register(&vduse_mgmt->dev);
2373 	if (ret)
2374 		goto dev_reg_err;
2375 
2376 	vduse_mgmt->mgmt_dev.id_table = id_table;
2377 	vduse_mgmt->mgmt_dev.ops = &vdpa_dev_mgmtdev_ops;
2378 	vduse_mgmt->mgmt_dev.device = &vduse_mgmt->dev;
2379 	ret = vdpa_mgmtdev_register(&vduse_mgmt->mgmt_dev);
2380 	if (ret)
2381 		device_unregister(&vduse_mgmt->dev);
2382 
2383 	return ret;
2384 
2385 dev_reg_err:
2386 	put_device(&vduse_mgmt->dev);
2387 	return ret;
2388 }
2389 
2390 static void vduse_mgmtdev_exit(void)
2391 {
2392 	vdpa_mgmtdev_unregister(&vduse_mgmt->mgmt_dev);
2393 	device_unregister(&vduse_mgmt->dev);
2394 }
2395 
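/*
 * Module init: register the vduse class and a char major, create the
 * /dev/vduse/control node and the cdev backing per-device nodes, allocate the
 * two interrupt-injection workqueues ("vduse-irq" unbound, "vduse-irq-bound"
 * per-CPU), then bring up the IOVA domain support and the vDPA management
 * device.  Every step is unwound in reverse order on failure.
 */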
2396 static int vduse_init(void)
2397 {
2398 	int ret;
2399 	struct device *dev;
2400 
2401 	ret = class_register(&vduse_class);
2402 	if (ret)
2403 		return ret;
2404 
2405 	ret = alloc_chrdev_region(&vduse_major, 0, VDUSE_DEV_MAX, "vduse");
2406 	if (ret)
2407 		goto err_chardev_region;
2408 
2409 	/* /dev/vduse/control */
2410 	cdev_init(&vduse_ctrl_cdev, &vduse_ctrl_fops);
2411 	vduse_ctrl_cdev.owner = THIS_MODULE;
2412 	ret = cdev_add(&vduse_ctrl_cdev, vduse_major, 1);
2413 	if (ret)
2414 		goto err_ctrl_cdev;
2415 
2416 	dev = device_create(&vduse_class, NULL, vduse_major, NULL, "control");
2417 	if (IS_ERR(dev)) {
2418 		ret = PTR_ERR(dev);
2419 		goto err_device;
2420 	}
2421 
2422 	/* /dev/vduse/$DEVICE */
2423 	cdev_init(&vduse_cdev, &vduse_dev_fops);
2424 	vduse_cdev.owner = THIS_MODULE;
2425 	ret = cdev_add(&vduse_cdev, MKDEV(MAJOR(vduse_major), 1),
2426 		       VDUSE_DEV_MAX - 1);
2427 	if (ret)
2428 		goto err_cdev;
2429 
2430 	ret = -ENOMEM;
2431 	vduse_irq_wq = alloc_workqueue("vduse-irq",
2432 				WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0);
2433 	if (!vduse_irq_wq)
2434 		goto err_wq;
2435 
2436 	vduse_irq_bound_wq = alloc_workqueue("vduse-irq-bound",
2437 					     WQ_HIGHPRI | WQ_PERCPU, 0);
2438 	if (!vduse_irq_bound_wq)
2439 		goto err_bound_wq;
2440 
2441 	ret = vduse_domain_init();
2442 	if (ret)
2443 		goto err_domain;
2444 
2445 	ret = vduse_mgmtdev_init();
2446 	if (ret)
2447 		goto err_mgmtdev;
2448 
2449 	return 0;
2450 err_mgmtdev:
2451 	vduse_domain_exit();
2452 err_domain:
2453 	destroy_workqueue(vduse_irq_bound_wq);
2454 err_bound_wq:
2455 	destroy_workqueue(vduse_irq_wq);
2456 err_wq:
2457 	cdev_del(&vduse_cdev);
2458 err_cdev:
2459 	device_destroy(&vduse_class, vduse_major);
2460 err_device:
2461 	cdev_del(&vduse_ctrl_cdev);
2462 err_ctrl_cdev:
2463 	unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
2464 err_chardev_region:
2465 	class_unregister(&vduse_class);
2466 	return ret;
2467 }
2468 module_init(vduse_init);
2469 
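/* Tear down in the reverse order of vduse_init() and release the device IDR. */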
2470 static void vduse_exit(void)
2471 {
2472 	vduse_mgmtdev_exit();
2473 	vduse_domain_exit();
2474 	destroy_workqueue(vduse_irq_bound_wq);
2475 	destroy_workqueue(vduse_irq_wq);
2476 	cdev_del(&vduse_cdev);
2477 	device_destroy(&vduse_class, vduse_major);
2478 	cdev_del(&vduse_ctrl_cdev);
2479 	unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
2480 	class_unregister(&vduse_class);
2481 	idr_destroy(&vduse_idr);
2482 }
2483 module_exit(vduse_exit);
2484 
2485 MODULE_LICENSE(DRV_LICENSE);
2486 MODULE_AUTHOR(DRV_AUTHOR);
2487 MODULE_DESCRIPTION(DRV_DESC);
2488