1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * VDUSE: vDPA Device in Userspace
4 *
5 * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved.
6 *
7 * Author: Xie Yongji <xieyongji@bytedance.com>
8 *
9 */
10
11 #include "linux/virtio_net.h"
12 #include <linux/cleanup.h>
13 #include <linux/init.h>
14 #include <linux/module.h>
15 #include <linux/cdev.h>
16 #include <linux/device.h>
17 #include <linux/eventfd.h>
18 #include <linux/slab.h>
19 #include <linux/wait.h>
20 #include <linux/dma-map-ops.h>
21 #include <linux/poll.h>
22 #include <linux/file.h>
23 #include <linux/uio.h>
24 #include <linux/vdpa.h>
25 #include <linux/nospec.h>
26 #include <linux/virtio.h>
27 #include <linux/vmalloc.h>
28 #include <linux/sched/mm.h>
29 #include <uapi/linux/vduse.h>
30 #include <uapi/linux/vdpa.h>
31 #include <uapi/linux/virtio_config.h>
32 #include <uapi/linux/virtio_ids.h>
33 #include <uapi/linux/virtio_blk.h>
34 #include <uapi/linux/virtio_ring.h>
35 #include <linux/mod_devicetable.h>
36
37 #include "iova_domain.h"
38
39 #define DRV_AUTHOR "Yongji Xie <xieyongji@bytedance.com>"
40 #define DRV_DESC "vDPA Device in Userspace"
41 #define DRV_LICENSE "GPL v2"
42
#define VDUSE_DEV_MAX (1U << MINORBITS)		/* max devices: one per char-dev minor */
#define VDUSE_DEV_MAX_GROUPS 0xffff		/* upper bound on virtqueue groups */
#define VDUSE_DEV_MAX_AS 0xffff			/* upper bound on address spaces */
#define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)	/* 1 GiB */
#define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)		/* 1 MiB */
#define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)		/* 64 MiB default */
/* 128 MB reserved for virtqueue creation */
#define VDUSE_IOVA_SIZE (VDUSE_MAX_BOUNCE_SIZE + 128 * 1024 * 1024)
#define VDUSE_MSG_DEFAULT_TIMEOUT 30	/* seconds to wait for a userspace reply */

#define IRQ_UNBOUND -1		/* irq_effective_cpu value: not pinned to a CPU */

/*
 * The VDUSE instance has not asked for the VDUSE API version, so assume 0.
 *
 * Old devices may not ask for the API version and assume it is 0.  Keep
 * this sentinel for those.  From the moment the VDUSE instance asks for
 * the version, convert to the latest supported one and continue the
 * regular flow.
 */
#define VDUSE_API_VERSION_NOT_ASKED U64_MAX
63
/* Per-virtqueue state shared between the vdpa ops and the char device. */
struct vduse_virtqueue {
	u16 index;		/* queue index within the device */
	u16 num_max;		/* maximum queue size userspace advertised */
	u32 num;		/* currently configured size (0 = unset) */
	u64 desc_addr;		/* ring addresses set via .set_vq_address */
	u64 driver_addr;
	u64 device_addr;
	struct vdpa_vq_state state;	/* cached split/packed ring state */
	bool ready;
	bool kicked;		/* a kick arrived while no kickfd was attached */
	u32 group;		/* virtqueue group this vq belongs to */
	spinlock_t kick_lock;	/* protects ready/kicked/kickfd on the kick path */
	spinlock_t irq_lock;	/* protects cb against concurrent irq injection */
	struct eventfd_ctx *kickfd;	/* signalled to kick the userspace daemon */
	struct vdpa_callback cb;	/* driver's used-buffer callback */
	struct work_struct inject;	/* deferred callback injection */
	struct work_struct kick;	/* deferred kick when signalling is not allowed */
	int irq_effective_cpu;	/* CPU the irq work runs on, IRQ_UNBOUND if none */
	struct cpumask irq_affinity;
	struct kobject kobj;
};
85
struct vduse_dev;

/* Glue object embedding the vdpa_device registered with the vdpa core. */
struct vduse_vdpa {
	struct vdpa_device vdpa;
	struct vduse_dev *dev;
};

/* A pinned userspace memory region. */
struct vduse_umem {
	unsigned long iova;
	unsigned long npages;
	struct page **pages;
	struct mm_struct *mm;
};

/* One address space: an IOVA domain plus an optional registered umem. */
struct vduse_as {
	struct vduse_iova_domain *domain;
	struct vduse_umem *umem;	/* NOTE(review): mem_lock presumably guards umem (un)registration — code not in this chunk */
	struct mutex mem_lock;
};

/* A virtqueue group and the address space currently bound to it. */
struct vduse_vq_group {
	rwlock_t as_lock;
	struct vduse_as *as; /* Protected by as_lock */
	struct vduse_dev *dev;
};
111
/* One emulated vDPA device backed by a userspace daemon. */
struct vduse_dev {
	struct vduse_vdpa *vdev;	/* NULL until the vdpa device is created */
	struct device *dev;
	struct vduse_virtqueue **vqs;	/* vq_num entries */
	struct vduse_as *as;		/* nas address spaces */
	char *name;
	struct mutex lock;
	spinlock_t msg_lock;		/* protects send/recv lists and msg_unique */
	u64 msg_unique;			/* next request_id for control messages */
	u32 msg_timeout;		/* reply timeout in seconds; 0 = wait forever */
	wait_queue_head_t waitq;	/* readers/pollers of the char device */
	struct list_head send_list;	/* requests not yet read by userspace */
	struct list_head recv_list;	/* requests awaiting a userspace reply */
	struct vdpa_callback config_cb;	/* config-change callback, under irq_lock */
	struct work_struct inject;	/* deferred config irq injection */
	spinlock_t irq_lock;
	struct rw_semaphore rwsem;
	int minor;
	bool broken;			/* set once a message times out; device dead */
	bool connected;
	u64 api_version;
	u64 device_features;
	u64 driver_features;
	u32 device_id;
	u32 vendor_id;
	u32 generation;			/* bumped on every reset */
	u32 config_size;
	void *config;			/* device config space (read-only to drivers) */
	u8 status;
	u32 vq_num;
	u32 vq_align;
	u32 ngroups;
	u32 nas;
	struct vduse_vq_group *groups;	/* ngroups entries */
	unsigned int bounce_size;
	struct mutex domain_lock;
};
149
/* An in-flight control request and its (eventual) reply. */
struct vduse_dev_msg {
	struct vduse_dev_request req;
	struct vduse_dev_response resp;
	struct list_head list;		/* on dev->send_list or dev->recv_list */
	wait_queue_head_t waitq;	/* the requester sleeps here for the reply */
	bool completed;			/* reply arrived (or request failed) */
};

/* State attached to a control-device open: the negotiated API version. */
struct vduse_control {
	u64 api_version;
};

static DEFINE_MUTEX(vduse_lock);	/* serializes access to vduse_idr */
static DEFINE_IDR(vduse_idr);		/* NOTE(review): appears to map minors to devices — allocation code not in view */

static dev_t vduse_major;
static struct cdev vduse_ctrl_cdev;
static struct cdev vduse_cdev;
static struct workqueue_struct *vduse_irq_wq;
static struct workqueue_struct *vduse_irq_bound_wq;

/* Virtio device types userspace is allowed to emulate. */
static u32 allowed_device_id[] = {
	VIRTIO_ID_BLOCK,
	VIRTIO_ID_NET,
	VIRTIO_ID_FS,
};
176
vdpa_to_vduse(struct vdpa_device * vdpa)177 static inline struct vduse_dev *vdpa_to_vduse(struct vdpa_device *vdpa)
178 {
179 struct vduse_vdpa *vdev = container_of(vdpa, struct vduse_vdpa, vdpa);
180
181 return vdev->dev;
182 }
183
/* Translate the generic struct device embedded in a vdpa_device. */
static inline struct vduse_dev *dev_to_vduse(struct device *dev)
{
	return vdpa_to_vduse(dev_to_vdpa(dev));
}
190
/*
 * Find the in-flight message matching @request_id on @head, unlink it
 * and return it; NULL if no such message is queued.
 * Caller holds dev->msg_lock.
 */
static struct vduse_dev_msg *vduse_find_msg(struct list_head *head,
					    uint32_t request_id)
{
	struct vduse_dev_msg *cur;

	list_for_each_entry(cur, head, list) {
		if (cur->req.request_id != request_id)
			continue;

		list_del(&cur->list);
		return cur;
	}

	return NULL;
}
205
vduse_dequeue_msg(struct list_head * head)206 static struct vduse_dev_msg *vduse_dequeue_msg(struct list_head *head)
207 {
208 struct vduse_dev_msg *msg = NULL;
209
210 if (!list_empty(head)) {
211 msg = list_first_entry(head, struct vduse_dev_msg, list);
212 list_del(&msg->list);
213 }
214
215 return msg;
216 }
217
vduse_enqueue_msg(struct list_head * head,struct vduse_dev_msg * msg)218 static void vduse_enqueue_msg(struct list_head *head,
219 struct vduse_dev_msg *msg)
220 {
221 list_add_tail(&msg->list, head);
222 }
223
/*
 * Mark the device as malfunctioning and fail every in-flight control
 * message.  Caller must hold dev->msg_lock.
 */
static void vduse_dev_broken(struct vduse_dev *dev)
{
	struct vduse_dev_msg *msg, *tmp;

	/* Already broken: all messages were failed the first time. */
	if (unlikely(dev->broken))
		return;

	/* Fail both unread (send_list) and awaiting-reply (recv_list) messages. */
	list_splice_init(&dev->recv_list, &dev->send_list);
	list_for_each_entry_safe(msg, tmp, &dev->send_list, list) {
		list_del(&msg->list);
		msg->completed = 1;
		msg->resp.result = VDUSE_REQ_RESULT_FAILED;
		wake_up(&msg->waitq);
	}
	dev->broken = true;
	/* Wake readers/pollers blocked on the device waitqueue. */
	wake_up(&dev->waitq);
}
241
/*
 * Send @msg to the userspace daemon and wait (killable) for its reply.
 *
 * The message is queued on dev->send_list for userspace to pick up via
 * read(); the reply comes back through write(), which sets
 * msg->completed and wakes msg->waitq.  A non-zero dev->msg_timeout
 * bounds the wait, and a timeout marks the whole device broken.
 *
 * Returns 0 when userspace replied VDUSE_REQ_RESULT_OK, -EIO otherwise
 * (broken device, interrupted/timed-out wait, or a failed reply).
 */
static int vduse_dev_msg_sync(struct vduse_dev *dev,
			      struct vduse_dev_msg *msg)
{
	int ret;

	/* Cheap unlocked check; re-checked under the lock below. */
	if (unlikely(dev->broken))
		return -EIO;

	init_waitqueue_head(&msg->waitq);
	spin_lock(&dev->msg_lock);
	if (unlikely(dev->broken)) {
		spin_unlock(&dev->msg_lock);
		return -EIO;
	}
	msg->req.request_id = dev->msg_unique++;
	vduse_enqueue_msg(&dev->send_list, msg);
	wake_up(&dev->waitq);
	spin_unlock(&dev->msg_lock);
	if (dev->msg_timeout)
		ret = wait_event_killable_timeout(msg->waitq, msg->completed,
						  (long)dev->msg_timeout * HZ);
	else
		ret = wait_event_killable(msg->waitq, msg->completed);

	spin_lock(&dev->msg_lock);
	if (!msg->completed) {
		/* Interrupted or timed out: withdraw the request. */
		list_del(&msg->list);
		msg->resp.result = VDUSE_REQ_RESULT_FAILED;
		/* Mark the device as malfunction when there is a timeout */
		if (!ret)
			vduse_dev_broken(dev);
	}
	ret = (msg->resp.result == VDUSE_REQ_RESULT_OK) ? 0 : -EIO;
	spin_unlock(&dev->msg_lock);

	return ret;
}
279
vduse_dev_get_vq_state_packed(struct vduse_dev * dev,struct vduse_virtqueue * vq,struct vdpa_vq_state_packed * packed)280 static int vduse_dev_get_vq_state_packed(struct vduse_dev *dev,
281 struct vduse_virtqueue *vq,
282 struct vdpa_vq_state_packed *packed)
283 {
284 struct vduse_dev_msg msg = { 0 };
285 int ret;
286
287 msg.req.type = VDUSE_GET_VQ_STATE;
288 msg.req.vq_state.index = vq->index;
289
290 ret = vduse_dev_msg_sync(dev, &msg);
291 if (ret)
292 return ret;
293
294 packed->last_avail_counter =
295 msg.resp.vq_state.packed.last_avail_counter & 0x0001;
296 packed->last_avail_idx =
297 msg.resp.vq_state.packed.last_avail_idx & 0x7FFF;
298 packed->last_used_counter =
299 msg.resp.vq_state.packed.last_used_counter & 0x0001;
300 packed->last_used_idx =
301 msg.resp.vq_state.packed.last_used_idx & 0x7FFF;
302
303 return 0;
304 }
305
vduse_dev_get_vq_state_split(struct vduse_dev * dev,struct vduse_virtqueue * vq,struct vdpa_vq_state_split * split)306 static int vduse_dev_get_vq_state_split(struct vduse_dev *dev,
307 struct vduse_virtqueue *vq,
308 struct vdpa_vq_state_split *split)
309 {
310 struct vduse_dev_msg msg = { 0 };
311 int ret;
312
313 msg.req.type = VDUSE_GET_VQ_STATE;
314 msg.req.vq_state.index = vq->index;
315
316 ret = vduse_dev_msg_sync(dev, &msg);
317 if (ret)
318 return ret;
319
320 split->avail_index = msg.resp.vq_state.split.avail_index;
321
322 return 0;
323 }
324
/* Forward a virtio status update to the userspace daemon. */
static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
{
	struct vduse_dev_msg msg = {
		.req.type = VDUSE_SET_STATUS,
		.req.s.status = status,
	};

	return vduse_dev_msg_sync(dev, &msg);
}
334
/*
 * Tell userspace that the IOTLB range [@start, @last] of @asid changed.
 * Pre-v1 API carries no ASID, so the legacy request layout is used there.
 */
static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid,
				  u64 start, u64 last)
{
	struct vduse_dev_msg msg = { 0 };

	if (last < start)
		return -EINVAL;

	msg.req.type = VDUSE_UPDATE_IOTLB;
	if (dev->api_version >= VDUSE_API_VERSION_1) {
		msg.req.iova_v2.start = start;
		msg.req.iova_v2.last = last;
		msg.req.iova_v2.asid = asid;
	} else {
		msg.req.iova.start = start;
		msg.req.iova.last = last;
	}

	return vduse_dev_msg_sync(dev, &msg);
}
355
/*
 * read() backend for the VDUSE char device: hand the oldest pending
 * request to the userspace daemon.
 *
 * Blocks (interruptible) until a request is queued unless O_NONBLOCK is
 * set.  On success the message moves to dev->recv_list, where it waits
 * for the daemon's reply via vduse_dev_write_iter().
 */
static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct vduse_dev *dev = file->private_data;
	struct vduse_dev_msg *msg;
	int size = sizeof(struct vduse_dev_request);
	ssize_t ret;

	/* Requests are only handed out whole. */
	if (iov_iter_count(to) < size)
		return -EINVAL;

	spin_lock(&dev->msg_lock);
	while (1) {
		msg = vduse_dequeue_msg(&dev->send_list);
		if (msg)
			break;

		ret = -EAGAIN;
		if (file->f_flags & O_NONBLOCK)
			goto unlock;

		/* Drop the spinlock before sleeping for a request. */
		spin_unlock(&dev->msg_lock);
		ret = wait_event_interruptible_exclusive(dev->waitq,
					!list_empty(&dev->send_list));
		if (ret)
			return ret;

		spin_lock(&dev->msg_lock);
	}
	spin_unlock(&dev->msg_lock);
	/* Copy to userspace without holding the (non-sleepable) lock. */
	ret = copy_to_iter(&msg->req, size, to);
	spin_lock(&dev->msg_lock);
	if (ret != size) {
		/* Partial copy: requeue so the request is not lost. */
		ret = -EFAULT;
		vduse_enqueue_msg(&dev->send_list, msg);
		goto unlock;
	}
	vduse_enqueue_msg(&dev->recv_list, msg);
unlock:
	spin_unlock(&dev->msg_lock);

	return ret;
}
399
/* Return true iff the first @size bytes at @ptr are all zero. */
static bool is_mem_zero(const char *ptr, int size)
{
	const char *end = ptr + size;

	while (ptr < end) {
		if (*ptr++)
			return false;
	}

	return true;
}
410
vduse_dev_write_iter(struct kiocb * iocb,struct iov_iter * from)411 static ssize_t vduse_dev_write_iter(struct kiocb *iocb, struct iov_iter *from)
412 {
413 struct file *file = iocb->ki_filp;
414 struct vduse_dev *dev = file->private_data;
415 struct vduse_dev_response resp;
416 struct vduse_dev_msg *msg;
417 size_t ret;
418
419 ret = copy_from_iter(&resp, sizeof(resp), from);
420 if (ret != sizeof(resp))
421 return -EINVAL;
422
423 if (!is_mem_zero((const char *)resp.reserved, sizeof(resp.reserved)))
424 return -EINVAL;
425
426 spin_lock(&dev->msg_lock);
427 msg = vduse_find_msg(&dev->recv_list, resp.request_id);
428 if (!msg) {
429 ret = -ENOENT;
430 goto unlock;
431 }
432
433 memcpy(&msg->resp, &resp, sizeof(resp));
434 msg->completed = 1;
435 wake_up(&msg->waitq);
436 unlock:
437 spin_unlock(&dev->msg_lock);
438
439 return ret;
440 }
441
/*
 * poll() backend: EPOLLIN when a request is waiting to be read,
 * EPOLLOUT when a reply is expected, EPOLLERR once the device is broken.
 */
static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
{
	struct vduse_dev *dev = file->private_data;
	__poll_t mask = 0;

	poll_wait(file, &dev->waitq, wait);

	spin_lock(&dev->msg_lock);

	if (unlikely(dev->broken))
		mask |= EPOLLERR;
	if (!list_empty(&dev->send_list))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (!list_empty(&dev->recv_list))
		mask |= EPOLLOUT | EPOLLWRNORM;

	spin_unlock(&dev->msg_lock);

	return mask;
}
462
/*
 * Reset the kernel-side device state: drop bounce mappings, clear
 * negotiated features/status, and tear down per-virtqueue state
 * (ring addresses, kickfds, callbacks).
 */
static void vduse_dev_reset(struct vduse_dev *dev)
{
	int i;

	/* The coherent mappings are handled in vduse_dev_free_coherent() */
	for (i = 0; i < dev->nas; i++) {
		struct vduse_iova_domain *domain = dev->as[i].domain;

		if (domain && domain->bounce_map)
			vduse_domain_reset_bounce_map(domain);
	}

	/* Exclude concurrent readers of the fields cleared below. */
	down_write(&dev->rwsem);

	dev->status = 0;
	dev->driver_features = 0;
	dev->generation++;
	spin_lock(&dev->irq_lock);
	dev->config_cb.callback = NULL;
	dev->config_cb.private = NULL;
	spin_unlock(&dev->irq_lock);
	/* Callback is gone; drain any config irq already queued. */
	flush_work(&dev->inject);

	for (i = 0; i < dev->vq_num; i++) {
		struct vduse_virtqueue *vq = dev->vqs[i];

		vq->ready = false;
		vq->desc_addr = 0;
		vq->driver_addr = 0;
		vq->device_addr = 0;
		vq->num = 0;
		memset(&vq->state, 0, sizeof(vq->state));

		/* Drop the kickfd reference and forget pending kicks. */
		spin_lock(&vq->kick_lock);
		vq->kicked = false;
		if (vq->kickfd)
			eventfd_ctx_put(vq->kickfd);
		vq->kickfd = NULL;
		spin_unlock(&vq->kick_lock);

		spin_lock(&vq->irq_lock);
		vq->cb.callback = NULL;
		vq->cb.private = NULL;
		vq->cb.trigger = NULL;
		spin_unlock(&vq->irq_lock);
		/* Drain work items queued before the callbacks were cleared. */
		flush_work(&vq->inject);
		flush_work(&vq->kick);
	}

	up_write(&dev->rwsem);
}
514
/* Record the driver-supplied ring addresses for virtqueue @idx. */
static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
				     u64 desc_area, u64 driver_area,
				     u64 device_area)
{
	struct vduse_virtqueue *vq = vdpa_to_vduse(vdpa)->vqs[idx];

	vq->desc_addr = desc_area;
	vq->driver_addr = driver_area;
	vq->device_addr = device_area;

	return 0;
}
528
vduse_vq_kick(struct vduse_virtqueue * vq)529 static void vduse_vq_kick(struct vduse_virtqueue *vq)
530 {
531 spin_lock(&vq->kick_lock);
532 if (!vq->ready)
533 goto unlock;
534
535 if (vq->kickfd)
536 eventfd_signal(vq->kickfd);
537 else
538 vq->kicked = true;
539 unlock:
540 spin_unlock(&vq->kick_lock);
541 }
542
vduse_vq_kick_work(struct work_struct * work)543 static void vduse_vq_kick_work(struct work_struct *work)
544 {
545 struct vduse_virtqueue *vq = container_of(work,
546 struct vduse_virtqueue, kick);
547
548 vduse_vq_kick(vq);
549 }
550
/* Kick virtqueue @idx, deferring to a work item when signalling is unsafe. */
static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx)
{
	struct vduse_virtqueue *vq = vdpa_to_vduse(vdpa)->vqs[idx];

	if (eventfd_signal_allowed())
		vduse_vq_kick(vq);
	else
		schedule_work(&vq->kick);
}
562
/* Install the driver's used-buffer notification callback for vq @idx. */
static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
				 struct vdpa_callback *cb)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = dev->vqs[idx];

	/* irq_lock orders this against vduse_vq_irq_inject(). */
	spin_lock(&vq->irq_lock);
	vq->cb.callback = cb->callback;
	vq->cb.private = cb->private;
	vq->cb.trigger = cb->trigger;
	spin_unlock(&vq->irq_lock);
}
575
/* Record the ring size the driver configured for virtqueue @idx. */
static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num)
{
	vdpa_to_vduse(vdpa)->vqs[idx]->num = num;
}
583
/* Current ring size of vq @idx; falls back to num_max while unconfigured. */
static u16 vduse_vdpa_get_vq_size(struct vdpa_device *vdpa, u16 idx)
{
	struct vduse_virtqueue *vq = vdpa_to_vduse(vdpa)->vqs[idx];

	return vq->num ? vq->num : vq->num_max;
}
594
/* Flip the ready state of virtqueue @idx. */
static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa,
				    u16 idx, bool ready)
{
	vdpa_to_vduse(vdpa)->vqs[idx]->ready = ready;
}
603
vduse_vdpa_get_vq_ready(struct vdpa_device * vdpa,u16 idx)604 static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx)
605 {
606 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
607 struct vduse_virtqueue *vq = dev->vqs[idx];
608
609 return vq->ready;
610 }
611
/*
 * Cache the ring state supplied by the vDPA bus; the relevant layout
 * depends on whether VIRTIO_F_RING_PACKED was negotiated.
 */
static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx,
				   const struct vdpa_vq_state *state)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = dev->vqs[idx];

	if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
		vq->state.packed.last_avail_counter =
				state->packed.last_avail_counter;
		vq->state.packed.last_avail_idx = state->packed.last_avail_idx;
		vq->state.packed.last_used_counter =
				state->packed.last_used_counter;
		vq->state.packed.last_used_idx = state->packed.last_used_idx;
	} else
		vq->state.split.avail_index = state->split.avail_index;

	return 0;
}
630
vduse_get_vq_group(struct vdpa_device * vdpa,u16 idx)631 static u32 vduse_get_vq_group(struct vdpa_device *vdpa, u16 idx)
632 {
633 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
634
635 if (dev->api_version < VDUSE_API_VERSION_1)
636 return 0;
637
638 return dev->vqs[idx]->group;
639 }
640
/* Build the virtio_map token (the vq's group) used by the map ops below. */
static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return (union virtio_map) {
		.group = &dev->groups[vduse_get_vq_group(vdpa, idx)],
	};
}
651
/*
 * Scoped lock helpers serializing map ops against vq-group address-space
 * reassignment.  With a single address space group->as can never change
 * (vduse_set_group_asid() is the only writer in this file), so the lock
 * is elided entirely.
 */
DEFINE_GUARD(vq_group_as_read_lock, struct vduse_vq_group *,
	     if (_T->dev->nas > 1)
		read_lock(&_T->as_lock),
	     if (_T->dev->nas > 1)
		read_unlock(&_T->as_lock))

DEFINE_GUARD(vq_group_as_write_lock, struct vduse_vq_group *,
	     if (_T->dev->nas > 1)
		write_lock(&_T->as_lock),
	     if (_T->dev->nas > 1)
		write_unlock(&_T->as_lock))
663
/*
 * Bind virtqueue group @group to address space @asid.  Userspace is
 * informed first; only after it acks is group->as repointed under the
 * write lock.  Requires API v1 (multiple groups/ASes are a v1 feature).
 */
static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
				unsigned int asid)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_dev_msg msg = { 0 };
	int r;

	if (dev->api_version < VDUSE_API_VERSION_1)
		return -EINVAL;

	msg.req.type = VDUSE_SET_VQ_GROUP_ASID;
	msg.req.vq_group_asid.group = group;
	msg.req.vq_group_asid.asid = asid;

	r = vduse_dev_msg_sync(dev, &msg);
	if (r < 0)
		return r;

	/* Guard releases the write lock at end of function scope. */
	guard(vq_group_as_write_lock)(&dev->groups[group]);
	dev->groups[group].as = &dev->as[asid];

	return 0;
}
687
/* Fetch the live vq state from userspace, packed or split as negotiated. */
static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
				   struct vdpa_vq_state *state)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = dev->vqs[idx];

	if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED))
		return vduse_dev_get_vq_state_packed(dev, vq, &state->packed);
	else
		return vduse_dev_get_vq_state_split(dev, vq, &state->split);
}
699
vduse_vdpa_get_vq_align(struct vdpa_device * vdpa)700 static u32 vduse_vdpa_get_vq_align(struct vdpa_device *vdpa)
701 {
702 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
703
704 return dev->vq_align;
705 }
706
vduse_vdpa_get_device_features(struct vdpa_device * vdpa)707 static u64 vduse_vdpa_get_device_features(struct vdpa_device *vdpa)
708 {
709 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
710
711 return dev->device_features;
712 }
713
/* Record the feature bits accepted by the driver. */
static int vduse_vdpa_set_driver_features(struct vdpa_device *vdpa, u64 features)
{
	vdpa_to_vduse(vdpa)->driver_features = features;
	return 0;
}
721
vduse_vdpa_get_driver_features(struct vdpa_device * vdpa)722 static u64 vduse_vdpa_get_driver_features(struct vdpa_device *vdpa)
723 {
724 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
725
726 return dev->driver_features;
727 }
728
/* Install the driver's config-change callback. */
static void vduse_vdpa_set_config_cb(struct vdpa_device *vdpa,
				     struct vdpa_callback *cb)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	/* irq_lock orders this against vduse_dev_irq_inject(). */
	spin_lock(&dev->irq_lock);
	dev->config_cb.callback = cb->callback;
	dev->config_cb.private = cb->private;
	spin_unlock(&dev->irq_lock);
}
739
vduse_vdpa_get_vq_num_max(struct vdpa_device * vdpa)740 static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
741 {
742 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
743 u16 num_max = 0;
744 int i;
745
746 for (i = 0; i < dev->vq_num; i++)
747 if (num_max < dev->vqs[i]->num_max)
748 num_max = dev->vqs[i]->num_max;
749
750 return num_max;
751 }
752
vduse_vdpa_get_device_id(struct vdpa_device * vdpa)753 static u32 vduse_vdpa_get_device_id(struct vdpa_device *vdpa)
754 {
755 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
756
757 return dev->device_id;
758 }
759
vduse_vdpa_get_vendor_id(struct vdpa_device * vdpa)760 static u32 vduse_vdpa_get_vendor_id(struct vdpa_device *vdpa)
761 {
762 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
763
764 return dev->vendor_id;
765 }
766
vduse_vdpa_get_status(struct vdpa_device * vdpa)767 static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa)
768 {
769 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
770
771 return dev->status;
772 }
773
/* Propagate a status update; only commit it once userspace has acked. */
static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	if (vduse_dev_set_status(dev, status) == 0)
		dev->status = status;
}
783
vduse_vdpa_get_config_size(struct vdpa_device * vdpa)784 static size_t vduse_vdpa_get_config_size(struct vdpa_device *vdpa)
785 {
786 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
787
788 return dev->config_size;
789 }
790
vduse_vdpa_get_config(struct vdpa_device * vdpa,unsigned int offset,void * buf,unsigned int len)791 static void vduse_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset,
792 void *buf, unsigned int len)
793 {
794 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
795
796 /* Initialize the buffer in case of partial copy. */
797 memset(buf, 0, len);
798
799 if (offset > dev->config_size)
800 return;
801
802 if (len > dev->config_size - offset)
803 len = dev->config_size - offset;
804
805 memcpy(buf, dev->config + offset, len);
806 }
807
/* Intentionally empty: the configuration space is read-only for now. */
static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset,
				  const void *buf, unsigned int len)
{
}
813
/* Full reset: tell userspace first, then tear down kernel-side state. */
static int vduse_vdpa_reset(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	int ret;

	ret = vduse_dev_set_status(dev, 0);
	vduse_dev_reset(dev);

	return ret;
}
823
vduse_vdpa_get_generation(struct vdpa_device * vdpa)824 static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
825 {
826 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
827
828 return dev->generation;
829 }
830
/* Set vq @idx's irq affinity; a NULL mask means "any CPU". */
static int vduse_vdpa_set_vq_affinity(struct vdpa_device *vdpa, u16 idx,
				      const struct cpumask *cpu_mask)
{
	struct cpumask *affinity = &vdpa_to_vduse(vdpa)->vqs[idx]->irq_affinity;

	if (cpu_mask)
		cpumask_copy(affinity, cpu_mask);
	else
		cpumask_setall(affinity);

	return 0;
}
843
844 static const struct cpumask *
vduse_vdpa_get_vq_affinity(struct vdpa_device * vdpa,u16 idx)845 vduse_vdpa_get_vq_affinity(struct vdpa_device *vdpa, u16 idx)
846 {
847 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
848
849 return &dev->vqs[idx]->irq_affinity;
850 }
851
vduse_vdpa_set_map(struct vdpa_device * vdpa,unsigned int asid,struct vhost_iotlb * iotlb)852 static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
853 unsigned int asid,
854 struct vhost_iotlb *iotlb)
855 {
856 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
857 int ret;
858
859 ret = vduse_domain_set_map(dev->as[asid].domain, iotlb);
860 if (ret)
861 return ret;
862
863 ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX);
864 if (ret) {
865 vduse_domain_clear_map(dev->as[asid].domain, iotlb);
866 return ret;
867 }
868
869 return 0;
870 }
871
vduse_vdpa_free(struct vdpa_device * vdpa)872 static void vduse_vdpa_free(struct vdpa_device *vdpa)
873 {
874 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
875
876 dev->vdev = NULL;
877 }
878
/* vdpa_config_ops implementation backed by the userspace daemon. */
static const struct vdpa_config_ops vduse_vdpa_config_ops = {
	.set_vq_address		= vduse_vdpa_set_vq_address,
	.kick_vq		= vduse_vdpa_kick_vq,
	.set_vq_cb		= vduse_vdpa_set_vq_cb,
	.set_vq_num		= vduse_vdpa_set_vq_num,
	.get_vq_size		= vduse_vdpa_get_vq_size,
	.get_vq_group		= vduse_get_vq_group,
	.set_vq_ready		= vduse_vdpa_set_vq_ready,
	.get_vq_ready		= vduse_vdpa_get_vq_ready,
	.set_vq_state		= vduse_vdpa_set_vq_state,
	.get_vq_state		= vduse_vdpa_get_vq_state,
	.get_vq_align		= vduse_vdpa_get_vq_align,
	.get_device_features	= vduse_vdpa_get_device_features,
	.set_driver_features	= vduse_vdpa_set_driver_features,
	.get_driver_features	= vduse_vdpa_get_driver_features,
	.set_config_cb		= vduse_vdpa_set_config_cb,
	.get_vq_num_max		= vduse_vdpa_get_vq_num_max,
	.get_device_id		= vduse_vdpa_get_device_id,
	.get_vendor_id		= vduse_vdpa_get_vendor_id,
	.get_status		= vduse_vdpa_get_status,
	.set_status		= vduse_vdpa_set_status,
	.get_config_size	= vduse_vdpa_get_config_size,
	.get_config		= vduse_vdpa_get_config,
	.set_config		= vduse_vdpa_set_config,
	.get_generation		= vduse_vdpa_get_generation,
	.set_vq_affinity	= vduse_vdpa_set_vq_affinity,
	.get_vq_affinity	= vduse_vdpa_get_vq_affinity,
	.reset			= vduse_vdpa_reset,
	.set_map		= vduse_vdpa_set_map,
	.set_group_asid		= vduse_set_group_asid,
	.get_vq_map		= vduse_get_vq_map,
	.free			= vduse_vdpa_free,
};
912
/* Flush CPU-side writes to a bounce-mapped region before device access. */
static void vduse_dev_sync_single_for_device(union virtio_map token,
					     dma_addr_t dma_addr, size_t size,
					     enum dma_data_direction dir)
{
	struct vduse_vq_group *group = token.group;

	if (!group)
		return;

	/* Pin the group's address space while touching its domain. */
	guard(vq_group_as_read_lock)(group);
	vduse_domain_sync_single_for_device(group->as->domain, dma_addr,
					    size, dir);
}
926
/* Make device-side writes to a bounce-mapped region visible to the CPU. */
static void vduse_dev_sync_single_for_cpu(union virtio_map token,
					  dma_addr_t dma_addr, size_t size,
					  enum dma_data_direction dir)
{
	struct vduse_vq_group *group = token.group;

	if (!group)
		return;

	/* Pin the group's address space while touching its domain. */
	guard(vq_group_as_read_lock)(group);
	vduse_domain_sync_single_for_cpu(group->as->domain, dma_addr,
					 size, dir);
}
940
vduse_dev_map_page(union virtio_map token,struct page * page,unsigned long offset,size_t size,enum dma_data_direction dir,unsigned long attrs)941 static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page,
942 unsigned long offset, size_t size,
943 enum dma_data_direction dir,
944 unsigned long attrs)
945 {
946 struct vduse_iova_domain *domain;
947
948 if (!token.group)
949 return DMA_MAPPING_ERROR;
950
951 guard(vq_group_as_read_lock)(token.group);
952 domain = token.group->as->domain;
953 return vduse_domain_map_page(domain, page, offset, size, dir, attrs);
954 }
955
/* Undo a vduse_dev_map_page() mapping. */
static void vduse_dev_unmap_page(union virtio_map token, dma_addr_t dma_addr,
				 size_t size, enum dma_data_direction dir,
				 unsigned long attrs)
{
	struct vduse_vq_group *group = token.group;

	if (!group)
		return;

	/* Pin the group's address space while touching its domain. */
	guard(vq_group_as_read_lock)(group);
	vduse_domain_unmap_page(group->as->domain, dma_addr, size, dir, attrs);
}
969
/*
 * virtio_map .alloc: back a coherent allocation with real pages and map
 * them into the group's current IOVA domain.  Returns the kernel VA and
 * fills *dma_addr, or NULL (with *dma_addr == DMA_MAPPING_ERROR) on error.
 */
static void *vduse_dev_alloc_coherent(union virtio_map token, size_t size,
				      dma_addr_t *dma_addr, gfp_t flag)
{
	void *addr;

	*dma_addr = DMA_MAPPING_ERROR;
	if (!token.group)
		return NULL;

	addr = alloc_pages_exact(size, flag);
	if (!addr)
		return NULL;

	/* Inner scope bounds the as_lock guard to just the mapping step. */
	{
		struct vduse_iova_domain *domain;

		guard(vq_group_as_read_lock)(token.group);
		domain = token.group->as->domain;
		*dma_addr = vduse_domain_alloc_coherent(domain, size, addr);
		if (*dma_addr == DMA_MAPPING_ERROR)
			goto err;
	}

	return addr;

err:
	free_pages_exact(addr, size);
	return NULL;
}
999
/* virtio_map .free: unmap from the IOVA domain, then release the pages. */
static void vduse_dev_free_coherent(union virtio_map token, size_t size,
				    void *vaddr, dma_addr_t dma_addr,
				    unsigned long attrs)
{
	struct vduse_vq_group *group = token.group;

	if (!group)
		return;

	/* Inner scope bounds the as_lock guard to just the unmapping step. */
	{
		guard(vq_group_as_read_lock)(group);
		vduse_domain_free_coherent(group->as->domain, size,
					   dma_addr, attrs);
	}

	free_pages_exact(vaddr, size);
}
1017
vduse_dev_need_sync(union virtio_map token,dma_addr_t dma_addr)1018 static bool vduse_dev_need_sync(union virtio_map token, dma_addr_t dma_addr)
1019 {
1020 if (!token.group)
1021 return false;
1022
1023 guard(vq_group_as_read_lock)(token.group);
1024 return dma_addr < token.group->as->domain->bounce_size;
1025 }
1026
/* Translate the DMA error sentinel into an errno for the virtio core. */
static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr)
{
	return unlikely(dma_addr == DMA_MAPPING_ERROR) ? -ENOMEM : 0;
}
1033
vduse_dev_max_mapping_size(union virtio_map token)1034 static size_t vduse_dev_max_mapping_size(union virtio_map token)
1035 {
1036 if (!token.group)
1037 return 0;
1038
1039 guard(vq_group_as_read_lock)(token.group);
1040 return token.group->as->domain->bounce_size;
1041 }
1042
/* DMA-like mapping ops routed through the per-group IOVA domain. */
static const struct virtio_map_ops vduse_map_ops = {
	.sync_single_for_device = vduse_dev_sync_single_for_device,
	.sync_single_for_cpu = vduse_dev_sync_single_for_cpu,
	.map_page = vduse_dev_map_page,
	.unmap_page = vduse_dev_unmap_page,
	.alloc = vduse_dev_alloc_coherent,
	.free = vduse_dev_free_coherent,
	.need_sync = vduse_dev_need_sync,
	.mapping_error = vduse_dev_mapping_error,
	.max_mapping_size = vduse_dev_max_mapping_size,
};
1054
/*
 * Convert a VDUSE IOTLB access permission into O_* open flags.
 * Returns 0 (== O_RDONLY) after a WARN for unknown permissions.
 */
static unsigned int perm_to_file_flags(u8 perm)
{
	switch (perm) {
	case VDUSE_ACCESS_WO:
		return O_WRONLY;
	case VDUSE_ACCESS_RO:
		return O_RDONLY;	/* O_RDONLY is 0, matching the old flags = 0 path */
	case VDUSE_ACCESS_RW:
		return O_RDWR;
	default:
		/* Fixed typo in the warning text ("invalidate" -> "invalid"). */
		WARN(1, "invalid vhost IOTLB permission\n");
		return 0;
	}
}
1076
/*
 * Bind (or unbind) the eventfd that userspace uses to receive virtqueue
 * kicks.  fd >= 0 assigns a new eventfd, VDUSE_EVENTFD_DEASSIGN removes
 * the current one, and any other negative fd is a silent no-op.
 * Returns 0 on success or a negative errno.
 */
static int vduse_kickfd_setup(struct vduse_dev *dev,
			struct vduse_vq_eventfd *eventfd)
{
	struct eventfd_ctx *ctx = NULL;
	struct vduse_virtqueue *vq;
	u32 index;

	if (eventfd->index >= dev->vq_num)
		return -EINVAL;

	/* Clamp the index against speculative out-of-bounds access */
	index = array_index_nospec(eventfd->index, dev->vq_num);
	vq = dev->vqs[index];
	if (eventfd->fd >= 0) {
		ctx = eventfd_ctx_fdget(eventfd->fd);
		if (IS_ERR(ctx))
			return PTR_ERR(ctx);
	} else if (eventfd->fd != VDUSE_EVENTFD_DEASSIGN)
		return 0;

	spin_lock(&vq->kick_lock);
	if (vq->kickfd)
		eventfd_ctx_put(vq->kickfd);
	vq->kickfd = ctx;
	/* Replay a kick that arrived while no eventfd was attached */
	if (vq->ready && vq->kicked && vq->kickfd) {
		eventfd_signal(vq->kickfd);
		vq->kicked = false;
	}
	spin_unlock(&vq->kick_lock);

	return 0;
}
1108
vduse_dev_is_ready(struct vduse_dev * dev)1109 static bool vduse_dev_is_ready(struct vduse_dev *dev)
1110 {
1111 int i;
1112
1113 for (i = 0; i < dev->vq_num; i++)
1114 if (!dev->vqs[i]->num_max)
1115 return false;
1116
1117 return true;
1118 }
1119
/*
 * Deferred config-change interrupt: run the driver's config callback
 * (if registered) from workqueue context, under irq_lock.
 */
static void vduse_dev_irq_inject(struct work_struct *work)
{
	struct vduse_dev *dev = container_of(work, struct vduse_dev, inject);

	spin_lock_bh(&dev->irq_lock);
	if (dev->config_cb.callback)
		dev->config_cb.callback(dev->config_cb.private);
	spin_unlock_bh(&dev->irq_lock);
}
1129
/*
 * Deferred virtqueue interrupt: run the vq's callback from workqueue
 * context, but only while the vq is marked ready.
 */
static void vduse_vq_irq_inject(struct work_struct *work)
{
	struct vduse_virtqueue *vq = container_of(work,
					struct vduse_virtqueue, inject);

	spin_lock_bh(&vq->irq_lock);
	if (vq->ready && vq->cb.callback)
		vq->cb.callback(vq->cb.private);
	spin_unlock_bh(&vq->irq_lock);
}
1140
/*
 * Try to deliver a virtqueue interrupt directly through the irqfd bound
 * by the vDPA bus.  Returns true if the eventfd was signalled; false
 * means the caller must fall back to the workqueue injection path.
 */
static bool vduse_vq_signal_irqfd(struct vduse_virtqueue *vq)
{
	bool signal = false;

	/* Unlocked fast-path check; re-validated under the lock below */
	if (!vq->cb.trigger)
		return false;

	spin_lock_irq(&vq->irq_lock);
	/* Re-check: the trigger may have been torn down concurrently */
	if (vq->ready && vq->cb.trigger) {
		eventfd_signal(vq->cb.trigger);
		signal = true;
	}
	spin_unlock_irq(&vq->irq_lock);

	return signal;
}
1157
/*
 * Queue an interrupt-injection work item, either unbound or pinned to
 * @irq_effective_cpu.  Rejected with -EINVAL unless the driver has set
 * DRIVER_OK; dev->rwsem is held for read so a concurrent reset (writer)
 * can fence out new injections.
 */
static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
				    struct work_struct *irq_work,
				    int irq_effective_cpu)
{
	int ret = -EINVAL;

	down_read(&dev->rwsem);
	if (!(dev->status & VIRTIO_CONFIG_S_DRIVER_OK))
		goto unlock;

	ret = 0;
	if (irq_effective_cpu == IRQ_UNBOUND)
		queue_work(vduse_irq_wq, irq_work);
	else
		queue_work_on(irq_effective_cpu,
			      vduse_irq_bound_wq, irq_work);
unlock:
	up_read(&dev->rwsem);

	return ret;
}
1179
/*
 * Unregister the userspace bounce memory previously registered on @asid.
 * The requested (iova, size) must exactly match the registered region.
 * Returns 0 on success, -ENOENT if nothing is registered, -EINVAL on a
 * missing domain or a range mismatch.
 */
static int vduse_dev_dereg_umem(struct vduse_dev *dev, u32 asid,
				u64 iova, u64 size)
{
	int ret;

	mutex_lock(&dev->as[asid].mem_lock);
	ret = -ENOENT;
	if (!dev->as[asid].umem)
		goto unlock;

	ret = -EINVAL;
	if (!dev->as[asid].domain)
		goto unlock;

	if (dev->as[asid].umem->iova != iova ||
	    size != dev->as[asid].domain->bounce_size)
		goto unlock;

	/* Switch the domain back to kernel bounce pages, then unpin */
	vduse_domain_remove_user_bounce_pages(dev->as[asid].domain);
	unpin_user_pages_dirty_lock(dev->as[asid].umem->pages,
				    dev->as[asid].umem->npages, true);
	/* Undo the RLIMIT_MEMLOCK accounting done at registration time */
	atomic64_sub(dev->as[asid].umem->npages, &dev->as[asid].umem->mm->pinned_vm);
	mmdrop(dev->as[asid].umem->mm);
	vfree(dev->as[asid].umem->pages);
	kfree(dev->as[asid].umem);
	dev->as[asid].umem = NULL;
	ret = 0;
unlock:
	mutex_unlock(&dev->as[asid].mem_lock);
	return ret;
}
1211
/*
 * Register userspace memory to back the bounce buffer of address space
 * @asid.  The region must cover the whole bounce buffer, start at iova 0
 * and be page aligned.  Pages are long-term pinned (FOLL_LONGTERM) and
 * accounted against RLIMIT_MEMLOCK; the mm is grabbed so the accounting
 * can be reversed in vduse_dev_dereg_umem() even after process exit.
 * Returns 0 on success or a negative errno.
 *
 * Fix: repaired mis-encoded "&current" tokens (were corrupted to
 * "¤t") in the pinned_vm accounting expressions.
 */
static int vduse_dev_reg_umem(struct vduse_dev *dev,
			      u32 asid, u64 iova, u64 uaddr, u64 size)
{
	struct page **page_list = NULL;
	struct vduse_umem *umem = NULL;
	long pinned = 0;
	unsigned long npages, lock_limit;
	int ret;

	/* Only the full bounce region, mapped at iova 0, page aligned */
	if (!dev->as[asid].domain || !dev->as[asid].domain->bounce_map ||
	    size != dev->as[asid].domain->bounce_size ||
	    iova != 0 || uaddr & ~PAGE_MASK)
		return -EINVAL;

	mutex_lock(&dev->as[asid].mem_lock);
	ret = -EEXIST;
	if (dev->as[asid].umem)
		goto unlock;

	ret = -ENOMEM;
	npages = size >> PAGE_SHIFT;
	page_list = __vmalloc(array_size(npages, sizeof(struct page *)),
			      GFP_KERNEL_ACCOUNT);
	umem = kzalloc_obj(*umem);
	if (!page_list || !umem)
		goto unlock;

	mmap_read_lock(current->mm);

	/* Enforce RLIMIT_MEMLOCK before pinning anything */
	lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
	if (npages + atomic64_read(&current->mm->pinned_vm) > lock_limit)
		goto out;

	pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE,
				page_list);
	if (pinned != npages) {
		ret = pinned < 0 ? pinned : -ENOMEM;
		goto out;
	}

	ret = vduse_domain_add_user_bounce_pages(dev->as[asid].domain,
						 page_list, pinned);
	if (ret)
		goto out;

	atomic64_add(npages, &current->mm->pinned_vm);

	umem->pages = page_list;
	umem->npages = pinned;
	umem->iova = iova;
	umem->mm = current->mm;
	mmgrab(current->mm);

	dev->as[asid].umem = umem;
out:
	if (ret && pinned > 0)
		unpin_user_pages(page_list, pinned);

	mmap_read_unlock(current->mm);
unlock:
	if (ret) {
		vfree(page_list);
		kfree(umem);
	}
	mutex_unlock(&dev->as[asid].mem_lock);
	return ret;
}
1279
/*
 * Round-robin pick the next online CPU from the vq's irq affinity mask,
 * starting after the previously used CPU; when the scan runs off the end
 * of the mask it restarts from the beginning (via IRQ_UNBOUND == -1).
 * NOTE(review): if the mask intersects no online CPU this loops until
 * one appears; the sysfs store rejects such masks, but that check can
 * race with CPU hotplug — confirm this cannot spin indefinitely.
 */
static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)
{
	int curr_cpu = vq->irq_effective_cpu;

	while (true) {
		curr_cpu = cpumask_next(curr_cpu, &vq->irq_affinity);
		if (cpu_online(curr_cpu))
			break;

		if (curr_cpu >= nr_cpu_ids)
			curr_cpu = IRQ_UNBOUND;
	}

	vq->irq_effective_cpu = curr_cpu;
}
1295
/*
 * Look up the first IOTLB mapping intersecting [entry->start, entry->last]
 * in the address space selected by entry->asid.  On success the entry is
 * rewritten with the mapping's bounds and permission; if @f is non-NULL,
 * a reference to the backing file is returned through it (and
 * entry->offset is filled in); if @capability is non-NULL it reports
 * VDUSE_IOVA_CAP_UMEM when the mapping is exactly the bounce region.
 * Returns 0 on success, -EINVAL otherwise.
 */
static int vduse_dev_iotlb_entry(struct vduse_dev *dev,
				 struct vduse_iotlb_entry_v2 *entry,
				 struct file **f, uint64_t *capability)
{
	u32 asid;
	int r = -EINVAL;
	struct vhost_iotlb_map *map;

	if (entry->start > entry->last || entry->asid >= dev->nas)
		return -EINVAL;

	asid = array_index_nospec(entry->asid, dev->nas);
	mutex_lock(&dev->domain_lock);

	if (!dev->as[asid].domain)
		goto out;

	spin_lock(&dev->as[asid].domain->iotlb_lock);
	map = vhost_iotlb_itree_first(dev->as[asid].domain->iotlb,
				      entry->start, entry->last);
	if (map) {
		if (f) {
			const struct vdpa_map_file *map_file;

			map_file = (struct vdpa_map_file *)map->opaque;
			entry->offset = map_file->offset;
			*f = get_file(map_file->file);
		}
		entry->start = map->start;
		entry->last = map->last;
		entry->perm = map->perm;
		if (capability) {
			*capability = 0;

			/* The whole bounce region supports umem registration */
			if (dev->as[asid].domain->bounce_map && map->start == 0 &&
			    map->last == dev->as[asid].domain->bounce_size - 1)
				*capability |= VDUSE_IOVA_CAP_UMEM;
		}

		r = 0;
	}
	spin_unlock(&dev->as[asid].domain->iotlb_lock);

out:
	mutex_unlock(&dev->domain_lock);
	return r;
}
1343
/*
 * ioctl handler for a per-device VDUSE fd (/dev/vduse/<name>), used by
 * the userspace daemon that emulates the device.  Once the device has
 * been marked broken, every command is refused with -EPERM.
 */
static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
			unsigned long arg)
{
	struct vduse_dev *dev = file->private_data;
	void __user *argp = (void __user *)arg;
	int ret;

	if (unlikely(dev->broken))
		return -EPERM;

	switch (cmd) {
	case VDUSE_IOTLB_GET_FD:
	case VDUSE_IOTLB_GET_FD2: {
		/*
		 * Install an fd for the file backing the first IOTLB
		 * mapping intersecting the requested range.  The _FD2
		 * variant carries an asid and requires API version >= 1.
		 */
		struct vduse_iotlb_entry_v2 entry = {0};
		struct file *f = NULL;

		ret = -ENOIOCTLCMD;
		if (dev->api_version < VDUSE_API_VERSION_1 &&
		    cmd == VDUSE_IOTLB_GET_FD2)
			break;

		/* _IOC_SIZE(cmd) differs: the legacy struct is smaller */
		ret = -EFAULT;
		if (copy_from_user(&entry, argp, _IOC_SIZE(cmd)))
			break;

		ret = -EINVAL;
		if (!is_mem_zero((const char *)entry.reserved,
				 sizeof(entry.reserved)))
			break;

		ret = vduse_dev_iotlb_entry(dev, &entry, &f, NULL);
		if (ret)
			break;

		ret = -EINVAL;
		if (!f)
			break;

		ret = copy_to_user(argp, &entry, _IOC_SIZE(cmd));
		if (ret) {
			ret = -EFAULT;
			fput(f);
			break;
		}
		/* New fd's access mode is limited to the map's permission */
		ret = receive_fd(f, NULL, perm_to_file_flags(entry.perm));
		fput(f);
		break;
	}
	case VDUSE_DEV_GET_FEATURES:
		/*
		 * Just mirror what driver wrote here.
		 * The driver is expected to check FEATURE_OK later.
		 */
		ret = put_user(dev->driver_features, (u64 __user *)argp);
		break;
	case VDUSE_DEV_SET_CONFIG: {
		/* Write part of the config space shown to the driver */
		struct vduse_config_data config;
		unsigned long size = offsetof(struct vduse_config_data,
					      buffer);

		ret = -EFAULT;
		if (copy_from_user(&config, argp, size))
			break;

		/* Bounds check offset/length against the config size */
		ret = -EINVAL;
		if (config.offset > dev->config_size ||
		    config.length == 0 ||
		    config.length > dev->config_size - config.offset)
			break;

		ret = -EFAULT;
		if (copy_from_user(dev->config + config.offset, argp + size,
				   config.length))
			break;

		ret = 0;
		break;
	}
	case VDUSE_DEV_INJECT_CONFIG_IRQ:
		/* Raise a config-change interrupt towards the driver */
		ret = vduse_dev_queue_irq_work(dev, &dev->inject, IRQ_UNBOUND);
		break;
	case VDUSE_VQ_SETUP: {
		/* Set a vq's maximum size and (API v1+) its vq group */
		struct vduse_vq_config config;
		u32 index;

		ret = -EFAULT;
		if (copy_from_user(&config, argp, sizeof(config)))
			break;

		ret = -EINVAL;
		if (config.index >= dev->vq_num)
			break;

		if (dev->api_version < VDUSE_API_VERSION_1) {
			/* Pre-v1 devices only have group 0 */
			if (config.group)
				break;
		} else {
			if (config.group >= dev->ngroups)
				break;
			/* Group assignment is frozen once the driver runs */
			if (dev->status & VIRTIO_CONFIG_S_DRIVER_OK)
				break;
		}

		if (config.reserved1 ||
		    !is_mem_zero((const char *)config.reserved2,
				 sizeof(config.reserved2)))
			break;

		index = array_index_nospec(config.index, dev->vq_num);
		dev->vqs[index]->num_max = config.max_size;
		dev->vqs[index]->group = config.group;
		ret = 0;
		break;
	}
	case VDUSE_VQ_GET_INFO: {
		/* Report a vq's addresses, size, ring state and readiness */
		struct vduse_vq_info vq_info;
		struct vduse_virtqueue *vq;
		u32 index;

		ret = -EFAULT;
		if (copy_from_user(&vq_info, argp, sizeof(vq_info)))
			break;

		ret = -EINVAL;
		if (vq_info.index >= dev->vq_num)
			break;

		index = array_index_nospec(vq_info.index, dev->vq_num);
		vq = dev->vqs[index];
		vq_info.desc_addr = vq->desc_addr;
		vq_info.driver_addr = vq->driver_addr;
		vq_info.device_addr = vq->device_addr;
		vq_info.num = vq->num;

		/* Ring state layout depends on the negotiated ring format */
		if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
			vq_info.packed.last_avail_counter =
				vq->state.packed.last_avail_counter;
			vq_info.packed.last_avail_idx =
				vq->state.packed.last_avail_idx;
			vq_info.packed.last_used_counter =
				vq->state.packed.last_used_counter;
			vq_info.packed.last_used_idx =
				vq->state.packed.last_used_idx;
		} else
			vq_info.split.avail_index =
				vq->state.split.avail_index;

		vq_info.ready = vq->ready;

		ret = -EFAULT;
		if (copy_to_user(argp, &vq_info, sizeof(vq_info)))
			break;

		ret = 0;
		break;
	}
	case VDUSE_VQ_SETUP_KICKFD: {
		/* Attach/detach the eventfd used to deliver vq kicks */
		struct vduse_vq_eventfd eventfd;

		ret = -EFAULT;
		if (copy_from_user(&eventfd, argp, sizeof(eventfd)))
			break;

		ret = vduse_kickfd_setup(dev, &eventfd);
		break;
	}
	case VDUSE_VQ_INJECT_IRQ: {
		/* Raise a vq interrupt: irqfd fast path, workqueue fallback */
		u32 index;

		ret = -EFAULT;
		if (get_user(index, (u32 __user *)argp))
			break;

		ret = -EINVAL;
		if (index >= dev->vq_num)
			break;

		ret = 0;
		index = array_index_nospec(index, dev->vq_num);
		if (!vduse_vq_signal_irqfd(dev->vqs[index])) {
			vduse_vq_update_effective_cpu(dev->vqs[index]);
			ret = vduse_dev_queue_irq_work(dev,
						&dev->vqs[index]->inject,
						dev->vqs[index]->irq_effective_cpu);
		}
		break;
	}
	case VDUSE_IOTLB_REG_UMEM: {
		/* Register userspace memory backing an asid's bounce buffer */
		struct vduse_iova_umem umem;
		u32 asid;

		ret = -EFAULT;
		if (copy_from_user(&umem, argp, sizeof(umem)))
			break;

		ret = -EINVAL;
		if (!is_mem_zero((const char *)umem.reserved,
				 sizeof(umem.reserved)) ||
		    (dev->api_version < VDUSE_API_VERSION_1 &&
		     umem.asid != 0) || umem.asid >= dev->nas)
			break;

		mutex_lock(&dev->domain_lock);
		asid = array_index_nospec(umem.asid, dev->nas);
		ret = vduse_dev_reg_umem(dev, asid, umem.iova,
					 umem.uaddr, umem.size);
		mutex_unlock(&dev->domain_lock);
		break;
	}
	case VDUSE_IOTLB_DEREG_UMEM: {
		/* Undo a previous VDUSE_IOTLB_REG_UMEM */
		struct vduse_iova_umem umem;
		u32 asid;

		ret = -EFAULT;
		if (copy_from_user(&umem, argp, sizeof(umem)))
			break;

		ret = -EINVAL;
		if (!is_mem_zero((const char *)umem.reserved,
				 sizeof(umem.reserved)) ||
		    (dev->api_version < VDUSE_API_VERSION_1 &&
		     umem.asid != 0) ||
		    umem.asid >= dev->nas)
			break;

		mutex_lock(&dev->domain_lock);
		asid = array_index_nospec(umem.asid, dev->nas);
		ret = vduse_dev_dereg_umem(dev, asid, umem.iova,
					   umem.size);
		mutex_unlock(&dev->domain_lock);
		break;
	}
	case VDUSE_IOTLB_GET_INFO: {
		/* Like IOTLB_GET_FD but reports bounds/capability, no fd */
		struct vduse_iova_info info;
		struct vduse_iotlb_entry_v2 entry;

		ret = -EFAULT;
		if (copy_from_user(&info, argp, sizeof(info)))
			break;

		/*
		 * NOTE(review): the validation failures below leave ret at
		 * -EFAULT rather than -EINVAL — confirm this is intended.
		 */
		if (!is_mem_zero((const char *)info.reserved,
				 sizeof(info.reserved)))
			break;

		if (dev->api_version < VDUSE_API_VERSION_1) {
			if (info.asid)
				break;
		} else if (info.asid >= dev->nas)
			break;

		entry.start = info.start;
		entry.last = info.last;
		entry.asid = info.asid;
		ret = vduse_dev_iotlb_entry(dev, &entry, NULL,
					    &info.capability);
		if (ret < 0)
			break;

		info.start = entry.start;
		info.last = entry.last;
		info.asid = entry.asid;

		ret = -EFAULT;
		if (copy_to_user(argp, &info, sizeof(info)))
			break;

		ret = 0;
		break;
	}
	default:
		ret = -ENOIOCTLCMD;
		break;
	}

	return ret;
}
1620
/*
 * The userspace daemon closed its device fd: drop any registered umems
 * and requeue in-flight messages so they are re-delivered to the next
 * daemon that connects.
 */
static int vduse_dev_release(struct inode *inode, struct file *file)
{
	struct vduse_dev *dev = file->private_data;

	mutex_lock(&dev->domain_lock);
	for (int i = 0; i < dev->nas; i++)
		if (dev->as[i].domain)
			vduse_dev_dereg_umem(dev, i, 0,
					     dev->as[i].domain->bounce_size);
	mutex_unlock(&dev->domain_lock);
	spin_lock(&dev->msg_lock);
	/* Make sure the inflight messages can be processed after reconnection */
	list_splice_init(&dev->recv_list, &dev->send_list);
	spin_unlock(&dev->msg_lock);
	dev->connected = false;

	return 0;
}
1639
vduse_dev_get_from_minor(int minor)1640 static struct vduse_dev *vduse_dev_get_from_minor(int minor)
1641 {
1642 struct vduse_dev *dev;
1643
1644 mutex_lock(&vduse_lock);
1645 dev = idr_find(&vduse_idr, minor);
1646 mutex_unlock(&vduse_lock);
1647
1648 return dev;
1649 }
1650
vduse_dev_open(struct inode * inode,struct file * file)1651 static int vduse_dev_open(struct inode *inode, struct file *file)
1652 {
1653 int ret;
1654 struct vduse_dev *dev = vduse_dev_get_from_minor(iminor(inode));
1655
1656 if (!dev)
1657 return -ENODEV;
1658
1659 ret = -EBUSY;
1660 mutex_lock(&dev->lock);
1661 if (dev->connected)
1662 goto unlock;
1663
1664 ret = 0;
1665 dev->connected = true;
1666 file->private_data = dev;
1667 unlock:
1668 mutex_unlock(&dev->lock);
1669
1670 return ret;
1671 }
1672
/* File operations for the per-device char device (/dev/vduse/<name>) */
static const struct file_operations vduse_dev_fops = {
	.owner		= THIS_MODULE,
	.open		= vduse_dev_open,
	.release	= vduse_dev_release,
	.read_iter	= vduse_dev_read_iter,
	.write_iter	= vduse_dev_write_iter,
	.poll		= vduse_dev_poll,
	.unlocked_ioctl	= vduse_dev_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};
1684
irq_cb_affinity_show(struct vduse_virtqueue * vq,char * buf)1685 static ssize_t irq_cb_affinity_show(struct vduse_virtqueue *vq, char *buf)
1686 {
1687 return sprintf(buf, "%*pb\n", cpumask_pr_args(&vq->irq_affinity));
1688 }
1689
/*
 * sysfs: parse a cpumask and install it as the vq's irq callback
 * affinity.  Masks with no online CPU are rejected with -EINVAL.
 * Returns the number of bytes consumed or a negative errno.
 */
static ssize_t irq_cb_affinity_store(struct vduse_virtqueue *vq,
				     const char *buf, size_t count)
{
	cpumask_var_t new_value;
	int ret;

	if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
		return -ENOMEM;

	ret = cpumask_parse(buf, new_value);
	if (ret)
		goto free_mask;

	/* Require at least one online CPU in the new mask */
	ret = -EINVAL;
	if (!cpumask_intersects(new_value, cpu_online_mask))
		goto free_mask;

	cpumask_copy(&vq->irq_affinity, new_value);
	ret = count;
free_mask:
	free_cpumask_var(new_value);
	return ret;
}
1713
/* Per-virtqueue sysfs attribute with typed show/store callbacks */
struct vq_sysfs_entry {
	struct attribute attr;
	ssize_t (*show)(struct vduse_virtqueue *vq, char *buf);
	ssize_t (*store)(struct vduse_virtqueue *vq, const char *buf,
			 size_t count);
};

static struct vq_sysfs_entry irq_cb_affinity_attr = __ATTR_RW(irq_cb_affinity);

/* Attributes published under /sys/.../<device>/vqN/ */
static struct attribute *vq_attrs[] = {
	&irq_cb_affinity_attr.attr,
	NULL,
};
ATTRIBUTE_GROUPS(vq);
1728
vq_attr_show(struct kobject * kobj,struct attribute * attr,char * buf)1729 static ssize_t vq_attr_show(struct kobject *kobj, struct attribute *attr,
1730 char *buf)
1731 {
1732 struct vduse_virtqueue *vq = container_of(kobj,
1733 struct vduse_virtqueue, kobj);
1734 struct vq_sysfs_entry *entry = container_of(attr,
1735 struct vq_sysfs_entry, attr);
1736
1737 if (!entry->show)
1738 return -EIO;
1739
1740 return entry->show(vq, buf);
1741 }
1742
vq_attr_store(struct kobject * kobj,struct attribute * attr,const char * buf,size_t count)1743 static ssize_t vq_attr_store(struct kobject *kobj, struct attribute *attr,
1744 const char *buf, size_t count)
1745 {
1746 struct vduse_virtqueue *vq = container_of(kobj,
1747 struct vduse_virtqueue, kobj);
1748 struct vq_sysfs_entry *entry = container_of(attr,
1749 struct vq_sysfs_entry, attr);
1750
1751 if (!entry->store)
1752 return -EIO;
1753
1754 return entry->store(vq, buf, count);
1755 }
1756
/* show/store dispatchers for vq kobject attributes */
static const struct sysfs_ops vq_sysfs_ops = {
	.show = vq_attr_show,
	.store = vq_attr_store,
};
1761
/* kobject release: frees the virtqueue when its last reference drops */
static void vq_release(struct kobject *kobj)
{
	struct vduse_virtqueue *vq = container_of(kobj,
					struct vduse_virtqueue, kobj);
	kfree(vq);
}
1768
/* kobject type for per-virtqueue sysfs directories ("vqN") */
static const struct kobj_type vq_type = {
	.release	= vq_release,
	.sysfs_ops	= &vq_sysfs_ops,
	.default_groups	= vq_groups,
};
1774
/* Place VDUSE device nodes under /dev/vduse/ */
static char *vduse_devnode(const struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vduse/%s", dev_name(dev));
}
1779
/* Device class backing /sys/class/vduse and the /dev/vduse/ namespace */
static const struct class vduse_class = {
	.name = "vduse",
	.devnode = vduse_devnode,
};
1784
vduse_dev_deinit_vqs(struct vduse_dev * dev)1785 static void vduse_dev_deinit_vqs(struct vduse_dev *dev)
1786 {
1787 int i;
1788
1789 if (!dev->vqs)
1790 return;
1791
1792 for (i = 0; i < dev->vq_num; i++)
1793 kobject_put(&dev->vqs[i]->kobj);
1794 kfree(dev->vqs);
1795 }
1796
/*
 * Allocate and register @vq_num virtqueues, each exposed as a kobject
 * ("vqN") under the vduse device so per-vq sysfs attributes appear.
 * On failure, previously initialized vqs are dropped via kobject_put()
 * (freed through vq_release()).  Returns 0 or a negative errno.
 */
static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
{
	int ret, i;

	dev->vq_align = vq_align;
	dev->vq_num = vq_num;
	dev->vqs = kzalloc_objs(*dev->vqs, dev->vq_num);
	if (!dev->vqs)
		return -ENOMEM;

	for (i = 0; i < vq_num; i++) {
		dev->vqs[i] = kzalloc_obj(*dev->vqs[i]);
		if (!dev->vqs[i]) {
			ret = -ENOMEM;
			goto err;
		}

		dev->vqs[i]->index = i;
		dev->vqs[i]->irq_effective_cpu = IRQ_UNBOUND;
		INIT_WORK(&dev->vqs[i]->inject, vduse_vq_irq_inject);
		INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work);
		spin_lock_init(&dev->vqs[i]->kick_lock);
		spin_lock_init(&dev->vqs[i]->irq_lock);
		/* Allow irq callbacks on any CPU until userspace narrows it */
		cpumask_setall(&dev->vqs[i]->irq_affinity);

		kobject_init(&dev->vqs[i]->kobj, &vq_type);
		ret = kobject_add(&dev->vqs[i]->kobj,
				  &dev->dev->kobj, "vq%d", i);
		if (ret) {
			/* This vq was never added to sysfs; free it directly */
			kfree(dev->vqs[i]);
			goto err;
		}
	}

	return 0;
err:
	/* Unwind only the vqs that were fully registered (indices < i) */
	while (i--)
		kobject_put(&dev->vqs[i]->kobj);
	kfree(dev->vqs);
	dev->vqs = NULL;
	return ret;
}
1839
/*
 * Allocate a vduse_dev and initialize its locks, message queues and
 * config-irq work item.  Returns NULL on allocation failure; the caller
 * fills in the remaining fields and registers the device.
 */
static struct vduse_dev *vduse_dev_create(void)
{
	struct vduse_dev *dev = kzalloc_obj(*dev);

	if (!dev)
		return NULL;

	mutex_init(&dev->lock);
	mutex_init(&dev->domain_lock);
	spin_lock_init(&dev->msg_lock);
	INIT_LIST_HEAD(&dev->send_list);
	INIT_LIST_HEAD(&dev->recv_list);
	spin_lock_init(&dev->irq_lock);
	init_rwsem(&dev->rwsem);

	INIT_WORK(&dev->inject, vduse_dev_irq_inject);
	init_waitqueue_head(&dev->waitq);

	return dev;
}
1860
/* Counterpart of vduse_dev_create(): free the device structure itself */
static void vduse_dev_destroy(struct vduse_dev *dev)
{
	kfree(dev);
}
1865
vduse_find_dev(const char * name)1866 static struct vduse_dev *vduse_find_dev(const char *name)
1867 {
1868 struct vduse_dev *dev;
1869 int id;
1870
1871 idr_for_each_entry(&vduse_idr, dev, id)
1872 if (!strcmp(dev->name, name))
1873 return dev;
1874
1875 return NULL;
1876 }
1877
/*
 * Tear down the VDUSE instance named @name.  Fails with -EBUSY while a
 * vDPA device is attached or a userspace daemon is connected.  Called
 * with vduse_lock held (from vduse_ioctl()).
 */
static int vduse_destroy_dev(char *name)
{
	struct vduse_dev *dev = vduse_find_dev(name);

	if (!dev)
		return -EINVAL;

	mutex_lock(&dev->lock);
	if (dev->vdev || dev->connected) {
		mutex_unlock(&dev->lock);
		return -EBUSY;
	}
	/* Pretend to be connected so no daemon can open us while dying */
	dev->connected = true;
	mutex_unlock(&dev->lock);

	vduse_dev_reset(dev);
	device_destroy(&vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
	idr_remove(&vduse_idr, dev->minor);
	kvfree(dev->config);
	vduse_dev_deinit_vqs(dev);
	for (int i = 0; i < dev->nas; i++) {
		if (dev->as[i].domain)
			vduse_domain_destroy(dev->as[i].domain);
	}
	kfree(dev->as);
	kfree(dev->name);
	kfree(dev->groups);
	vduse_dev_destroy(dev);
	module_put(THIS_MODULE);

	return 0;
}
1910
device_is_allowed(u32 device_id)1911 static bool device_is_allowed(u32 device_id)
1912 {
1913 int i;
1914
1915 for (i = 0; i < ARRAY_SIZE(allowed_device_id); i++)
1916 if (allowed_device_id[i] == device_id)
1917 return true;
1918
1919 return false;
1920 }
1921
features_is_valid(struct vduse_dev_config * config)1922 static bool features_is_valid(struct vduse_dev_config *config)
1923 {
1924 if (!(config->features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
1925 return false;
1926
1927 /* Now we only support read-only configuration space */
1928 if ((config->device_id == VIRTIO_ID_BLOCK) &&
1929 (config->features & BIT_ULL(VIRTIO_BLK_F_CONFIG_WCE)))
1930 return false;
1931 else if ((config->device_id == VIRTIO_ID_NET) &&
1932 (config->features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
1933 return false;
1934
1935 if ((config->device_id == VIRTIO_ID_NET) &&
1936 !(config->features & BIT_ULL(VIRTIO_F_VERSION_1)))
1937 return false;
1938
1939 return true;
1940 }
1941
/*
 * Sanity-check a userspace-supplied device description before creation:
 * reserved bytes must be zero, group/address-space counts are forbidden
 * before API v1 and mandatory (1..MAX) from v1 on, sizes are bounded,
 * and the device type must be allow-listed with acceptable features.
 */
static bool vduse_validate_config(struct vduse_dev_config *config,
				  u64 api_version)
{
	if (!is_mem_zero((const char *)config->reserved,
			 sizeof(config->reserved)))
		return false;

	if (api_version < VDUSE_API_VERSION_1 &&
	    (config->ngroups || config->nas))
		return false;

	if (api_version >= VDUSE_API_VERSION_1) {
		if (!config->ngroups || config->ngroups > VDUSE_DEV_MAX_GROUPS)
			return false;

		if (!config->nas || config->nas > VDUSE_DEV_MAX_AS)
			return false;
	}

	if (config->vq_align > PAGE_SIZE)
		return false;

	if (config->config_size > PAGE_SIZE)
		return false;

	if (config->vq_num > 0xffff)
		return false;

	if (!config->name[0])
		return false;

	if (!device_is_allowed(config->device_id))
		return false;

	if (!features_is_valid(config))
		return false;

	return true;
}
1981
/* sysfs: show the timeout applied to userspace message replies */
static ssize_t msg_timeout_show(struct device *device,
				struct device_attribute *attr, char *buf)
{
	struct vduse_dev *dev = dev_get_drvdata(device);

	return sysfs_emit(buf, "%u\n", dev->msg_timeout);
}
1989
/*
 * sysfs: set the timeout applied to userspace message replies.
 * NOTE(review): the value is written straight into dev->msg_timeout
 * without range checking or locking against concurrent readers —
 * confirm that is acceptable.
 */
static ssize_t msg_timeout_store(struct device *device,
				 struct device_attribute *attr,
				 const char *buf, size_t count)
{
	struct vduse_dev *dev = dev_get_drvdata(device);
	int ret;

	ret = kstrtouint(buf, 10, &dev->msg_timeout);
	if (ret < 0)
		return ret;

	return count;
}

static DEVICE_ATTR_RW(msg_timeout);
2005
/* sysfs: show the configured bounce buffer size in bytes */
static ssize_t bounce_size_show(struct device *device,
				struct device_attribute *attr, char *buf)
{
	struct vduse_dev *dev = dev_get_drvdata(device);

	return sysfs_emit(buf, "%u\n", dev->bounce_size);
}
2013
/*
 * sysfs: set the bounce buffer size used when the IOVA domains are
 * created.  Only allowed before any domain exists (-EPERM afterwards);
 * the value must lie in [VDUSE_MIN_BOUNCE_SIZE, VDUSE_MAX_BOUNCE_SIZE]
 * and is rounded down to a page boundary.
 */
static ssize_t bounce_size_store(struct device *device,
				 struct device_attribute *attr,
				 const char *buf, size_t count)
{
	struct vduse_dev *dev = dev_get_drvdata(device);
	unsigned int bounce_size;
	int ret;

	ret = -EPERM;
	mutex_lock(&dev->domain_lock);
	/* Assuming that if the first domain is allocated, all are allocated */
	if (dev->as[0].domain)
		goto unlock;

	ret = kstrtouint(buf, 10, &bounce_size);
	if (ret < 0)
		goto unlock;

	ret = -EINVAL;
	if (bounce_size > VDUSE_MAX_BOUNCE_SIZE ||
	    bounce_size < VDUSE_MIN_BOUNCE_SIZE)
		goto unlock;

	dev->bounce_size = bounce_size & PAGE_MASK;
	ret = count;
unlock:
	mutex_unlock(&dev->domain_lock);
	return ret;
}

static DEVICE_ATTR_RW(bounce_size);
2045
/* Device-level sysfs attributes: msg_timeout and bounce_size */
static struct attribute *vduse_dev_attrs[] = {
	&dev_attr_msg_timeout.attr,
	&dev_attr_bounce_size.attr,
	NULL
};

ATTRIBUTE_GROUPS(vduse_dev);
2053
/*
 * Create a VDUSE instance from an already-validated config and publish
 * its char device (/dev/vduse/<name>).  On success the device takes
 * ownership of @config_buf (the config-space contents); on failure
 * @config_buf is NOT freed — the caller does that.  Creating a
 * virtio-net device requires CAP_NET_ADMIN.  Called with vduse_lock
 * held.  Returns 0 or a negative errno.
 */
static int vduse_create_dev(struct vduse_dev_config *config,
			    void *config_buf, u64 api_version)
{
	int ret;
	struct vduse_dev *dev;

	ret = -EPERM;
	if ((config->device_id == VIRTIO_ID_NET) && !capable(CAP_NET_ADMIN))
		goto err;

	ret = -EEXIST;
	if (vduse_find_dev(config->name))
		goto err;

	ret = -ENOMEM;
	dev = vduse_dev_create();
	if (!dev)
		goto err;

	dev->api_version = api_version;
	dev->device_features = config->features;
	dev->device_id = config->device_id;
	dev->vendor_id = config->vendor_id;

	/* Pre-v1 devices have exactly one address space and one vq group */
	dev->nas = (dev->api_version < VDUSE_API_VERSION_1) ? 1 : config->nas;
	dev->as = kzalloc_objs(dev->as[0], dev->nas);
	if (!dev->as)
		goto err_as;
	for (int i = 0; i < dev->nas; i++)
		mutex_init(&dev->as[i].mem_lock);

	dev->ngroups = (dev->api_version < VDUSE_API_VERSION_1)
		       ? 1
		       : config->ngroups;
	dev->groups = kzalloc_objs(dev->groups[0], dev->ngroups);
	if (!dev->groups)
		goto err_vq_groups;
	/* Every group starts out bound to address space 0 */
	for (u32 i = 0; i < dev->ngroups; ++i) {
		dev->groups[i].dev = dev;
		rwlock_init(&dev->groups[i].as_lock);
		dev->groups[i].as = &dev->as[0];
	}

	dev->name = kstrdup(config->name, GFP_KERNEL);
	if (!dev->name)
		goto err_str;

	dev->bounce_size = VDUSE_BOUNCE_SIZE;
	dev->config = config_buf;
	dev->config_size = config->config_size;

	ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL);
	if (ret < 0)
		goto err_idr;

	dev->minor = ret;
	dev->msg_timeout = VDUSE_MSG_DEFAULT_TIMEOUT;
	dev->dev = device_create_with_groups(&vduse_class, NULL,
				MKDEV(MAJOR(vduse_major), dev->minor),
				dev, vduse_dev_groups, "%s", config->name);
	if (IS_ERR(dev->dev)) {
		ret = PTR_ERR(dev->dev);
		goto err_dev;
	}

	ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num);
	if (ret)
		goto err_vqs;

	/* Keep the module pinned for as long as the device exists */
	__module_get(THIS_MODULE);

	return 0;
err_vqs:
	device_destroy(&vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
err_dev:
	idr_remove(&vduse_idr, dev->minor);
err_idr:
	kfree(dev->name);
err_str:
	kfree(dev->groups);
err_vq_groups:
	kfree(dev->as);
err_as:
	vduse_dev_destroy(dev);
err:
	return ret;
}
2141
/*
 * ioctl handler for the control device (/dev/vduse/control): negotiate
 * the per-fd API version and create/destroy VDUSE instances.  Every
 * command runs under the global vduse_lock.
 */
static long vduse_ioctl(struct file *file, unsigned int cmd,
			unsigned long arg)
{
	int ret;
	void __user *argp = (void __user *)arg;
	struct vduse_control *control = file->private_data;

	mutex_lock(&vduse_lock);
	switch (cmd) {
	case VDUSE_GET_API_VERSION:
		/* First query pins the fd to the newest supported version */
		if (control->api_version == VDUSE_API_VERSION_NOT_ASKED)
			control->api_version = VDUSE_API_VERSION_1;
		ret = put_user(control->api_version, (u64 __user *)argp);
		break;
	case VDUSE_SET_API_VERSION: {
		u64 api_version;

		ret = -EFAULT;
		if (get_user(api_version, (u64 __user *)argp))
			break;

		ret = -EINVAL;
		if (api_version > VDUSE_API_VERSION_1)
			break;

		ret = 0;
		control->api_version = api_version;
		break;
	}
	case VDUSE_CREATE_DEV: {
		struct vduse_dev_config config;
		unsigned long size = offsetof(struct vduse_dev_config, config);
		void *buf;

		ret = -EFAULT;
		if (copy_from_user(&config, argp, size))
			break;

		ret = -EINVAL;
		/*
		 * NOTE(review): creating without a prior GET/SET defaults
		 * to VDUSE_API_VERSION here, while GET defaults to
		 * VDUSE_API_VERSION_1 above — confirm both are intended.
		 */
		if (control->api_version == VDUSE_API_VERSION_NOT_ASKED)
			control->api_version = VDUSE_API_VERSION;
		if (!vduse_validate_config(&config, control->api_version))
			break;

		buf = vmemdup_user(argp + size, config.config_size);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			break;
		}
		config.name[VDUSE_NAME_MAX - 1] = '\0';
		ret = vduse_create_dev(&config, buf, control->api_version);
		/* On success the device owns buf; free only on failure */
		if (ret)
			kvfree(buf);
		break;
	}
	case VDUSE_DESTROY_DEV: {
		char name[VDUSE_NAME_MAX];

		ret = -EFAULT;
		if (copy_from_user(name, argp, VDUSE_NAME_MAX))
			break;

		name[VDUSE_NAME_MAX - 1] = '\0';
		ret = vduse_destroy_dev(name);
		break;
	}
	default:
		ret = -EINVAL;
		break;
	}
	mutex_unlock(&vduse_lock);

	return ret;
}
2216
vduse_release(struct inode * inode,struct file * file)2217 static int vduse_release(struct inode *inode, struct file *file)
2218 {
2219 struct vduse_control *control = file->private_data;
2220
2221 kfree(control);
2222 return 0;
2223 }
2224
/* Open of /dev/vduse/control: allocate per-fd state (API version) */
static int vduse_open(struct inode *inode, struct file *file)
{
	struct vduse_control *control;

	control = kmalloc_obj(struct vduse_control);
	if (!control)
		return -ENOMEM;

	/* The version is negotiated lazily on first GET/SET/CREATE */
	control->api_version = VDUSE_API_VERSION_NOT_ASKED;
	file->private_data = control;

	return 0;
}
2238
/* File operations for the /dev/vduse/control character device. */
static const struct file_operations vduse_ctrl_fops = {
	.owner		= THIS_MODULE,
	.open		= vduse_open,
	.release	= vduse_release,
	.unlocked_ioctl	= vduse_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};
2247
/*
 * The single vdpa management device exposed by VDUSE; the embedded
 * struct device owns the lifetime (freed in vduse_mgmtdev_release()).
 */
struct vduse_mgmt_dev {
	struct vdpa_mgmt_dev mgmt_dev;
	struct device dev;
};

/* Module-wide singleton, created in vduse_mgmtdev_init(). */
static struct vduse_mgmt_dev *vduse_mgmt;
2254
vduse_dev_init_vdpa(struct vduse_dev * dev,const char * name)2255 static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name)
2256 {
2257 struct vduse_vdpa *vdev;
2258
2259 if (dev->vdev)
2260 return -EEXIST;
2261
2262 vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev,
2263 &vduse_vdpa_config_ops, &vduse_map_ops,
2264 dev->ngroups, dev->nas, name, true);
2265 if (IS_ERR(vdev))
2266 return PTR_ERR(vdev);
2267
2268 dev->vdev = vdev;
2269 vdev->dev = dev;
2270 vdev->vdpa.mdev = &vduse_mgmt->mgmt_dev;
2271
2272 return 0;
2273 }
2274
/*
 * Management-device .dev_add op: bind a vDPA device to an existing,
 * fully configured VDUSE instance identified by @name, create one IOVA
 * domain per address space, and register the vDPA device.
 *
 * Returns 0 on success; -EINVAL if no ready instance matches @name,
 * -ENOMEM if a domain allocation fails, or the error from
 * vduse_dev_init_vdpa()/_vdpa_register_device().
 */
static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
			const struct vdpa_dev_set_config *config)
{
	struct vduse_dev *dev;
	size_t domain_bounce_size;
	int ret, i;

	/*
	 * Look up the instance under vduse_lock; it must have completed
	 * userspace setup (vduse_dev_is_ready) before it can be bound.
	 */
	mutex_lock(&vduse_lock);
	dev = vduse_find_dev(name);
	if (!dev || !vduse_dev_is_ready(dev)) {
		mutex_unlock(&vduse_lock);
		return -EINVAL;
	}
	ret = vduse_dev_init_vdpa(dev, name);
	mutex_unlock(&vduse_lock);
	if (ret)
		return ret;

	mutex_lock(&dev->domain_lock);
	ret = 0;

	/* The configured bounce buffer is split evenly across address spaces. */
	domain_bounce_size = dev->bounce_size / dev->nas;
	for (i = 0; i < dev->nas; ++i) {
		dev->as[i].domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
							domain_bounce_size);
		if (!dev->as[i].domain) {
			ret = -ENOMEM;
			goto err;
		}
	}

	mutex_unlock(&dev->domain_lock);

	ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num);
	if (ret)
		goto err_register;

	return 0;

err_register:
	/* Re-take domain_lock so the shared teardown below runs locked. */
	mutex_lock(&dev->domain_lock);

err:
	/*
	 * Destroy the domains created so far: indices [0, i) on the
	 * allocation-failure path, all of them (i == dev->nas) when
	 * registration failed.
	 */
	for (int j = 0; j < i; j++) {
		if (dev->as[j].domain) {
			vduse_domain_destroy(dev->as[j].domain);
			dev->as[j].domain = NULL;
		}
	}
	mutex_unlock(&dev->domain_lock);

	/* Drop the reference taken by vdpa_alloc_device(); frees dev->vdev. */
	put_device(&dev->vdev->vdpa.dev);

	return ret;
}
2330
/* Management-device .dev_del op: unregister the bound vDPA device. */
static void vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
{
	_vdpa_unregister_device(dev);
}
2335
/* Operations exposed to the vdpa management (netlink) interface. */
static const struct vdpa_mgmtdev_ops vdpa_dev_mgmtdev_ops = {
	.dev_add = vdpa_dev_add,
	.dev_del = vdpa_dev_del,
};
2340
/* Virtio device types VDUSE can emulate: block and net. */
static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};
2346
vduse_mgmtdev_release(struct device * dev)2347 static void vduse_mgmtdev_release(struct device *dev)
2348 {
2349 struct vduse_mgmt_dev *mgmt_dev;
2350
2351 mgmt_dev = container_of(dev, struct vduse_mgmt_dev, dev);
2352 kfree(mgmt_dev);
2353 }
2354
/*
 * Allocate and register the VDUSE vdpa management device.
 *
 * Error handling follows the driver-model rules: before device_register()
 * the object may be kfree()d directly; once device_register() has been
 * called (even if it failed), the reference must be dropped with
 * put_device() so vduse_mgmtdev_release() performs the free.
 */
static int vduse_mgmtdev_init(void)
{
	int ret;

	vduse_mgmt = kzalloc_obj(*vduse_mgmt);
	if (!vduse_mgmt)
		return -ENOMEM;

	ret = dev_set_name(&vduse_mgmt->dev, "vduse");
	if (ret) {
		/* Not registered yet: plain kfree() is still legal here. */
		kfree(vduse_mgmt);
		return ret;
	}

	vduse_mgmt->dev.release = vduse_mgmtdev_release;

	ret = device_register(&vduse_mgmt->dev);
	if (ret)
		goto dev_reg_err;

	vduse_mgmt->mgmt_dev.id_table = id_table;
	vduse_mgmt->mgmt_dev.ops = &vdpa_dev_mgmtdev_ops;
	vduse_mgmt->mgmt_dev.device = &vduse_mgmt->dev;
	ret = vdpa_mgmtdev_register(&vduse_mgmt->mgmt_dev);
	if (ret)
		device_unregister(&vduse_mgmt->dev);

	return ret;

dev_reg_err:
	/* device_register() failed: put_device() frees via ->release. */
	put_device(&vduse_mgmt->dev);
	return ret;
}
2388
/* Tear down the management device; undoes vduse_mgmtdev_init(). */
static void vduse_mgmtdev_exit(void)
{
	vdpa_mgmtdev_unregister(&vduse_mgmt->mgmt_dev);
	device_unregister(&vduse_mgmt->dev);
}
2394
/*
 * Module init: set up the vduse class, char devices, workqueues, the
 * IOVA domain infrastructure and the vdpa management device.
 *
 * Minor 0 of the allocated char region is /dev/vduse/control; minors
 * 1..VDUSE_DEV_MAX-1 are per-device nodes (/dev/vduse/$DEVICE).
 *
 * The error labels unwind in strict reverse order of setup; do not
 * reorder the steps without updating the ladder to match.
 */
static int vduse_init(void)
{
	int ret;
	struct device *dev;

	ret = class_register(&vduse_class);
	if (ret)
		return ret;

	ret = alloc_chrdev_region(&vduse_major, 0, VDUSE_DEV_MAX, "vduse");
	if (ret)
		goto err_chardev_region;

	/* /dev/vduse/control */
	cdev_init(&vduse_ctrl_cdev, &vduse_ctrl_fops);
	vduse_ctrl_cdev.owner = THIS_MODULE;
	ret = cdev_add(&vduse_ctrl_cdev, vduse_major, 1);
	if (ret)
		goto err_ctrl_cdev;

	dev = device_create(&vduse_class, NULL, vduse_major, NULL, "control");
	if (IS_ERR(dev)) {
		ret = PTR_ERR(dev);
		goto err_device;
	}

	/* /dev/vduse/$DEVICE */
	cdev_init(&vduse_cdev, &vduse_dev_fops);
	vduse_cdev.owner = THIS_MODULE;
	ret = cdev_add(&vduse_cdev, MKDEV(MAJOR(vduse_major), 1),
		       VDUSE_DEV_MAX - 1);
	if (ret)
		goto err_cdev;

	ret = -ENOMEM;
	/* Unbound queue for injecting vq interrupts with no CPU affinity. */
	vduse_irq_wq = alloc_workqueue("vduse-irq",
				WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0);
	if (!vduse_irq_wq)
		goto err_wq;

	/* Per-CPU queue for interrupts bound to a specific CPU. */
	vduse_irq_bound_wq = alloc_workqueue("vduse-irq-bound",
					     WQ_HIGHPRI | WQ_PERCPU, 0);
	if (!vduse_irq_bound_wq)
		goto err_bound_wq;

	ret = vduse_domain_init();
	if (ret)
		goto err_domain;

	ret = vduse_mgmtdev_init();
	if (ret)
		goto err_mgmtdev;

	return 0;
err_mgmtdev:
	vduse_domain_exit();
err_domain:
	destroy_workqueue(vduse_irq_bound_wq);
err_bound_wq:
	destroy_workqueue(vduse_irq_wq);
err_wq:
	cdev_del(&vduse_cdev);
err_cdev:
	device_destroy(&vduse_class, vduse_major);
err_device:
	cdev_del(&vduse_ctrl_cdev);
err_ctrl_cdev:
	unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
err_chardev_region:
	class_unregister(&vduse_class);
	return ret;
}
module_init(vduse_init);
2468
/* Module exit: tear everything down in reverse order of vduse_init(). */
static void vduse_exit(void)
{
	vduse_mgmtdev_exit();
	vduse_domain_exit();
	destroy_workqueue(vduse_irq_bound_wq);
	destroy_workqueue(vduse_irq_wq);
	cdev_del(&vduse_cdev);
	device_destroy(&vduse_class, vduse_major);
	cdev_del(&vduse_ctrl_cdev);
	unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
	class_unregister(&vduse_class);
	idr_destroy(&vduse_idr);
}
module_exit(vduse_exit);
2483
2484 MODULE_LICENSE(DRV_LICENSE);
2485 MODULE_AUTHOR(DRV_AUTHOR);
2486 MODULE_DESCRIPTION(DRV_DESC);
2487