xref: /linux/fs/fuse/virtio_fs.c (revision 8b6d678fede700db6466d73f11fcbad496fa515e)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * virtio-fs: Virtio Filesystem
4  * Copyright (C) 2018 Red Hat, Inc.
5  */
6 
7 #include <linux/fs.h>
8 #include <linux/dax.h>
9 #include <linux/pci.h>
10 #include <linux/pfn_t.h>
11 #include <linux/memremap.h>
12 #include <linux/module.h>
13 #include <linux/virtio.h>
14 #include <linux/virtio_fs.h>
15 #include <linux/delay.h>
16 #include <linux/fs_context.h>
17 #include <linux/fs_parser.h>
18 #include <linux/highmem.h>
19 #include <linux/cleanup.h>
20 #include <linux/uio.h>
21 #include "fuse_i.h"
22 
/* Used to help calculate the FUSE connection's max_pages limit for a request's
 * size. Parts of the struct fuse_req are sliced into scatter-gather lists in
 * addition to the pages used, so this constant accounts for that overhead
 * (counted in scatter-gather elements).
 */
#define FUSE_HEADER_OVERHEAD    4
28 
/* List of virtio-fs device instances and a lock for the list. The mutex also
 * provides mutual exclusion between the device removal and mounting paths,
 * and is held while queues are being drained.
 */
static DEFINE_MUTEX(virtio_fs_mutex);
static LIST_HEAD(virtio_fs_instances);

/* The /sys/fs/virtio_fs/ kset; each instance's kobject is added to it */
static struct kset *virtio_fs_kset;
37 
/* Indices into virtio_fs->vqs[]: the hiprio (FORGET) queue comes first and
 * the request queues start at VQ_REQUEST.
 */
enum {
	VQ_HIPRIO,
	VQ_REQUEST
};

/* Size of the virtio_fs_vq->name buffer, including the terminating NUL */
#define VQ_NAME_LEN	24
44 
/* Per-virtqueue state */
struct virtio_fs_vq {
	spinlock_t lock;	  /* protects vq, the lists, connected and in_flight */
	struct virtqueue *vq;     /* protected by ->lock */
	struct work_struct done_work;	/* scheduled from the virtqueue callback */
	struct list_head queued_reqs;	/* waiting to be (re)submitted by dispatch_work */
	struct list_head end_reqs;	/* End these requests */
	struct delayed_work dispatch_work;	/* retries submission of queued_reqs */
	struct fuse_dev *fud;	  /* released by virtio_fs_free_devs() */
	bool connected;		  /* set by start_all_queues, cleared by stop_all_queues */
	long in_flight;		  /* number of requests not yet completed */
	struct completion in_flight_zero; /* No inflight requests */
	char name[VQ_NAME_LEN];	  /* "hiprio" or "requests.<n>" */
} ____cacheline_aligned_in_smp;
59 
/* A virtio-fs device instance. Its lifetime is controlled by the embedded
 * kobject; the final kobject_put() frees it via virtio_fs_ktype_release().
 */
struct virtio_fs {
	struct kobject kobj;
	struct list_head list;    /* on virtio_fs_instances */
	char *tag;		  /* filesystem name read from the device config */
	struct virtio_fs_vq *vqs; /* array of nvqs entries, hiprio queue first */
	unsigned int nvqs;               /* number of virtqueues */
	unsigned int num_request_queues; /* number of request queues */
	struct dax_device *dax_dev;

	/* DAX memory window where file contents are mapped */
	void *window_kaddr;
	phys_addr_t window_phys_addr;
	size_t window_len;
};
75 
/* Wire format of a FUSE_FORGET message: header followed by its argument.
 * This is the buffer handed to the device on the hiprio queue.
 */
struct virtio_fs_forget_req {
	struct fuse_in_header ih;
	struct fuse_forget_in arg;
};
80 
/* A FORGET message plus the list linkage needed to park it on
 * virtio_fs_vq->queued_reqs while the virtqueue is full.
 */
struct virtio_fs_forget {
	/* This request can be temporarily queued on virt queue */
	struct list_head list;
	struct virtio_fs_forget_req req;
};
86 
/* Context for completing one request from a separate work item; allocated in
 * virtio_fs_requests_done_work() and freed by virtio_fs_complete_req_work().
 */
struct virtio_fs_req_work {
	struct fuse_req *req;		/* request to complete */
	struct virtio_fs_vq *fsvq;	/* queue the request completed on */
	struct work_struct done_work;
};
92 
/* Forward declaration: defined later in the file, used by the dispatch worker.
 * Callers here treat a negative return value as an errno.
 * NOTE(review): @in_flight presumably says whether the request is already
 * counted in fsvq->in_flight - confirm against the definition.
 */
static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
				 struct fuse_req *req, bool in_flight);
95 
/* Values accepted by the "dax=<mode>" mount option and the DAX mode each
 * one selects.
 */
static const struct constant_table dax_param_enums[] = {
	{"always",	FUSE_DAX_ALWAYS },
	{"never",	FUSE_DAX_NEVER },
	{"inode",	FUSE_DAX_INODE_USER },
	{}
};
102 
/* Mount option identifiers returned by fs_parse() */
enum {
	OPT_DAX,	/* bare "dax" flag */
	OPT_DAX_ENUM,	/* "dax=<mode>" */
};
107 
/* Mount parameter specification: "dax" is accepted both as a flag and as an
 * enumerated option.
 */
static const struct fs_parameter_spec virtio_fs_parameters[] = {
	fsparam_flag("dax", OPT_DAX),
	fsparam_enum("dax", OPT_DAX_ENUM, dax_param_enums),
	{}
};
113 
114 static int virtio_fs_parse_param(struct fs_context *fsc,
115 				 struct fs_parameter *param)
116 {
117 	struct fs_parse_result result;
118 	struct fuse_fs_context *ctx = fsc->fs_private;
119 	int opt;
120 
121 	opt = fs_parse(fsc, virtio_fs_parameters, param, &result);
122 	if (opt < 0)
123 		return opt;
124 
125 	switch (opt) {
126 	case OPT_DAX:
127 		ctx->dax_mode = FUSE_DAX_ALWAYS;
128 		break;
129 	case OPT_DAX_ENUM:
130 		ctx->dax_mode = result.uint_32;
131 		break;
132 	default:
133 		return -EINVAL;
134 	}
135 
136 	return 0;
137 }
138 
139 static void virtio_fs_free_fsc(struct fs_context *fsc)
140 {
141 	struct fuse_fs_context *ctx = fsc->fs_private;
142 
143 	kfree(ctx);
144 }
145 
146 static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
147 {
148 	struct virtio_fs *fs = vq->vdev->priv;
149 
150 	return &fs->vqs[vq->index];
151 }
152 
/* Account for one more request in flight on @fsvq.
 * Should be called with fsvq->lock held.
 */
static inline void inc_in_flight_req(struct virtio_fs_vq *fsvq)
{
	fsvq->in_flight++;
}
158 
159 /* Should be called with fsvq->lock held. */
160 static inline void dec_in_flight_req(struct virtio_fs_vq *fsvq)
161 {
162 	WARN_ON(fsvq->in_flight <= 0);
163 	fsvq->in_flight--;
164 	if (!fsvq->in_flight)
165 		complete(&fsvq->in_flight_zero);
166 }
167 
168 static ssize_t tag_show(struct kobject *kobj,
169 		struct kobj_attribute *attr, char *buf)
170 {
171 	struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj);
172 
173 	return sysfs_emit(buf, fs->tag);
174 }
175 
/* Read-only "tag" attribute, backed by tag_show() */
static struct kobj_attribute virtio_fs_tag_attr = __ATTR_RO(tag);

/* Default sysfs attributes of every virtio_fs kobject */
static struct attribute *virtio_fs_attrs[] = {
	&virtio_fs_tag_attr.attr,
	NULL
};
ATTRIBUTE_GROUPS(virtio_fs);
183 
184 static void virtio_fs_ktype_release(struct kobject *kobj)
185 {
186 	struct virtio_fs *vfs = container_of(kobj, struct virtio_fs, kobj);
187 
188 	kfree(vfs->vqs);
189 	kfree(vfs);
190 }
191 
/* kobject type of a virtio_fs instance: frees the instance on release and
 * exposes the default attribute group (the "tag" file).
 */
static const struct kobj_type virtio_fs_ktype = {
	.release = virtio_fs_ktype_release,
	.sysfs_ops = &kobj_sysfs_ops,
	.default_groups = virtio_fs_groups,
};
197 
/* Drop a reference on @fs; the instance is freed when the last reference
 * goes away. Make sure virtio_fs_mutex is held.
 */
static void virtio_fs_put(struct virtio_fs *fs)
{
	kobject_put(&fs->kobj);
}
203 
204 static void virtio_fs_fiq_release(struct fuse_iqueue *fiq)
205 {
206 	struct virtio_fs *vfs = fiq->priv;
207 
208 	mutex_lock(&virtio_fs_mutex);
209 	virtio_fs_put(vfs);
210 	mutex_unlock(&virtio_fs_mutex);
211 }
212 
213 static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq)
214 {
215 	WARN_ON(fsvq->in_flight < 0);
216 
217 	/* Wait for in flight requests to finish.*/
218 	spin_lock(&fsvq->lock);
219 	if (fsvq->in_flight) {
220 		/* We are holding virtio_fs_mutex. There should not be any
221 		 * waiters waiting for completion.
222 		 */
223 		reinit_completion(&fsvq->in_flight_zero);
224 		spin_unlock(&fsvq->lock);
225 		wait_for_completion(&fsvq->in_flight_zero);
226 	} else {
227 		spin_unlock(&fsvq->lock);
228 	}
229 
230 	flush_work(&fsvq->done_work);
231 	flush_delayed_work(&fsvq->dispatch_work);
232 }
233 
234 static void virtio_fs_drain_all_queues_locked(struct virtio_fs *fs)
235 {
236 	struct virtio_fs_vq *fsvq;
237 	int i;
238 
239 	for (i = 0; i < fs->nvqs; i++) {
240 		fsvq = &fs->vqs[i];
241 		virtio_fs_drain_queue(fsvq);
242 	}
243 }
244 
/* Drain every virtqueue of @fs, taking virtio_fs_mutex for the duration */
static void virtio_fs_drain_all_queues(struct virtio_fs *fs)
{
	/* Provides mutual exclusion between ->remove and ->kill_sb
	 * paths. We don't want both of these draining queues at the
	 * same time. The current completion logic reinitializes the
	 * completion, which means there must not be any other thread
	 * doing a reinit or already waiting for the completion.
	 */
	mutex_lock(&virtio_fs_mutex);
	virtio_fs_drain_all_queues_locked(fs);
	mutex_unlock(&virtio_fs_mutex);
}
257 
258 static void virtio_fs_start_all_queues(struct virtio_fs *fs)
259 {
260 	struct virtio_fs_vq *fsvq;
261 	int i;
262 
263 	for (i = 0; i < fs->nvqs; i++) {
264 		fsvq = &fs->vqs[i];
265 		spin_lock(&fsvq->lock);
266 		fsvq->connected = true;
267 		spin_unlock(&fsvq->lock);
268 	}
269 }
270 
271 /* Add a new instance to the list or return -EEXIST if tag name exists*/
272 static int virtio_fs_add_instance(struct virtio_device *vdev,
273 				  struct virtio_fs *fs)
274 {
275 	struct virtio_fs *fs2;
276 	int ret;
277 
278 	mutex_lock(&virtio_fs_mutex);
279 
280 	list_for_each_entry(fs2, &virtio_fs_instances, list) {
281 		if (strcmp(fs->tag, fs2->tag) == 0) {
282 			mutex_unlock(&virtio_fs_mutex);
283 			return -EEXIST;
284 		}
285 	}
286 
287 	/* Use the virtio_device's index as a unique identifier, there is no
288 	 * need to allocate our own identifiers because the virtio_fs instance
289 	 * is only visible to userspace as long as the underlying virtio_device
290 	 * exists.
291 	 */
292 	fs->kobj.kset = virtio_fs_kset;
293 	ret = kobject_add(&fs->kobj, NULL, "%d", vdev->index);
294 	if (ret < 0) {
295 		mutex_unlock(&virtio_fs_mutex);
296 		return ret;
297 	}
298 
299 	ret = sysfs_create_link(&fs->kobj, &vdev->dev.kobj, "device");
300 	if (ret < 0) {
301 		kobject_del(&fs->kobj);
302 		mutex_unlock(&virtio_fs_mutex);
303 		return ret;
304 	}
305 
306 	list_add_tail(&fs->list, &virtio_fs_instances);
307 
308 	mutex_unlock(&virtio_fs_mutex);
309 
310 	kobject_uevent(&fs->kobj, KOBJ_ADD);
311 
312 	return 0;
313 }
314 
315 /* Return the virtio_fs with a given tag, or NULL */
316 static struct virtio_fs *virtio_fs_find_instance(const char *tag)
317 {
318 	struct virtio_fs *fs;
319 
320 	mutex_lock(&virtio_fs_mutex);
321 
322 	list_for_each_entry(fs, &virtio_fs_instances, list) {
323 		if (strcmp(fs->tag, tag) == 0) {
324 			kobject_get(&fs->kobj);
325 			goto found;
326 		}
327 	}
328 
329 	fs = NULL; /* not found */
330 
331 found:
332 	mutex_unlock(&virtio_fs_mutex);
333 
334 	return fs;
335 }
336 
337 static void virtio_fs_free_devs(struct virtio_fs *fs)
338 {
339 	unsigned int i;
340 
341 	for (i = 0; i < fs->nvqs; i++) {
342 		struct virtio_fs_vq *fsvq = &fs->vqs[i];
343 
344 		if (!fsvq->fud)
345 			continue;
346 
347 		fuse_dev_free(fsvq->fud);
348 		fsvq->fud = NULL;
349 	}
350 }
351 
352 /* Read filesystem name from virtio config into fs->tag (must kfree()). */
353 static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs)
354 {
355 	char tag_buf[sizeof_field(struct virtio_fs_config, tag)];
356 	char *end;
357 	size_t len;
358 
359 	virtio_cread_bytes(vdev, offsetof(struct virtio_fs_config, tag),
360 			   &tag_buf, sizeof(tag_buf));
361 	end = memchr(tag_buf, '\0', sizeof(tag_buf));
362 	if (end == tag_buf)
363 		return -EINVAL; /* empty tag */
364 	if (!end)
365 		end = &tag_buf[sizeof(tag_buf)];
366 
367 	len = end - tag_buf;
368 	fs->tag = devm_kmalloc(&vdev->dev, len + 1, GFP_KERNEL);
369 	if (!fs->tag)
370 		return -ENOMEM;
371 	memcpy(fs->tag, tag_buf, len);
372 	fs->tag[len] = '\0';
373 
374 	/* While the VIRTIO specification allows any character, newlines are
375 	 * awkward on mount(8) command-lines and cause problems in the sysfs
376 	 * "tag" attr and uevent TAG= properties. Forbid them.
377 	 */
378 	if (strchr(fs->tag, '\n')) {
379 		dev_dbg(&vdev->dev, "refusing virtiofs tag with newline character\n");
380 		return -EINVAL;
381 	}
382 
383 	return 0;
384 }
385 
/* Work function for hiprio completion.
 *
 * Reaps every used buffer from the hiprio virtqueue. Each buffer token is the
 * virtio_fs_forget that send_forget_request() submitted, so it is freed here
 * and the in-flight count is dropped.
 */
static void virtio_fs_hiprio_done_work(struct work_struct *work)
{
	struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
						 done_work);
	struct virtqueue *vq = fsvq->vq;

	/* Free completed FUSE_FORGET requests */
	spin_lock(&fsvq->lock);
	do {
		unsigned int len;
		void *req;

		/* Callbacks stay off while the queue is being emptied */
		virtqueue_disable_cb(vq);

		while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
			kfree(req);
			dec_in_flight_req(fsvq);
		}
		/* Go around again whenever virtqueue_enable_cb() returns false */
	} while (!virtqueue_enable_cb(vq));
	spin_unlock(&fsvq->lock);
}
408 
/* Delayed work function for a request queue.
 *
 * First ends every request parked on end_reqs, then tries to submit the
 * requests waiting on queued_reqs. If the virtqueue cannot take a request
 * right now, the request is put back and the work re-arms itself.
 */
static void virtio_fs_request_dispatch_work(struct work_struct *work)
{
	struct fuse_req *req;
	struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
						 dispatch_work.work);
	int ret;

	pr_debug("virtio-fs: worker %s called.\n", __func__);
	/* End the requests on end_reqs; fuse_request_end() runs unlocked */
	while (1) {
		spin_lock(&fsvq->lock);
		req = list_first_entry_or_null(&fsvq->end_reqs, struct fuse_req,
					       list);
		if (!req) {
			spin_unlock(&fsvq->lock);
			break;
		}

		list_del_init(&req->list);
		spin_unlock(&fsvq->lock);
		fuse_request_end(req);
	}

	/* Dispatch pending requests */
	while (1) {
		spin_lock(&fsvq->lock);
		req = list_first_entry_or_null(&fsvq->queued_reqs,
					       struct fuse_req, list);
		if (!req) {
			spin_unlock(&fsvq->lock);
			return;
		}
		list_del_init(&req->list);
		spin_unlock(&fsvq->lock);

		ret = virtio_fs_enqueue_req(fsvq, req, true);
		if (ret < 0) {
			if (ret == -ENOMEM || ret == -ENOSPC) {
				/* Transient failure: put the request back on
				 * the list and retry in about a millisecond.
				 */
				spin_lock(&fsvq->lock);
				list_add_tail(&req->list, &fsvq->queued_reqs);
				schedule_delayed_work(&fsvq->dispatch_work,
						      msecs_to_jiffies(1));
				spin_unlock(&fsvq->lock);
				return;
			}
			/* Any other error: fail the request and stop
			 * counting it as in flight.
			 */
			req->out.h.error = ret;
			spin_lock(&fsvq->lock);
			dec_in_flight_req(fsvq);
			spin_unlock(&fsvq->lock);
			pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n",
			       ret);
			fuse_request_end(req);
		}
	}
}
463 
/*
 * Submit one FORGET message on the hiprio queue.
 *
 * @fsvq:      the hiprio queue state
 * @forget:    message to send; ownership passes to this function, which either
 *             hands it to the virtqueue, parks it on queued_reqs or frees it
 * @in_flight: true if @forget is already counted in fsvq->in_flight
 *
 * Returns 1 if queue is full and sender should wait a bit before sending
 * next request, 0 otherwise (including when the message is dropped).
 */
static int send_forget_request(struct virtio_fs_vq *fsvq,
			       struct virtio_fs_forget *forget,
			       bool in_flight)
{
	struct scatterlist sg;
	struct virtqueue *vq;
	int ret = 0;
	bool notify;
	struct virtio_fs_forget_req *req = &forget->req;

	spin_lock(&fsvq->lock);
	if (!fsvq->connected) {
		/* Queue has been stopped: silently drop the message */
		if (in_flight)
			dec_in_flight_req(fsvq);
		kfree(forget);
		goto out;
	}

	sg_init_one(&sg, req, sizeof(*req));
	vq = fsvq->vq;
	dev_dbg(&vq->vdev->dev, "%s\n", __func__);

	/* @forget is the token handed back by virtqueue_get_buf() later */
	ret = virtqueue_add_outbuf(vq, &sg, 1, forget, GFP_ATOMIC);
	if (ret < 0) {
		if (ret == -ENOMEM || ret == -ENOSPC) {
			pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n",
				 ret);
			list_add_tail(&forget->list, &fsvq->queued_reqs);
			schedule_delayed_work(&fsvq->dispatch_work,
					      msecs_to_jiffies(1));
			/* A parked message counts as in flight */
			if (!in_flight)
				inc_in_flight_req(fsvq);
			/* Queue is full */
			ret = 1;
		} else {
			pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n",
				 ret);
			kfree(forget);
			if (in_flight)
				dec_in_flight_req(fsvq);
		}
		goto out;
	}

	if (!in_flight)
		inc_in_flight_req(fsvq);
	notify = virtqueue_kick_prepare(vq);
	spin_unlock(&fsvq->lock);

	/* The notification itself is issued after the lock is released */
	if (notify)
		virtqueue_notify(vq);
	return ret;
out:
	spin_unlock(&fsvq->lock);
	return ret;
}
524 
525 static void virtio_fs_hiprio_dispatch_work(struct work_struct *work)
526 {
527 	struct virtio_fs_forget *forget;
528 	struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
529 						 dispatch_work.work);
530 	pr_debug("virtio-fs: worker %s called.\n", __func__);
531 	while (1) {
532 		spin_lock(&fsvq->lock);
533 		forget = list_first_entry_or_null(&fsvq->queued_reqs,
534 					struct virtio_fs_forget, list);
535 		if (!forget) {
536 			spin_unlock(&fsvq->lock);
537 			return;
538 		}
539 
540 		list_del(&forget->list);
541 		spin_unlock(&fsvq->lock);
542 		if (send_forget_request(fsvq, forget, true))
543 			return;
544 	}
545 }
546 
547 /* Allocate and copy args into req->argbuf */
548 static int copy_args_to_argbuf(struct fuse_req *req)
549 {
550 	struct fuse_args *args = req->args;
551 	unsigned int offset = 0;
552 	unsigned int num_in;
553 	unsigned int num_out;
554 	unsigned int len;
555 	unsigned int i;
556 
557 	num_in = args->in_numargs - args->in_pages;
558 	num_out = args->out_numargs - args->out_pages;
559 	len = fuse_len_args(num_in, (struct fuse_arg *) args->in_args) +
560 	      fuse_len_args(num_out, args->out_args);
561 
562 	req->argbuf = kmalloc(len, GFP_ATOMIC);
563 	if (!req->argbuf)
564 		return -ENOMEM;
565 
566 	for (i = 0; i < num_in; i++) {
567 		memcpy(req->argbuf + offset,
568 		       args->in_args[i].value,
569 		       args->in_args[i].size);
570 		offset += args->in_args[i].size;
571 	}
572 
573 	return 0;
574 }
575 
/* Copy the non-page output arguments out of req->argbuf into @args, then
 * free req->argbuf.
 *
 * The output area starts right after the space used by the non-page input
 * arguments. When args->out_argvar is set, the last output argument may be
 * shorter than declared and its size field is updated to the actual length.
 */
static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req)
{
	unsigned int remaining;
	unsigned int offset;
	unsigned int num_in;
	unsigned int num_out;
	unsigned int i;

	/* Payload bytes in the reply, excluding the reply header */
	remaining = req->out.h.len - sizeof(req->out.h);
	num_in = args->in_numargs - args->in_pages;
	num_out = args->out_numargs - args->out_pages;
	offset = fuse_len_args(num_in, (struct fuse_arg *)args->in_args);

	for (i = 0; i < num_out; i++) {
		unsigned int argsize = args->out_args[i].size;

		/* Clamp only the final, variable-length argument */
		if (args->out_argvar &&
		    i == args->out_numargs - 1 &&
		    argsize > remaining) {
			argsize = remaining;
		}

		memcpy(args->out_args[i].value, req->argbuf + offset, argsize);
		offset += argsize;

		/* Leave the last argument's bytes in 'remaining' so that it
		 * can be stored as that argument's size below.
		 */
		if (i != args->out_numargs - 1)
			remaining -= argsize;
	}

	/* Store the actual size of the variable-length arg */
	if (args->out_argvar)
		args->out_args[args->out_numargs - 1].size = remaining;

	kfree(req->argbuf);
	req->argbuf = NULL;
}
613 
/* Complete one request that the device has finished with.
 *
 * Copies the reply arguments back, zeroes the part of the output pages that
 * the reply did not fill (when page zeroing is requested), clears FR_SENT,
 * ends the request and finally drops the queue's in-flight count.
 */
static void virtio_fs_request_complete(struct fuse_req *req,
				       struct virtio_fs_vq *fsvq)
{
	struct fuse_pqueue *fpq = &fsvq->fud->pq;
	struct fuse_args *args;
	struct fuse_args_pages *ap;
	unsigned int len, i, thislen;
	struct page *page;

	/*
	 * TODO verify that server properly follows FUSE protocol
	 * (oh.uniq, oh.len)
	 */
	args = req->args;
	copy_args_from_argbuf(args, req);

	if (args->out_pages && args->page_zeroing) {
		/* len = number of bytes actually returned in the pages */
		len = args->out_args[args->out_numargs - 1].size;
		ap = container_of(args, typeof(*ap), args);
		for (i = 0; i < ap->num_pages; i++) {
			thislen = ap->descs[i].length;
			if (len < thislen) {
				/* Partially filled or untouched page: zero
				 * everything past the returned data.
				 */
				WARN_ON(ap->descs[i].offset);
				page = ap->pages[i];
				zero_user_segment(page, len, thislen);
				len = 0;
			} else {
				len -= thislen;
			}
		}
	}

	spin_lock(&fpq->lock);
	clear_bit(FR_SENT, &req->flags);
	spin_unlock(&fpq->lock);

	fuse_request_end(req);
	/* The request stays counted as in flight until after it has ended */
	spin_lock(&fsvq->lock);
	dec_in_flight_req(fsvq);
	spin_unlock(&fsvq->lock);
}
656 
657 static void virtio_fs_complete_req_work(struct work_struct *work)
658 {
659 	struct virtio_fs_req_work *w =
660 		container_of(work, typeof(*w), done_work);
661 
662 	virtio_fs_request_complete(w->req, w->fsvq);
663 	kfree(w);
664 }
665 
/* Work function for request queue completion.
 *
 * Collects every used buffer from the virtqueue onto a private list while
 * holding the queue lock, then completes the requests with the lock dropped.
 */
static void virtio_fs_requests_done_work(struct work_struct *work)
{
	struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
						 done_work);
	struct fuse_pqueue *fpq = &fsvq->fud->pq;
	struct virtqueue *vq = fsvq->vq;
	struct fuse_req *req;
	struct fuse_req *next;
	unsigned int len;
	LIST_HEAD(reqs);

	/* Collect completed requests off the virtqueue */
	spin_lock(&fsvq->lock);
	do {
		virtqueue_disable_cb(vq);

		while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
			/* req->list is moved under fpq->lock */
			spin_lock(&fpq->lock);
			list_move_tail(&req->list, &reqs);
			spin_unlock(&fpq->lock);
		}
	} while (!virtqueue_enable_cb(vq));
	spin_unlock(&fsvq->lock);

	/* End requests */
	list_for_each_entry_safe(req, next, &reqs, list) {
		list_del_init(&req->list);

		/* blocking async request completes in a worker context */
		if (req->args->may_block) {
			struct virtio_fs_req_work *w;

			/* Allocated with __GFP_NOFAIL, so w is not checked */
			w = kzalloc(sizeof(*w), GFP_NOFS | __GFP_NOFAIL);
			INIT_WORK(&w->done_work, virtio_fs_complete_req_work);
			w->fsvq = fsvq;
			w->req = req;
			schedule_work(&w->done_work);
		} else {
			virtio_fs_request_complete(req, fsvq);
		}
	}
}
708 
/* Virtqueue callback, registered for every queue in virtio_fs_setup_vqs().
 * It only schedules the queue's done_work; the buffers are reaped there.
 */
static void virtio_fs_vq_done(struct virtqueue *vq)
{
	struct virtio_fs_vq *fsvq = vq_to_fsvq(vq);

	dev_dbg(&vq->vdev->dev, "%s %s\n", __func__, fsvq->name);

	schedule_work(&fsvq->done_work);
}
718 
719 static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name,
720 			      int vq_type)
721 {
722 	strscpy(fsvq->name, name, VQ_NAME_LEN);
723 	spin_lock_init(&fsvq->lock);
724 	INIT_LIST_HEAD(&fsvq->queued_reqs);
725 	INIT_LIST_HEAD(&fsvq->end_reqs);
726 	init_completion(&fsvq->in_flight_zero);
727 
728 	if (vq_type == VQ_REQUEST) {
729 		INIT_WORK(&fsvq->done_work, virtio_fs_requests_done_work);
730 		INIT_DELAYED_WORK(&fsvq->dispatch_work,
731 				  virtio_fs_request_dispatch_work);
732 	} else {
733 		INIT_WORK(&fsvq->done_work, virtio_fs_hiprio_done_work);
734 		INIT_DELAYED_WORK(&fsvq->dispatch_work,
735 				  virtio_fs_hiprio_dispatch_work);
736 	}
737 }
738 
739 /* Initialize virtqueues */
740 static int virtio_fs_setup_vqs(struct virtio_device *vdev,
741 			       struct virtio_fs *fs)
742 {
743 	struct virtqueue **vqs;
744 	vq_callback_t **callbacks;
745 	const char **names;
746 	unsigned int i;
747 	int ret = 0;
748 
749 	virtio_cread_le(vdev, struct virtio_fs_config, num_request_queues,
750 			&fs->num_request_queues);
751 	if (fs->num_request_queues == 0)
752 		return -EINVAL;
753 
754 	fs->nvqs = VQ_REQUEST + fs->num_request_queues;
755 	fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL);
756 	if (!fs->vqs)
757 		return -ENOMEM;
758 
759 	vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL);
760 	callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]),
761 					GFP_KERNEL);
762 	names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL);
763 	if (!vqs || !callbacks || !names) {
764 		ret = -ENOMEM;
765 		goto out;
766 	}
767 
768 	/* Initialize the hiprio/forget request virtqueue */
769 	callbacks[VQ_HIPRIO] = virtio_fs_vq_done;
770 	virtio_fs_init_vq(&fs->vqs[VQ_HIPRIO], "hiprio", VQ_HIPRIO);
771 	names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name;
772 
773 	/* Initialize the requests virtqueues */
774 	for (i = VQ_REQUEST; i < fs->nvqs; i++) {
775 		char vq_name[VQ_NAME_LEN];
776 
777 		snprintf(vq_name, VQ_NAME_LEN, "requests.%u", i - VQ_REQUEST);
778 		virtio_fs_init_vq(&fs->vqs[i], vq_name, VQ_REQUEST);
779 		callbacks[i] = virtio_fs_vq_done;
780 		names[i] = fs->vqs[i].name;
781 	}
782 
783 	ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL);
784 	if (ret < 0)
785 		goto out;
786 
787 	for (i = 0; i < fs->nvqs; i++)
788 		fs->vqs[i].vq = vqs[i];
789 
790 	virtio_fs_start_all_queues(fs);
791 out:
792 	kfree(names);
793 	kfree(callbacks);
794 	kfree(vqs);
795 	if (ret)
796 		kfree(fs->vqs);
797 	return ret;
798 }
799 
/* Free virtqueues (device must already be reset) by calling the transport's
 * del_vqs() operation.
 */
static void virtio_fs_cleanup_vqs(struct virtio_device *vdev)
{
	vdev->config->del_vqs(vdev);
}
805 
806 /* Map a window offset to a page frame number.  The window offset will have
807  * been produced by .iomap_begin(), which maps a file offset to a window
808  * offset.
809  */
810 static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
811 				    long nr_pages, enum dax_access_mode mode,
812 				    void **kaddr, pfn_t *pfn)
813 {
814 	struct virtio_fs *fs = dax_get_private(dax_dev);
815 	phys_addr_t offset = PFN_PHYS(pgoff);
816 	size_t max_nr_pages = fs->window_len / PAGE_SIZE - pgoff;
817 
818 	if (kaddr)
819 		*kaddr = fs->window_kaddr + offset;
820 	if (pfn)
821 		*pfn = phys_to_pfn_t(fs->window_phys_addr + offset,
822 					PFN_DEV | PFN_MAP);
823 	return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
824 }
825 
826 static int virtio_fs_zero_page_range(struct dax_device *dax_dev,
827 				     pgoff_t pgoff, size_t nr_pages)
828 {
829 	long rc;
830 	void *kaddr;
831 
832 	rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS, &kaddr,
833 			       NULL);
834 	if (rc < 0)
835 		return dax_mem2blk_err(rc);
836 
837 	memset(kaddr, 0, nr_pages << PAGE_SHIFT);
838 	dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT);
839 	return 0;
840 }
841 
/* DAX operations of the virtio-fs memory window; passed to alloc_dax() */
static const struct dax_operations virtio_fs_dax_ops = {
	.direct_access = virtio_fs_direct_access,
	.zero_page_range = virtio_fs_zero_page_range,
};
846 
/* Tear down a dax_device: kill it, then drop the reference on it. @data is
 * the struct dax_device pointer; the void * signature lets the function be
 * used as a devm action.
 */
static void virtio_fs_cleanup_dax(void *data)
{
	kill_dax(data);
	put_dax(data);
}
854 
855 DEFINE_FREE(cleanup_dax, struct dax_dev *, if (!IS_ERR_OR_NULL(_T)) virtio_fs_cleanup_dax(_T))
856 
/* Set up the DAX window of a virtio-fs device, if it has one.
 *
 * Returns 0 when DAX is set up, and also when it is simply unavailable
 * (CONFIG_FUSE_DAX disabled, alloc_dax() reporting -EOPNOTSUPP, or the device
 * having no cache shared memory region). Returns a negative errno on failure.
 *
 * dax_dev is released automatically on every early return through the
 * cleanup_dax scope handler until ownership is moved to fs->dax_dev.
 */
static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
{
	struct dax_device *dax_dev __free(cleanup_dax) = NULL;
	struct virtio_shm_region cache_reg;
	struct dev_pagemap *pgmap;
	bool have_cache;

	if (!IS_ENABLED(CONFIG_FUSE_DAX))
		return 0;

	dax_dev = alloc_dax(fs, &virtio_fs_dax_ops);
	if (IS_ERR(dax_dev)) {
		int rc = PTR_ERR(dax_dev);
		/* -EOPNOTSUPP is not treated as a failure */
		return rc == -EOPNOTSUPP ? 0 : rc;
	}

	/* Get cache region */
	have_cache = virtio_get_shm_region(vdev, &cache_reg,
					   (u8)VIRTIO_FS_SHMCAP_ID_CACHE);
	if (!have_cache) {
		dev_notice(&vdev->dev, "%s: No cache capability\n", __func__);
		return 0;
	}

	if (!devm_request_mem_region(&vdev->dev, cache_reg.addr, cache_reg.len,
				     dev_name(&vdev->dev))) {
		dev_warn(&vdev->dev, "could not reserve region addr=0x%llx len=0x%llx\n",
			 cache_reg.addr, cache_reg.len);
		return -EBUSY;
	}

	dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n", cache_reg.len,
		   cache_reg.addr);

	pgmap = devm_kzalloc(&vdev->dev, sizeof(*pgmap), GFP_KERNEL);
	if (!pgmap)
		return -ENOMEM;

	pgmap->type = MEMORY_DEVICE_FS_DAX;

	/* Ideally we would directly use the PCI BAR resource but
	 * devm_memremap_pages() wants its own copy in pgmap.  So
	 * initialize a struct range from scratch (only the start
	 * and end fields are set).
	 */
	pgmap->range = (struct range) {
		.start = (phys_addr_t) cache_reg.addr,
		.end = (phys_addr_t) cache_reg.addr + cache_reg.len - 1,
	};
	pgmap->nr_range = 1;

	fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap);
	if (IS_ERR(fs->window_kaddr))
		return PTR_ERR(fs->window_kaddr);

	fs->window_phys_addr = (phys_addr_t) cache_reg.addr;
	fs->window_len = (phys_addr_t) cache_reg.len;

	dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n",
		__func__, fs->window_kaddr, cache_reg.addr, cache_reg.len);

	/* Hand ownership to fs and to the devm action; no_free_ptr() disarms
	 * the scope-based cleanup.
	 */
	fs->dax_dev = no_free_ptr(dax_dev);
	return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax,
					fs->dax_dev);
}
922 
/* Probe a virtio-fs device: allocate the instance, read its tag, set up the
 * virtqueues and the optional DAX window, mark the device ready and register
 * the instance.
 *
 * Returns 0 on success or a negative errno. On failure the initial kobject
 * reference is dropped, which frees the instance through
 * virtio_fs_ktype_release().
 */
static int virtio_fs_probe(struct virtio_device *vdev)
{
	struct virtio_fs *fs;
	int ret;

	fs = kzalloc(sizeof(*fs), GFP_KERNEL);
	if (!fs)
		return -ENOMEM;
	/* From here on fs is reference counted; free it with kobject_put() */
	kobject_init(&fs->kobj, &virtio_fs_ktype);
	vdev->priv = fs;

	ret = virtio_fs_read_tag(vdev, fs);
	if (ret < 0)
		goto out;

	ret = virtio_fs_setup_vqs(vdev, fs);
	if (ret < 0)
		goto out;

	/* TODO vq affinity */

	ret = virtio_fs_setup_dax(vdev, fs);
	if (ret < 0)
		goto out_vqs;

	/* Bring the device online in case the filesystem is mounted and
	 * requests need to be sent before we return.
	 */
	virtio_device_ready(vdev);

	ret = virtio_fs_add_instance(vdev, fs);
	if (ret < 0)
		goto out_vqs;

	return 0;

out_vqs:
	/* Reset the device before deleting its virtqueues */
	virtio_reset_device(vdev);
	virtio_fs_cleanup_vqs(vdev);

out:
	vdev->priv = NULL;
	kobject_put(&fs->kobj);
	return ret;
}
968 
969 static void virtio_fs_stop_all_queues(struct virtio_fs *fs)
970 {
971 	struct virtio_fs_vq *fsvq;
972 	int i;
973 
974 	for (i = 0; i < fs->nvqs; i++) {
975 		fsvq = &fs->vqs[i];
976 		spin_lock(&fsvq->lock);
977 		fsvq->connected = false;
978 		spin_unlock(&fsvq->lock);
979 	}
980 }
981 
/* Remove a virtio-fs device. Runs entirely under virtio_fs_mutex: the
 * instance is unlisted and removed from sysfs, its queues are stopped and
 * drained, the device is reset and its virtqueues deleted, and finally the
 * device's reference on the instance is dropped.
 */
static void virtio_fs_remove(struct virtio_device *vdev)
{
	struct virtio_fs *fs = vdev->priv;

	mutex_lock(&virtio_fs_mutex);
	/* This device is going away. No one should get new reference */
	list_del_init(&fs->list);
	sysfs_remove_link(&fs->kobj, "device");
	kobject_del(&fs->kobj);
	/* Stop the queues first, then wait for what is still in flight */
	virtio_fs_stop_all_queues(fs);
	virtio_fs_drain_all_queues_locked(fs);
	virtio_reset_device(vdev);
	virtio_fs_cleanup_vqs(vdev);

	vdev->priv = NULL;
	/* Put device reference on virtio_fs object */
	virtio_fs_put(fs);
	mutex_unlock(&virtio_fs_mutex);
}
1001 
1002 #ifdef CONFIG_PM_SLEEP
/* Freeze callback: suspend is not implemented, so log a warning and fail
 * with -EOPNOTSUPP.
 */
static int virtio_fs_freeze(struct virtio_device *vdev)
{
	/* TODO need to save state here */
	pr_warn("virtio-fs: suspend/resume not yet supported\n");
	return -EOPNOTSUPP;
}
1009 
/* Restore callback: currently a no-op that reports success */
static int virtio_fs_restore(struct virtio_device *vdev)
{
	/* TODO need to restore state here */
	return 0;
}
1015 #endif /* CONFIG_PM_SLEEP */
1016 
/* Match any virtio device with the filesystem device ID */
static const struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID },
	{},
};

/* Empty feature table: the driver lists no feature bits */
static const unsigned int feature_table[] = {};
1023 
/* The virtio driver definition; freeze/restore are only wired up when
 * CONFIG_PM_SLEEP is enabled.
 */
static struct virtio_driver virtio_fs_driver = {
	.driver.name		= KBUILD_MODNAME,
	.driver.owner		= THIS_MODULE,
	.id_table		= id_table,
	.feature_table		= feature_table,
	.feature_table_size	= ARRAY_SIZE(feature_table),
	.probe			= virtio_fs_probe,
	.remove			= virtio_fs_remove,
#ifdef CONFIG_PM_SLEEP
	.freeze			= virtio_fs_freeze,
	.restore		= virtio_fs_restore,
#endif
};
1037 
/* Send one pending FORGET to the device on the hiprio queue.
 *
 * Called with fiq->lock held; the lock is released before anything is
 * allocated or submitted. One forget link is dequeued, turned into a
 * FUSE_FORGET message and handed to send_forget_request(), which takes
 * ownership of the message.
 */
static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq)
__releases(fiq->lock)
{
	struct fuse_forget_link *link;
	struct virtio_fs_forget *forget;
	struct virtio_fs_forget_req *req;
	struct virtio_fs *fs;
	struct virtio_fs_vq *fsvq;
	u64 unique;

	/* Both are taken from the input queue while fiq->lock is still held */
	link = fuse_dequeue_forget(fiq, 1, NULL);
	unique = fuse_get_unique(fiq);

	fs = fiq->priv;
	fsvq = &fs->vqs[VQ_HIPRIO];
	spin_unlock(&fiq->lock);

	/* Allocate a buffer for the request; __GFP_NOFAIL, so not checked */
	forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL);
	req = &forget->req;

	req->ih = (struct fuse_in_header){
		.opcode = FUSE_FORGET,
		.nodeid = link->forget_one.nodeid,
		.unique = unique,
		.len = sizeof(*req),
	};
	req->arg = (struct fuse_forget_in){
		.nlookup = link->forget_one.nlookup,
	};

	/* Not yet counted as in flight, hence 'false' */
	send_forget_request(fsvq, forget, false);
	kfree(link);
}
1072 
/* Interrupt hook: interrupts are not implemented, so this only releases
 * fiq->lock, which the caller holds on entry.
 */
static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq)
__releases(fiq->lock)
{
	/*
	 * TODO interrupts.
	 *
	 * Normal fs operations on local filesystems aren't interruptible.
	 * Exceptions are blocking lock operations; for example fcntl(F_SETLKW)
	 * with shared lock between host and guest.
	 */
	spin_unlock(&fiq->lock);
}
1085 
1086 /* Count number of scatter-gather elements required */
1087 static unsigned int sg_count_fuse_pages(struct fuse_page_desc *page_descs,
1088 				       unsigned int num_pages,
1089 				       unsigned int total_len)
1090 {
1091 	unsigned int i;
1092 	unsigned int this_len;
1093 
1094 	for (i = 0; i < num_pages && total_len; i++) {
1095 		this_len =  min(page_descs[i].length, total_len);
1096 		total_len -= this_len;
1097 	}
1098 
1099 	return i;
1100 }
1101 
1102 /* Return the number of scatter-gather list elements required */
1103 static unsigned int sg_count_fuse_req(struct fuse_req *req)
1104 {
1105 	struct fuse_args *args = req->args;
1106 	struct fuse_args_pages *ap = container_of(args, typeof(*ap), args);
1107 	unsigned int size, total_sgs = 1 /* fuse_in_header */;
1108 
1109 	if (args->in_numargs - args->in_pages)
1110 		total_sgs += 1;
1111 
1112 	if (args->in_pages) {
1113 		size = args->in_args[args->in_numargs - 1].size;
1114 		total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages,
1115 						 size);
1116 	}
1117 
1118 	if (!test_bit(FR_ISREPLY, &req->flags))
1119 		return total_sgs;
1120 
1121 	total_sgs += 1 /* fuse_out_header */;
1122 
1123 	if (args->out_numargs - args->out_pages)
1124 		total_sgs += 1;
1125 
1126 	if (args->out_pages) {
1127 		size = args->out_args[args->out_numargs - 1].size;
1128 		total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages,
1129 						 size);
1130 	}
1131 
1132 	return total_sgs;
1133 }
1134 
1135 /* Add pages to scatter-gather list and return number of elements used */
1136 static unsigned int sg_init_fuse_pages(struct scatterlist *sg,
1137 				       struct page **pages,
1138 				       struct fuse_page_desc *page_descs,
1139 				       unsigned int num_pages,
1140 				       unsigned int total_len)
1141 {
1142 	unsigned int i;
1143 	unsigned int this_len;
1144 
1145 	for (i = 0; i < num_pages && total_len; i++) {
1146 		sg_init_table(&sg[i], 1);
1147 		this_len =  min(page_descs[i].length, total_len);
1148 		sg_set_page(&sg[i], pages[i], this_len, page_descs[i].offset);
1149 		total_len -= this_len;
1150 	}
1151 
1152 	return i;
1153 }
1154 
1155 /* Add args to scatter-gather list and return number of elements used */
1156 static unsigned int sg_init_fuse_args(struct scatterlist *sg,
1157 				      struct fuse_req *req,
1158 				      struct fuse_arg *args,
1159 				      unsigned int numargs,
1160 				      bool argpages,
1161 				      void *argbuf,
1162 				      unsigned int *len_used)
1163 {
1164 	struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args);
1165 	unsigned int total_sgs = 0;
1166 	unsigned int len;
1167 
1168 	len = fuse_len_args(numargs - argpages, args);
1169 	if (len)
1170 		sg_init_one(&sg[total_sgs++], argbuf, len);
1171 
1172 	if (argpages)
1173 		total_sgs += sg_init_fuse_pages(&sg[total_sgs],
1174 						ap->pages, ap->descs,
1175 						ap->num_pages,
1176 						args[numargs - 1].size);
1177 
1178 	if (len_used)
1179 		*len_used = len;
1180 
1181 	return total_sgs;
1182 }
1183 
1184 /* Add a request to a virtqueue and kick the device */
1185 static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
1186 				 struct fuse_req *req, bool in_flight)
1187 {
1188 	/* requests need at least 4 elements */
1189 	struct scatterlist *stack_sgs[6];
1190 	struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)];
1191 	struct scatterlist **sgs = stack_sgs;
1192 	struct scatterlist *sg = stack_sg;
1193 	struct virtqueue *vq;
1194 	struct fuse_args *args = req->args;
1195 	unsigned int argbuf_used = 0;
1196 	unsigned int out_sgs = 0;
1197 	unsigned int in_sgs = 0;
1198 	unsigned int total_sgs;
1199 	unsigned int i;
1200 	int ret;
1201 	bool notify;
1202 	struct fuse_pqueue *fpq;
1203 
1204 	/* Does the sglist fit on the stack? */
1205 	total_sgs = sg_count_fuse_req(req);
1206 	if (total_sgs > ARRAY_SIZE(stack_sgs)) {
1207 		sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC);
1208 		sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC);
1209 		if (!sgs || !sg) {
1210 			ret = -ENOMEM;
1211 			goto out;
1212 		}
1213 	}
1214 
1215 	/* Use a bounce buffer since stack args cannot be mapped */
1216 	ret = copy_args_to_argbuf(req);
1217 	if (ret < 0)
1218 		goto out;
1219 
1220 	/* Request elements */
1221 	sg_init_one(&sg[out_sgs++], &req->in.h, sizeof(req->in.h));
1222 	out_sgs += sg_init_fuse_args(&sg[out_sgs], req,
1223 				     (struct fuse_arg *)args->in_args,
1224 				     args->in_numargs, args->in_pages,
1225 				     req->argbuf, &argbuf_used);
1226 
1227 	/* Reply elements */
1228 	if (test_bit(FR_ISREPLY, &req->flags)) {
1229 		sg_init_one(&sg[out_sgs + in_sgs++],
1230 			    &req->out.h, sizeof(req->out.h));
1231 		in_sgs += sg_init_fuse_args(&sg[out_sgs + in_sgs], req,
1232 					    args->out_args, args->out_numargs,
1233 					    args->out_pages,
1234 					    req->argbuf + argbuf_used, NULL);
1235 	}
1236 
1237 	WARN_ON(out_sgs + in_sgs != total_sgs);
1238 
1239 	for (i = 0; i < total_sgs; i++)
1240 		sgs[i] = &sg[i];
1241 
1242 	spin_lock(&fsvq->lock);
1243 
1244 	if (!fsvq->connected) {
1245 		spin_unlock(&fsvq->lock);
1246 		ret = -ENOTCONN;
1247 		goto out;
1248 	}
1249 
1250 	vq = fsvq->vq;
1251 	ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC);
1252 	if (ret < 0) {
1253 		spin_unlock(&fsvq->lock);
1254 		goto out;
1255 	}
1256 
1257 	/* Request successfully sent. */
1258 	fpq = &fsvq->fud->pq;
1259 	spin_lock(&fpq->lock);
1260 	list_add_tail(&req->list, fpq->processing);
1261 	spin_unlock(&fpq->lock);
1262 	set_bit(FR_SENT, &req->flags);
1263 	/* matches barrier in request_wait_answer() */
1264 	smp_mb__after_atomic();
1265 
1266 	if (!in_flight)
1267 		inc_in_flight_req(fsvq);
1268 	notify = virtqueue_kick_prepare(vq);
1269 
1270 	spin_unlock(&fsvq->lock);
1271 
1272 	if (notify)
1273 		virtqueue_notify(vq);
1274 
1275 out:
1276 	if (ret < 0 && req->argbuf) {
1277 		kfree(req->argbuf);
1278 		req->argbuf = NULL;
1279 	}
1280 	if (sgs != stack_sgs) {
1281 		kfree(sgs);
1282 		kfree(sg);
1283 	}
1284 
1285 	return ret;
1286 }
1287 
1288 static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
1289 __releases(fiq->lock)
1290 {
1291 	unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */
1292 	struct virtio_fs *fs;
1293 	struct fuse_req *req;
1294 	struct virtio_fs_vq *fsvq;
1295 	int ret;
1296 
1297 	WARN_ON(list_empty(&fiq->pending));
1298 	req = list_last_entry(&fiq->pending, struct fuse_req, list);
1299 	clear_bit(FR_PENDING, &req->flags);
1300 	list_del_init(&req->list);
1301 	WARN_ON(!list_empty(&fiq->pending));
1302 	spin_unlock(&fiq->lock);
1303 
1304 	fs = fiq->priv;
1305 
1306 	pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n",
1307 		  __func__, req->in.h.opcode, req->in.h.unique,
1308 		 req->in.h.nodeid, req->in.h.len,
1309 		 fuse_len_args(req->args->out_numargs, req->args->out_args));
1310 
1311 	fsvq = &fs->vqs[queue_id];
1312 	ret = virtio_fs_enqueue_req(fsvq, req, false);
1313 	if (ret < 0) {
1314 		if (ret == -ENOMEM || ret == -ENOSPC) {
1315 			/*
1316 			 * Virtqueue full. Retry submission from worker
1317 			 * context as we might be holding fc->bg_lock.
1318 			 */
1319 			spin_lock(&fsvq->lock);
1320 			list_add_tail(&req->list, &fsvq->queued_reqs);
1321 			inc_in_flight_req(fsvq);
1322 			schedule_delayed_work(&fsvq->dispatch_work,
1323 						msecs_to_jiffies(1));
1324 			spin_unlock(&fsvq->lock);
1325 			return;
1326 		}
1327 		req->out.h.error = ret;
1328 		pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret);
1329 
1330 		/* Can't end request in submission context. Use a worker */
1331 		spin_lock(&fsvq->lock);
1332 		list_add_tail(&req->list, &fsvq->end_reqs);
1333 		schedule_delayed_work(&fsvq->dispatch_work, 0);
1334 		spin_unlock(&fsvq->lock);
1335 		return;
1336 	}
1337 }
1338 
1339 static const struct fuse_iqueue_ops virtio_fs_fiq_ops = {
1340 	.wake_forget_and_unlock		= virtio_fs_wake_forget_and_unlock,
1341 	.wake_interrupt_and_unlock	= virtio_fs_wake_interrupt_and_unlock,
1342 	.wake_pending_and_unlock	= virtio_fs_wake_pending_and_unlock,
1343 	.release			= virtio_fs_fiq_release,
1344 };
1345 
1346 static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx)
1347 {
1348 	ctx->rootmode = S_IFDIR;
1349 	ctx->default_permissions = 1;
1350 	ctx->allow_other = 1;
1351 	ctx->max_read = UINT_MAX;
1352 	ctx->blksize = 512;
1353 	ctx->destroy = true;
1354 	ctx->no_control = true;
1355 	ctx->no_force_umount = true;
1356 }
1357 
1358 static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc)
1359 {
1360 	struct fuse_mount *fm = get_fuse_mount_super(sb);
1361 	struct fuse_conn *fc = fm->fc;
1362 	struct virtio_fs *fs = fc->iq.priv;
1363 	struct fuse_fs_context *ctx = fsc->fs_private;
1364 	unsigned int i;
1365 	int err;
1366 
1367 	virtio_fs_ctx_set_defaults(ctx);
1368 	mutex_lock(&virtio_fs_mutex);
1369 
1370 	/* After holding mutex, make sure virtiofs device is still there.
1371 	 * Though we are holding a reference to it, drive ->remove might
1372 	 * still have cleaned up virtual queues. In that case bail out.
1373 	 */
1374 	err = -EINVAL;
1375 	if (list_empty(&fs->list)) {
1376 		pr_info("virtio-fs: tag <%s> not found\n", fs->tag);
1377 		goto err;
1378 	}
1379 
1380 	err = -ENOMEM;
1381 	/* Allocate fuse_dev for hiprio and notification queues */
1382 	for (i = 0; i < fs->nvqs; i++) {
1383 		struct virtio_fs_vq *fsvq = &fs->vqs[i];
1384 
1385 		fsvq->fud = fuse_dev_alloc();
1386 		if (!fsvq->fud)
1387 			goto err_free_fuse_devs;
1388 	}
1389 
1390 	/* virtiofs allocates and installs its own fuse devices */
1391 	ctx->fudptr = NULL;
1392 	if (ctx->dax_mode != FUSE_DAX_NEVER) {
1393 		if (ctx->dax_mode == FUSE_DAX_ALWAYS && !fs->dax_dev) {
1394 			err = -EINVAL;
1395 			pr_err("virtio-fs: dax can't be enabled as filesystem"
1396 			       " device does not support it.\n");
1397 			goto err_free_fuse_devs;
1398 		}
1399 		ctx->dax_dev = fs->dax_dev;
1400 	}
1401 	err = fuse_fill_super_common(sb, ctx);
1402 	if (err < 0)
1403 		goto err_free_fuse_devs;
1404 
1405 	for (i = 0; i < fs->nvqs; i++) {
1406 		struct virtio_fs_vq *fsvq = &fs->vqs[i];
1407 
1408 		fuse_dev_install(fsvq->fud, fc);
1409 	}
1410 
1411 	/* Previous unmount will stop all queues. Start these again */
1412 	virtio_fs_start_all_queues(fs);
1413 	fuse_send_init(fm);
1414 	mutex_unlock(&virtio_fs_mutex);
1415 	return 0;
1416 
1417 err_free_fuse_devs:
1418 	virtio_fs_free_devs(fs);
1419 err:
1420 	mutex_unlock(&virtio_fs_mutex);
1421 	return err;
1422 }
1423 
1424 static void virtio_fs_conn_destroy(struct fuse_mount *fm)
1425 {
1426 	struct fuse_conn *fc = fm->fc;
1427 	struct virtio_fs *vfs = fc->iq.priv;
1428 	struct virtio_fs_vq *fsvq = &vfs->vqs[VQ_HIPRIO];
1429 
1430 	/* Stop dax worker. Soon evict_inodes() will be called which
1431 	 * will free all memory ranges belonging to all inodes.
1432 	 */
1433 	if (IS_ENABLED(CONFIG_FUSE_DAX))
1434 		fuse_dax_cancel_work(fc);
1435 
1436 	/* Stop forget queue. Soon destroy will be sent */
1437 	spin_lock(&fsvq->lock);
1438 	fsvq->connected = false;
1439 	spin_unlock(&fsvq->lock);
1440 	virtio_fs_drain_all_queues(vfs);
1441 
1442 	fuse_conn_destroy(fm);
1443 
1444 	/* fuse_conn_destroy() must have sent destroy. Stop all queues
1445 	 * and drain one more time and free fuse devices. Freeing fuse
1446 	 * devices will drop their reference on fuse_conn and that in
1447 	 * turn will drop its reference on virtio_fs object.
1448 	 */
1449 	virtio_fs_stop_all_queues(vfs);
1450 	virtio_fs_drain_all_queues(vfs);
1451 	virtio_fs_free_devs(vfs);
1452 }
1453 
1454 static void virtio_kill_sb(struct super_block *sb)
1455 {
1456 	struct fuse_mount *fm = get_fuse_mount_super(sb);
1457 	bool last;
1458 
1459 	/* If mount failed, we can still be called without any fc */
1460 	if (sb->s_root) {
1461 		last = fuse_mount_remove(fm);
1462 		if (last)
1463 			virtio_fs_conn_destroy(fm);
1464 	}
1465 	kill_anon_super(sb);
1466 	fuse_mount_destroy(fm);
1467 }
1468 
1469 static int virtio_fs_test_super(struct super_block *sb,
1470 				struct fs_context *fsc)
1471 {
1472 	struct fuse_mount *fsc_fm = fsc->s_fs_info;
1473 	struct fuse_mount *sb_fm = get_fuse_mount_super(sb);
1474 
1475 	return fsc_fm->fc->iq.priv == sb_fm->fc->iq.priv;
1476 }
1477 
1478 static int virtio_fs_get_tree(struct fs_context *fsc)
1479 {
1480 	struct virtio_fs *fs;
1481 	struct super_block *sb;
1482 	struct fuse_conn *fc = NULL;
1483 	struct fuse_mount *fm;
1484 	unsigned int virtqueue_size;
1485 	int err = -EIO;
1486 
1487 	/* This gets a reference on virtio_fs object. This ptr gets installed
1488 	 * in fc->iq->priv. Once fuse_conn is going away, it calls ->put()
1489 	 * to drop the reference to this object.
1490 	 */
1491 	fs = virtio_fs_find_instance(fsc->source);
1492 	if (!fs) {
1493 		pr_info("virtio-fs: tag <%s> not found\n", fsc->source);
1494 		return -EINVAL;
1495 	}
1496 
1497 	virtqueue_size = virtqueue_get_vring_size(fs->vqs[VQ_REQUEST].vq);
1498 	if (WARN_ON(virtqueue_size <= FUSE_HEADER_OVERHEAD))
1499 		goto out_err;
1500 
1501 	err = -ENOMEM;
1502 	fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL);
1503 	if (!fc)
1504 		goto out_err;
1505 
1506 	fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL);
1507 	if (!fm)
1508 		goto out_err;
1509 
1510 	fuse_conn_init(fc, fm, fsc->user_ns, &virtio_fs_fiq_ops, fs);
1511 	fc->release = fuse_free_conn;
1512 	fc->delete_stale = true;
1513 	fc->auto_submounts = true;
1514 	fc->sync_fs = true;
1515 
1516 	/* Tell FUSE to split requests that exceed the virtqueue's size */
1517 	fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit,
1518 				    virtqueue_size - FUSE_HEADER_OVERHEAD);
1519 
1520 	fsc->s_fs_info = fm;
1521 	sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc);
1522 	if (fsc->s_fs_info)
1523 		fuse_mount_destroy(fm);
1524 	if (IS_ERR(sb))
1525 		return PTR_ERR(sb);
1526 
1527 	if (!sb->s_root) {
1528 		err = virtio_fs_fill_super(sb, fsc);
1529 		if (err) {
1530 			deactivate_locked_super(sb);
1531 			return err;
1532 		}
1533 
1534 		sb->s_flags |= SB_ACTIVE;
1535 	}
1536 
1537 	WARN_ON(fsc->root);
1538 	fsc->root = dget(sb->s_root);
1539 	return 0;
1540 
1541 out_err:
1542 	kfree(fc);
1543 	mutex_lock(&virtio_fs_mutex);
1544 	virtio_fs_put(fs);
1545 	mutex_unlock(&virtio_fs_mutex);
1546 	return err;
1547 }
1548 
1549 static const struct fs_context_operations virtio_fs_context_ops = {
1550 	.free		= virtio_fs_free_fsc,
1551 	.parse_param	= virtio_fs_parse_param,
1552 	.get_tree	= virtio_fs_get_tree,
1553 };
1554 
1555 static int virtio_fs_init_fs_context(struct fs_context *fsc)
1556 {
1557 	struct fuse_fs_context *ctx;
1558 
1559 	if (fsc->purpose == FS_CONTEXT_FOR_SUBMOUNT)
1560 		return fuse_init_fs_context_submount(fsc);
1561 
1562 	ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL);
1563 	if (!ctx)
1564 		return -ENOMEM;
1565 	fsc->fs_private = ctx;
1566 	fsc->ops = &virtio_fs_context_ops;
1567 	return 0;
1568 }
1569 
1570 static struct file_system_type virtio_fs_type = {
1571 	.owner		= THIS_MODULE,
1572 	.name		= "virtiofs",
1573 	.init_fs_context = virtio_fs_init_fs_context,
1574 	.kill_sb	= virtio_kill_sb,
1575 };
1576 
1577 static int virtio_fs_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
1578 {
1579 	const struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj);
1580 
1581 	add_uevent_var(env, "TAG=%s", fs->tag);
1582 	return 0;
1583 }
1584 
1585 static const struct kset_uevent_ops virtio_fs_uevent_ops = {
1586 	.uevent = virtio_fs_uevent,
1587 };
1588 
1589 static int __init virtio_fs_sysfs_init(void)
1590 {
1591 	virtio_fs_kset = kset_create_and_add("virtiofs", &virtio_fs_uevent_ops,
1592 					     fs_kobj);
1593 	if (!virtio_fs_kset)
1594 		return -ENOMEM;
1595 	return 0;
1596 }
1597 
1598 static void virtio_fs_sysfs_exit(void)
1599 {
1600 	kset_unregister(virtio_fs_kset);
1601 	virtio_fs_kset = NULL;
1602 }
1603 
1604 static int __init virtio_fs_init(void)
1605 {
1606 	int ret;
1607 
1608 	ret = virtio_fs_sysfs_init();
1609 	if (ret < 0)
1610 		return ret;
1611 
1612 	ret = register_virtio_driver(&virtio_fs_driver);
1613 	if (ret < 0)
1614 		goto sysfs_exit;
1615 
1616 	ret = register_filesystem(&virtio_fs_type);
1617 	if (ret < 0)
1618 		goto unregister_virtio_driver;
1619 
1620 	return 0;
1621 
1622 unregister_virtio_driver:
1623 	unregister_virtio_driver(&virtio_fs_driver);
1624 sysfs_exit:
1625 	virtio_fs_sysfs_exit();
1626 	return ret;
1627 }
1628 module_init(virtio_fs_init);
1629 
1630 static void __exit virtio_fs_exit(void)
1631 {
1632 	unregister_filesystem(&virtio_fs_type);
1633 	unregister_virtio_driver(&virtio_fs_driver);
1634 	virtio_fs_sysfs_exit();
1635 }
1636 module_exit(virtio_fs_exit);
1637 
1638 MODULE_AUTHOR("Stefan Hajnoczi <stefanha@redhat.com>");
1639 MODULE_DESCRIPTION("Virtio Filesystem");
1640 MODULE_LICENSE("GPL");
1641 MODULE_ALIAS_FS(KBUILD_MODNAME);
1642 MODULE_DEVICE_TABLE(virtio, id_table);
1643