xref: /linux/io_uring/net.c (revision 3de9c42d02a79a5e09bbee7a4421ddc00cfd5c6d)
1  // SPDX-License-Identifier: GPL-2.0
2  #include <linux/kernel.h>
3  #include <linux/errno.h>
4  #include <linux/file.h>
5  #include <linux/slab.h>
6  #include <linux/net.h>
7  #include <linux/compat.h>
8  #include <net/compat.h>
9  #include <linux/io_uring.h>
10  
11  #include <uapi/linux/io_uring.h>
12  
13  #include "io_uring.h"
14  #include "kbuf.h"
15  #include "alloc_cache.h"
16  #include "net.h"
17  #include "notif.h"
18  #include "rsrc.h"
19  
20  #if defined(CONFIG_NET)
21  struct io_shutdown {
22  	struct file			*file;
23  	int				how;
24  };
25  
26  struct io_accept {
27  	struct file			*file;
28  	struct sockaddr __user		*addr;
29  	int __user			*addr_len;
30  	int				flags;
31  	int				iou_flags;
32  	u32				file_slot;
33  	unsigned long			nofile;
34  };
35  
36  struct io_socket {
37  	struct file			*file;
38  	int				domain;
39  	int				type;
40  	int				protocol;
41  	int				flags;
42  	u32				file_slot;
43  	unsigned long			nofile;
44  };
45  
46  struct io_connect {
47  	struct file			*file;
48  	struct sockaddr __user		*addr;
49  	int				addr_len;
50  	bool				in_progress;
51  	bool				seen_econnaborted;
52  };
53  
54  struct io_sr_msg {
55  	struct file			*file;
56  	union {
57  		struct compat_msghdr __user	*umsg_compat;
58  		struct user_msghdr __user	*umsg;
59  		void __user			*buf;
60  	};
61  	int				len;
62  	unsigned			done_io;
63  	unsigned			msg_flags;
64  	unsigned			nr_multishot_loops;
65  	u16				flags;
66  	/* initialised and used only by !msg send variants */
67  	u16				addr_len;
68  	u16				buf_group;
69  	void __user			*addr;
70  	void __user			*msg_control;
71  	/* used only for send zerocopy */
72  	struct io_kiocb		*notif;
73  };
74  
75  /*
76   * Number of times we'll try to do receives if there's more data. If we
77   * exceed this limit, then add us to the back of the queue and retry from
78   * there. This helps fairness between flooding clients.
79   */
80  #define MULTISHOT_MAX_RETRY	32
81  
82  int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
83  {
84  	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
85  
86  	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
87  		     sqe->buf_index || sqe->splice_fd_in))
88  		return -EINVAL;
89  
90  	shutdown->how = READ_ONCE(sqe->len);
91  	req->flags |= REQ_F_FORCE_ASYNC;
92  	return 0;
93  }
94  
95  int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
96  {
97  	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
98  	struct socket *sock;
99  	int ret;
100  
101  	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
102  
103  	sock = sock_from_file(req->file);
104  	if (unlikely(!sock))
105  		return -ENOTSOCK;
106  
107  	ret = __sys_shutdown_sock(sock, shutdown->how);
108  	io_req_set_res(req, ret, 0);
109  	return IOU_OK;
110  }
111  
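/*
 * A partial transfer is only worth retrying if the caller asked for the
 * full amount (MSG_WAITALL) and the socket type lets a follow-up call
 * continue where the last one left off (stream or seqpacket).
 */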
112  static bool io_net_retry(struct socket *sock, int flags)
113  {
114  	if (!(flags & MSG_WAITALL))
115  		return false;
116  	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
117  }
118  
119  static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
120  {
121  	if (kmsg->free_iov) {
122  		kfree(kmsg->free_iov);
123  		kmsg->free_iov_nr = 0;
124  		kmsg->free_iov = NULL;
125  	}
126  }
127  
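/*
 * Recycle the async msghdr into the per-ring cache when the ring lock is
 * held. For an unlocked (io-wq) issue the cache can't be used safely, so
 * just free any allocated iovec instead.
 */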
128  static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
129  {
130  	struct io_async_msghdr *hdr = req->async_data;
131  	struct iovec *iov;
132  
133  	/* can't recycle, ensure we free the iovec if we have one */
134  	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
135  		io_netmsg_iovec_free(hdr);
136  		return;
137  	}
138  
139  	/* Let normal cleanup path reap it if we fail adding to the cache */
140  	iov = hdr->free_iov;
141  	if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
142  		if (iov)
143  			kasan_mempool_poison_object(iov);
144  		req->async_data = NULL;
145  		req->flags &= ~REQ_F_ASYNC_DATA;
146  	}
147  }
148  
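/*
 * Get an async msghdr from the per-ring cache if possible, unpoisoning any
 * cached iovec for KASAN, otherwise fall back to a fresh allocation.
 */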
149  static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
150  {
151  	struct io_ring_ctx *ctx = req->ctx;
152  	struct io_async_msghdr *hdr;
153  
154  	hdr = io_alloc_cache_get(&ctx->netmsg_cache);
155  	if (hdr) {
156  		if (hdr->free_iov) {
157  			kasan_mempool_unpoison_object(hdr->free_iov,
158  				hdr->free_iov_nr * sizeof(struct iovec));
159  			req->flags |= REQ_F_NEED_CLEANUP;
160  		}
161  		req->flags |= REQ_F_ASYNC_DATA;
162  		req->async_data = hdr;
163  		return hdr;
164  	}
165  
166  	if (!io_alloc_async_data(req)) {
167  		hdr = req->async_data;
168  		hdr->free_iov_nr = 0;
169  		hdr->free_iov = NULL;
170  		return hdr;
171  	}
172  	return NULL;
173  }
174  
175  /* assign new iovec to kmsg, if we need to */
176  static int io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg,
177  			     struct iovec *iov)
178  {
179  	if (iov) {
180  		req->flags |= REQ_F_NEED_CLEANUP;
181  		kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs;
182  		if (kmsg->free_iov)
183  			kfree(kmsg->free_iov);
184  		kmsg->free_iov = iov;
185  	}
186  	return 0;
187  }
188  
189  static inline void io_mshot_prep_retry(struct io_kiocb *req,
190  				       struct io_async_msghdr *kmsg)
191  {
192  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
193  
194  	req->flags &= ~REQ_F_BL_EMPTY;
195  	sr->done_io = 0;
196  	sr->len = 0; /* get from the provided buffer */
197  	req->buf_index = sr->buf_group;
198  }
199  
200  #ifdef CONFIG_COMPAT
201  static int io_compat_msg_copy_hdr(struct io_kiocb *req,
202  				  struct io_async_msghdr *iomsg,
203  				  struct compat_msghdr *msg, int ddir)
204  {
205  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
206  	struct compat_iovec __user *uiov;
207  	struct iovec *iov;
208  	int ret, nr_segs;
209  
210  	if (iomsg->free_iov) {
211  		nr_segs = iomsg->free_iov_nr;
212  		iov = iomsg->free_iov;
213  	} else {
214  		iov = &iomsg->fast_iov;
215  		nr_segs = 1;
216  	}
217  
218  	if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
219  		return -EFAULT;
220  
221  	uiov = compat_ptr(msg->msg_iov);
222  	if (req->flags & REQ_F_BUFFER_SELECT) {
223  		compat_ssize_t clen;
224  
225  		if (msg->msg_iovlen == 0) {
226  			sr->len = iov->iov_len = 0;
227  			iov->iov_base = NULL;
228  		} else if (msg->msg_iovlen > 1) {
229  			return -EINVAL;
230  		} else {
231  			if (!access_ok(uiov, sizeof(*uiov)))
232  				return -EFAULT;
233  			if (__get_user(clen, &uiov->iov_len))
234  				return -EFAULT;
235  			if (clen < 0)
236  				return -EINVAL;
237  			sr->len = clen;
238  		}
239  
240  		return 0;
241  	}
242  
243  	ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen,
244  				nr_segs, &iov, &iomsg->msg.msg_iter, true);
245  	if (unlikely(ret < 0))
246  		return ret;
247  
248  	return io_net_vec_assign(req, iomsg, iov);
249  }
250  #endif
251  
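/*
 * Copy the user_msghdr in via user_access_begin()/unsafe_get_user(). For
 * provided buffers only the first iovec length is needed; otherwise import
 * the full iovec into the msg iterator.
 */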
252  static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
253  			   struct user_msghdr *msg, int ddir)
254  {
255  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
256  	struct iovec *iov;
257  	int ret, nr_segs;
258  
259  	if (iomsg->free_iov) {
260  		nr_segs = iomsg->free_iov_nr;
261  		iov = iomsg->free_iov;
262  	} else {
263  		iov = &iomsg->fast_iov;
264  		nr_segs = 1;
265  	}
266  
267  	if (!user_access_begin(sr->umsg, sizeof(*sr->umsg)))
268  		return -EFAULT;
269  
270  	ret = -EFAULT;
271  	unsafe_get_user(msg->msg_name, &sr->umsg->msg_name, ua_end);
272  	unsafe_get_user(msg->msg_namelen, &sr->umsg->msg_namelen, ua_end);
273  	unsafe_get_user(msg->msg_iov, &sr->umsg->msg_iov, ua_end);
274  	unsafe_get_user(msg->msg_iovlen, &sr->umsg->msg_iovlen, ua_end);
275  	unsafe_get_user(msg->msg_control, &sr->umsg->msg_control, ua_end);
276  	unsafe_get_user(msg->msg_controllen, &sr->umsg->msg_controllen, ua_end);
277  	msg->msg_flags = 0;
278  
279  	if (req->flags & REQ_F_BUFFER_SELECT) {
280  		if (msg->msg_iovlen == 0) {
281  			sr->len = iov->iov_len = 0;
282  			iov->iov_base = NULL;
283  		} else if (msg->msg_iovlen > 1) {
284  			ret = -EINVAL;
285  			goto ua_end;
286  		} else {
287  			/* we only need the length for provided buffers */
288  			if (!access_ok(&msg->msg_iov[0].iov_len, sizeof(__kernel_size_t)))
289  				goto ua_end;
290  			unsafe_get_user(iov->iov_len, &msg->msg_iov[0].iov_len,
291  					ua_end);
292  			sr->len = iov->iov_len;
293  		}
294  		ret = 0;
295  ua_end:
296  		user_access_end();
297  		return ret;
298  	}
299  
300  	user_access_end();
301  	ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs,
302  				&iov, &iomsg->msg.msg_iter, false);
303  	if (unlikely(ret < 0))
304  		return ret;
305  
306  	return io_net_vec_assign(req, iomsg, iov);
307  }
308  
309  static int io_sendmsg_copy_hdr(struct io_kiocb *req,
310  			       struct io_async_msghdr *iomsg)
311  {
312  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
313  	struct user_msghdr msg;
314  	int ret;
315  
316  	iomsg->msg.msg_name = &iomsg->addr;
317  	iomsg->msg.msg_iter.nr_segs = 0;
318  
319  #ifdef CONFIG_COMPAT
320  	if (unlikely(req->ctx->compat)) {
321  		struct compat_msghdr cmsg;
322  
323  		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE);
324  		if (unlikely(ret))
325  			return ret;
326  
327  		return __get_compat_msghdr(&iomsg->msg, &cmsg, NULL);
328  	}
329  #endif
330  
331  	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE);
332  	if (unlikely(ret))
333  		return ret;
334  
335  	ret = __copy_msghdr(&iomsg->msg, &msg, NULL);
336  
337  	/* save msg_control as sys_sendmsg() overwrites it */
338  	sr->msg_control = iomsg->msg.msg_control_user;
339  	return ret;
340  }
341  
342  void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
343  {
344  	struct io_async_msghdr *io = req->async_data;
345  
346  	io_netmsg_iovec_free(io);
347  }
348  
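/*
 * Prepare a plain (non-msghdr) send: copy in the destination address if one
 * was given, and map the user buffer unless a provided buffer will be
 * selected at issue time.
 */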
349  static int io_send_setup(struct io_kiocb *req)
350  {
351  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
352  	struct io_async_msghdr *kmsg = req->async_data;
353  	int ret;
354  
355  	kmsg->msg.msg_name = NULL;
356  	kmsg->msg.msg_namelen = 0;
357  	kmsg->msg.msg_control = NULL;
358  	kmsg->msg.msg_controllen = 0;
359  	kmsg->msg.msg_ubuf = NULL;
360  
361  	if (sr->addr) {
362  		ret = move_addr_to_kernel(sr->addr, sr->addr_len, &kmsg->addr);
363  		if (unlikely(ret < 0))
364  			return ret;
365  		kmsg->msg.msg_name = &kmsg->addr;
366  		kmsg->msg.msg_namelen = sr->addr_len;
367  	}
368  	if (!io_do_buffer_select(req)) {
369  		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
370  				  &kmsg->msg.msg_iter);
371  		if (unlikely(ret < 0))
372  			return ret;
373  	}
374  	return 0;
375  }
376  
377  static int io_sendmsg_prep_setup(struct io_kiocb *req, int is_msg)
378  {
379  	struct io_async_msghdr *kmsg;
380  	int ret;
381  
382  	kmsg = io_msg_alloc_async(req);
383  	if (unlikely(!kmsg))
384  		return -ENOMEM;
385  	if (!is_msg)
386  		return io_send_setup(req);
387  	ret = io_sendmsg_copy_hdr(req, kmsg);
388  	if (!ret)
389  		req->flags |= REQ_F_NEED_CLEANUP;
390  	return ret;
391  }
392  
393  #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE)
394  
395  int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
396  {
397  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
398  
399  	sr->done_io = 0;
400  
401  	if (req->opcode == IORING_OP_SEND) {
402  		if (READ_ONCE(sqe->__pad3[0]))
403  			return -EINVAL;
404  		sr->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
405  		sr->addr_len = READ_ONCE(sqe->addr_len);
406  	} else if (sqe->addr2 || sqe->file_index) {
407  		return -EINVAL;
408  	}
409  
410  	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
411  	sr->len = READ_ONCE(sqe->len);
412  	sr->flags = READ_ONCE(sqe->ioprio);
413  	if (sr->flags & ~SENDMSG_FLAGS)
414  		return -EINVAL;
415  	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
416  	if (sr->msg_flags & MSG_DONTWAIT)
417  		req->flags |= REQ_F_NOWAIT;
418  	if (sr->flags & IORING_RECVSEND_BUNDLE) {
419  		if (req->opcode == IORING_OP_SENDMSG)
420  			return -EINVAL;
421  		if (!(req->flags & REQ_F_BUFFER_SELECT))
422  			return -EINVAL;
423  		sr->msg_flags |= MSG_WAITALL;
424  		sr->buf_group = req->buf_index;
425  		req->buf_list = NULL;
426  	}
427  	if (req->flags & REQ_F_BUFFER_SELECT && sr->len)
428  		return -EINVAL;
429  
430  #ifdef CONFIG_COMPAT
431  	if (req->ctx->compat)
432  		sr->msg_flags |= MSG_CMSG_COMPAT;
433  #endif
434  	return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG);
435  }
436  
437  static void io_req_msg_cleanup(struct io_kiocb *req,
438  			       unsigned int issue_flags)
439  {
440  	req->flags &= ~REQ_F_NEED_CLEANUP;
441  	io_netmsg_recycle(req, issue_flags);
442  }
443  
444  /*
445   * For bundle completions, we need to figure out how many segments we consumed.
446   * A bundle could be using a single ITER_UBUF if that's all we mapped, or it
447   * could be using an ITER_IOVEC. If the latter, then if we consumed all of
448   * the segments, then it's a trivial question to answer. If we have residual
449   * data in the iter, then loop the segments to figure out how much we
450   * transferred.
451   */
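/*
 * For example, a short bundle send over three 4k iovecs that moved 6k of
 * data counts 4k from the first segment and 2k from the second, so two
 * buffers were consumed.
 */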
452  static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
453  {
454  	struct iovec *iov;
455  	int nbufs;
456  
457  	/* no data is always zero segments, and a ubuf is always 1 segment */
458  	if (ret <= 0)
459  		return 0;
460  	if (iter_is_ubuf(&kmsg->msg.msg_iter))
461  		return 1;
462  
463  	iov = kmsg->free_iov;
464  	if (!iov)
465  		iov = &kmsg->fast_iov;
466  
467  	/* if all data was transferred, it's basic pointer math */
468  	if (!iov_iter_count(&kmsg->msg.msg_iter))
469  		return iter_iov(&kmsg->msg.msg_iter) - iov;
470  
471  	/* short transfer, count segments */
472  	nbufs = 0;
473  	do {
474  		int this_len = min_t(int, iov[nbufs].iov_len, ret);
475  
476  		nbufs++;
477  		ret -= this_len;
478  	} while (ret);
479  
480  	return nbufs;
481  }
482  
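/*
 * Complete a send, or, for a bundle, post a CQE and arm a retry so the rest
 * of the buffer list can be sent. Returns true when the request is done.
 */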
483  static inline bool io_send_finish(struct io_kiocb *req, int *ret,
484  				  struct io_async_msghdr *kmsg,
485  				  unsigned issue_flags)
486  {
487  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
488  	bool bundle_finished = *ret <= 0;
489  	unsigned int cflags;
490  
491  	if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
492  		cflags = io_put_kbuf(req, issue_flags);
493  		goto finish;
494  	}
495  
496  	cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags);
497  
498  	if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
499  		goto finish;
500  
501  	/*
502  	 * Fill CQE for this send and see if we should keep trying to
503  	 * send from this socket.
504  	 */
505  	if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
506  		io_mshot_prep_retry(req, kmsg);
507  		return false;
508  	}
509  
510  	/* Otherwise stop bundle and use the current result. */
511  finish:
512  	io_req_set_res(req, *ret, cflags);
513  	*ret = IOU_OK;
514  	return true;
515  }
516  
517  int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
518  {
519  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
520  	struct io_async_msghdr *kmsg = req->async_data;
521  	struct socket *sock;
522  	unsigned flags;
523  	int min_ret = 0;
524  	int ret;
525  
526  	sock = sock_from_file(req->file);
527  	if (unlikely(!sock))
528  		return -ENOTSOCK;
529  
530  	if (!(req->flags & REQ_F_POLLED) &&
531  	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
532  		return -EAGAIN;
533  
534  	flags = sr->msg_flags;
535  	if (issue_flags & IO_URING_F_NONBLOCK)
536  		flags |= MSG_DONTWAIT;
537  	if (flags & MSG_WAITALL)
538  		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
539  
540  	kmsg->msg.msg_control_user = sr->msg_control;
541  
542  	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
543  
544  	if (ret < min_ret) {
545  		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
546  			return -EAGAIN;
547  		if (ret > 0 && io_net_retry(sock, flags)) {
548  			kmsg->msg.msg_controllen = 0;
549  			kmsg->msg.msg_control = NULL;
550  			sr->done_io += ret;
551  			req->flags |= REQ_F_BL_NO_RECYCLE;
552  			return -EAGAIN;
553  		}
554  		if (ret == -ERESTARTSYS)
555  			ret = -EINTR;
556  		req_set_fail(req);
557  	}
558  	io_req_msg_cleanup(req, issue_flags);
559  	if (ret >= 0)
560  		ret += sr->done_io;
561  	else if (sr->done_io)
562  		ret = sr->done_io;
563  	io_req_set_res(req, ret, 0);
564  	return IOU_OK;
565  }
566  
567  int io_send(struct io_kiocb *req, unsigned int issue_flags)
568  {
569  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
570  	struct io_async_msghdr *kmsg = req->async_data;
571  	struct socket *sock;
572  	unsigned flags;
573  	int min_ret = 0;
574  	int ret;
575  
576  	sock = sock_from_file(req->file);
577  	if (unlikely(!sock))
578  		return -ENOTSOCK;
579  
580  	if (!(req->flags & REQ_F_POLLED) &&
581  	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
582  		return -EAGAIN;
583  
584  	flags = sr->msg_flags;
585  	if (issue_flags & IO_URING_F_NONBLOCK)
586  		flags |= MSG_DONTWAIT;
587  
588  retry_bundle:
589  	if (io_do_buffer_select(req)) {
590  		struct buf_sel_arg arg = {
591  			.iovs = &kmsg->fast_iov,
592  			.max_len = INT_MAX,
593  			.nr_iovs = 1,
594  			.mode = KBUF_MODE_EXPAND,
595  		};
596  
597  		if (kmsg->free_iov) {
598  			arg.nr_iovs = kmsg->free_iov_nr;
599  			arg.iovs = kmsg->free_iov;
600  			arg.mode |= KBUF_MODE_FREE;
601  		}
602  
603  		if (!(sr->flags & IORING_RECVSEND_BUNDLE))
604  			arg.nr_iovs = 1;
605  
606  		ret = io_buffers_select(req, &arg, issue_flags);
607  		if (unlikely(ret < 0))
608  			return ret;
609  
610  		sr->len = arg.out_len;
611  		iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret,
612  				arg.out_len);
613  		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
614  			kmsg->free_iov_nr = ret;
615  			kmsg->free_iov = arg.iovs;
616  		}
617  	}
618  
619  	/*
620  	 * If MSG_WAITALL is set, or this is a bundle send, then we need
621  	 * the full amount. If just the bundle flag is set and we do a short
622  	 * send, then we complete the bundle sequence rather than continue on.
623  	 */
624  	if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE)
625  		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
626  
627  	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
628  	kmsg->msg.msg_flags = flags;
629  	ret = sock_sendmsg(sock, &kmsg->msg);
630  	if (ret < min_ret) {
631  		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
632  			return -EAGAIN;
633  
634  		if (ret > 0 && io_net_retry(sock, flags)) {
635  			sr->len -= ret;
636  			sr->buf += ret;
637  			sr->done_io += ret;
638  			req->flags |= REQ_F_BL_NO_RECYCLE;
639  			return -EAGAIN;
640  		}
641  		if (ret == -ERESTARTSYS)
642  			ret = -EINTR;
643  		req_set_fail(req);
644  	}
645  	if (ret >= 0)
646  		ret += sr->done_io;
647  	else if (sr->done_io)
648  		ret = sr->done_io;
649  
650  	if (!io_send_finish(req, &ret, kmsg, issue_flags))
651  		goto retry_bundle;
652  
653  	io_req_msg_cleanup(req, issue_flags);
654  	return ret;
655  }
656  
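/*
 * For multishot recvmsg with provided buffers, validate that the
 * io_uring_recvmsg_out header plus name and control data can't overflow,
 * and stash the lengths for carving up the buffer at issue time.
 */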
657  static int io_recvmsg_mshot_prep(struct io_kiocb *req,
658  				 struct io_async_msghdr *iomsg,
659  				 int namelen, size_t controllen)
660  {
661  	if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) ==
662  			  (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) {
663  		int hdr;
664  
665  		if (unlikely(namelen < 0))
666  			return -EOVERFLOW;
667  		if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
668  					namelen, &hdr))
669  			return -EOVERFLOW;
670  		if (check_add_overflow(hdr, controllen, &hdr))
671  			return -EOVERFLOW;
672  
673  		iomsg->namelen = namelen;
674  		iomsg->controllen = controllen;
675  		return 0;
676  	}
677  
678  	return 0;
679  }
680  
681  static int io_recvmsg_copy_hdr(struct io_kiocb *req,
682  			       struct io_async_msghdr *iomsg)
683  {
684  	struct user_msghdr msg;
685  	int ret;
686  
687  	iomsg->msg.msg_name = &iomsg->addr;
688  	iomsg->msg.msg_iter.nr_segs = 0;
689  
690  #ifdef CONFIG_COMPAT
691  	if (unlikely(req->ctx->compat)) {
692  		struct compat_msghdr cmsg;
693  
694  		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST);
695  		if (unlikely(ret))
696  			return ret;
697  
698  		ret = __get_compat_msghdr(&iomsg->msg, &cmsg, &iomsg->uaddr);
699  		if (unlikely(ret))
700  			return ret;
701  
702  		return io_recvmsg_mshot_prep(req, iomsg, cmsg.msg_namelen,
703  						cmsg.msg_controllen);
704  	}
705  #endif
706  
707  	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST);
708  	if (unlikely(ret))
709  		return ret;
710  
711  	ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);
712  	if (unlikely(ret))
713  		return ret;
714  
715  	return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
716  					msg.msg_controllen);
717  }
718  
719  static int io_recvmsg_prep_setup(struct io_kiocb *req)
720  {
721  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
722  	struct io_async_msghdr *kmsg;
723  	int ret;
724  
725  	kmsg = io_msg_alloc_async(req);
726  	if (unlikely(!kmsg))
727  		return -ENOMEM;
728  
729  	if (req->opcode == IORING_OP_RECV) {
730  		kmsg->msg.msg_name = NULL;
731  		kmsg->msg.msg_namelen = 0;
732  		kmsg->msg.msg_control = NULL;
733  		kmsg->msg.msg_get_inq = 1;
734  		kmsg->msg.msg_controllen = 0;
735  		kmsg->msg.msg_iocb = NULL;
736  		kmsg->msg.msg_ubuf = NULL;
737  
738  		if (!io_do_buffer_select(req)) {
739  			ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
740  					  &kmsg->msg.msg_iter);
741  			if (unlikely(ret))
742  				return ret;
743  		}
744  		return 0;
745  	}
746  
747  	ret = io_recvmsg_copy_hdr(req, kmsg);
748  	if (!ret)
749  		req->flags |= REQ_F_NEED_CLEANUP;
750  	return ret;
751  }
752  
753  #define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \
754  			IORING_RECVSEND_BUNDLE)
755  
756  int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
757  {
758  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
759  
760  	sr->done_io = 0;
761  
762  	if (unlikely(sqe->file_index || sqe->addr2))
763  		return -EINVAL;
764  
765  	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
766  	sr->len = READ_ONCE(sqe->len);
767  	sr->flags = READ_ONCE(sqe->ioprio);
768  	if (sr->flags & ~RECVMSG_FLAGS)
769  		return -EINVAL;
770  	sr->msg_flags = READ_ONCE(sqe->msg_flags);
771  	if (sr->msg_flags & MSG_DONTWAIT)
772  		req->flags |= REQ_F_NOWAIT;
773  	if (sr->msg_flags & MSG_ERRQUEUE)
774  		req->flags |= REQ_F_CLEAR_POLLIN;
775  	if (req->flags & REQ_F_BUFFER_SELECT) {
776  		/*
777  		 * Store the buffer group for this multishot receive separately,
778  		 * as if we end up doing an io-wq based issue that selects a
779  		 * buffer, it has to be committed immediately and that will
780  		 * clear ->buf_list. This means we lose the link to the buffer
781  		 * list, and the eventual buffer put on completion then cannot
782  		 * restore it.
783  		 */
784  		sr->buf_group = req->buf_index;
785  		req->buf_list = NULL;
786  	}
787  	if (sr->flags & IORING_RECV_MULTISHOT) {
788  		if (!(req->flags & REQ_F_BUFFER_SELECT))
789  			return -EINVAL;
790  		if (sr->msg_flags & MSG_WAITALL)
791  			return -EINVAL;
792  		if (req->opcode == IORING_OP_RECV && sr->len)
793  			return -EINVAL;
794  		req->flags |= REQ_F_APOLL_MULTISHOT;
795  	}
796  	if (sr->flags & IORING_RECVSEND_BUNDLE) {
797  		if (req->opcode == IORING_OP_RECVMSG)
798  			return -EINVAL;
799  	}
800  
801  #ifdef CONFIG_COMPAT
802  	if (req->ctx->compat)
803  		sr->msg_flags |= MSG_CMSG_COMPAT;
804  #endif
805  	sr->nr_multishot_loops = 0;
806  	return io_recvmsg_prep_setup(req);
807  }
808  
809  /*
810   * Finishes io_recv and io_recvmsg.
811   *
812   * Returns true if it is actually finished, or false if it should run
813   * again (for multishot).
814   */
815  static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
816  				  struct io_async_msghdr *kmsg,
817  				  bool mshot_finished, unsigned issue_flags)
818  {
819  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
820  	unsigned int cflags;
821  
822  	if (sr->flags & IORING_RECVSEND_BUNDLE)
823  		cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret),
824  				      issue_flags);
825  	else
826  		cflags = io_put_kbuf(req, issue_flags);
827  
828  	if (kmsg->msg.msg_inq > 0)
829  		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
830  
831  	/* bundle with no more immediate buffers, we're done */
832  	if (sr->flags & IORING_RECVSEND_BUNDLE && req->flags & REQ_F_BL_EMPTY)
833  		goto finish;
834  
835  	/*
836  	 * Fill CQE for this receive and see if we should keep trying to
837  	 * receive from this socket.
838  	 */
839  	if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
840  	    io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
841  		int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE;
842  
843  		io_mshot_prep_retry(req, kmsg);
844  		/* Known not-empty or unknown state, retry */
845  		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
846  			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
847  				return false;
848  			/* mshot retries exceeded, force a requeue */
849  			sr->nr_multishot_loops = 0;
850  			mshot_retry_ret = IOU_REQUEUE;
851  		}
852  		if (issue_flags & IO_URING_F_MULTISHOT)
853  			*ret = mshot_retry_ret;
854  		else
855  			*ret = -EAGAIN;
856  		return true;
857  	}
858  
859  	/* Finish the request / stop multishot. */
860  finish:
861  	io_req_set_res(req, *ret, cflags);
862  
863  	if (issue_flags & IO_URING_F_MULTISHOT)
864  		*ret = IOU_STOP_MULTISHOT;
865  	else
866  		*ret = IOU_OK;
867  	io_req_msg_cleanup(req, issue_flags);
868  	return true;
869  }
870  
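/*
 * Carve the selected buffer into the io_uring_recvmsg_out header, optional
 * control data, and the remaining payload area. Fails with -EFAULT if the
 * buffer is too small to hold the header.
 */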
871  static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
872  				     struct io_sr_msg *sr, void __user **buf,
873  				     size_t *len)
874  {
875  	unsigned long ubuf = (unsigned long) *buf;
876  	unsigned long hdr;
877  
878  	hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
879  		kmsg->controllen;
880  	if (*len < hdr)
881  		return -EFAULT;
882  
883  	if (kmsg->controllen) {
884  		unsigned long control = ubuf + hdr - kmsg->controllen;
885  
886  		kmsg->msg.msg_control_user = (void __user *) control;
887  		kmsg->msg.msg_controllen = kmsg->controllen;
888  	}
889  
890  	sr->buf = *buf; /* stash for later copy */
891  	*buf = (void __user *) (ubuf + hdr);
892  	kmsg->payloadlen = *len = *len - hdr;
893  	return 0;
894  }
895  
896  struct io_recvmsg_multishot_hdr {
897  	struct io_uring_recvmsg_out msg;
898  	struct sockaddr_storage addr;
899  };
900  
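/*
 * Do one multishot receive into the carved-up buffer: receive the payload,
 * then copy the io_uring_recvmsg_out header and the source address (capped
 * to the reserved namelen) back to the front of the provided buffer.
 */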
901  static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
902  				struct io_async_msghdr *kmsg,
903  				unsigned int flags, bool *finished)
904  {
905  	int err;
906  	int copy_len;
907  	struct io_recvmsg_multishot_hdr hdr;
908  
909  	if (kmsg->namelen)
910  		kmsg->msg.msg_name = &hdr.addr;
911  	kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
912  	kmsg->msg.msg_namelen = 0;
913  
914  	if (sock->file->f_flags & O_NONBLOCK)
915  		flags |= MSG_DONTWAIT;
916  
917  	err = sock_recvmsg(sock, &kmsg->msg, flags);
918  	*finished = err <= 0;
919  	if (err < 0)
920  		return err;
921  
922  	hdr.msg = (struct io_uring_recvmsg_out) {
923  		.controllen = kmsg->controllen - kmsg->msg.msg_controllen,
924  		.flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
925  	};
926  
927  	hdr.msg.payloadlen = err;
928  	if (err > kmsg->payloadlen)
929  		err = kmsg->payloadlen;
930  
931  	copy_len = sizeof(struct io_uring_recvmsg_out);
932  	if (kmsg->msg.msg_namelen > kmsg->namelen)
933  		copy_len += kmsg->namelen;
934  	else
935  		copy_len += kmsg->msg.msg_namelen;
936  
937  	/*
938  	 *      "fromlen shall refer to the value before truncation.."
939  	 *                      1003.1g
940  	 */
941  	hdr.msg.namelen = kmsg->msg.msg_namelen;
942  
943  	/* ensure that there is no gap between hdr and sockaddr_storage */
944  	BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
945  		     sizeof(struct io_uring_recvmsg_out));
946  	if (copy_to_user(io->buf, &hdr, copy_len)) {
947  		*finished = true;
948  		return -EFAULT;
949  	}
950  
951  	return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
952  			kmsg->controllen + err;
953  }
954  
955  int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
956  {
957  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
958  	struct io_async_msghdr *kmsg = req->async_data;
959  	struct socket *sock;
960  	unsigned flags;
961  	int ret, min_ret = 0;
962  	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
963  	bool mshot_finished = true;
964  
965  	sock = sock_from_file(req->file);
966  	if (unlikely(!sock))
967  		return -ENOTSOCK;
968  
969  	if (!(req->flags & REQ_F_POLLED) &&
970  	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
971  		return -EAGAIN;
972  
973  	flags = sr->msg_flags;
974  	if (force_nonblock)
975  		flags |= MSG_DONTWAIT;
976  
977  retry_multishot:
978  	if (io_do_buffer_select(req)) {
979  		void __user *buf;
980  		size_t len = sr->len;
981  
982  		buf = io_buffer_select(req, &len, issue_flags);
983  		if (!buf)
984  			return -ENOBUFS;
985  
986  		if (req->flags & REQ_F_APOLL_MULTISHOT) {
987  			ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len);
988  			if (ret) {
989  				io_kbuf_recycle(req, issue_flags);
990  				return ret;
991  			}
992  		}
993  
994  		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len);
995  	}
996  
997  	kmsg->msg.msg_get_inq = 1;
998  	kmsg->msg.msg_inq = -1;
999  	if (req->flags & REQ_F_APOLL_MULTISHOT) {
1000  		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
1001  					   &mshot_finished);
1002  	} else {
1003  		/* disable partial retry for recvmsg with cmsg attached */
1004  		if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
1005  			min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1006  
1007  		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
1008  					 kmsg->uaddr, flags);
1009  	}
1010  
1011  	if (ret < min_ret) {
1012  		if (ret == -EAGAIN && force_nonblock) {
1013  			if (issue_flags & IO_URING_F_MULTISHOT) {
1014  				io_kbuf_recycle(req, issue_flags);
1015  				return IOU_ISSUE_SKIP_COMPLETE;
1016  			}
1017  			return -EAGAIN;
1018  		}
1019  		if (ret > 0 && io_net_retry(sock, flags)) {
1020  			sr->done_io += ret;
1021  			req->flags |= REQ_F_BL_NO_RECYCLE;
1022  			return -EAGAIN;
1023  		}
1024  		if (ret == -ERESTARTSYS)
1025  			ret = -EINTR;
1026  		req_set_fail(req);
1027  	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
1028  		req_set_fail(req);
1029  	}
1030  
1031  	if (ret > 0)
1032  		ret += sr->done_io;
1033  	else if (sr->done_io)
1034  		ret = sr->done_io;
1035  	else
1036  		io_kbuf_recycle(req, issue_flags);
1037  
1038  	if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags))
1039  		goto retry_multishot;
1040  
1041  	return ret;
1042  }
1043  
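/*
 * Pick buffers for a recv: for bundles on a locked ring, peek a batch of
 * provided buffers sized by the pending socket data; otherwise select a
 * single buffer and map it.
 */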
1044  static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
1045  			      size_t *len, unsigned int issue_flags)
1046  {
1047  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1048  	int ret;
1049  
1050  	/*
1051  	 * If the ring isn't locked, then don't use the peek interface
1052  	 * to grab multiple buffers as we will lock/unlock between
1053  	 * this selection and posting the buffers.
1054  	 */
1055  	if (!(issue_flags & IO_URING_F_UNLOCKED) &&
1056  	    sr->flags & IORING_RECVSEND_BUNDLE) {
1057  		struct buf_sel_arg arg = {
1058  			.iovs = &kmsg->fast_iov,
1059  			.nr_iovs = 1,
1060  			.mode = KBUF_MODE_EXPAND,
1061  		};
1062  
1063  		if (kmsg->free_iov) {
1064  			arg.nr_iovs = kmsg->free_iov_nr;
1065  			arg.iovs = kmsg->free_iov;
1066  			arg.mode |= KBUF_MODE_FREE;
1067  		}
1068  
1069  		if (kmsg->msg.msg_inq > 0)
1070  			arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq);
1071  
1072  		ret = io_buffers_peek(req, &arg);
1073  		if (unlikely(ret < 0))
1074  			return ret;
1075  
1076  		/* special case 1 vec, can be a fast path */
1077  		if (ret == 1) {
1078  			sr->buf = arg.iovs[0].iov_base;
1079  			sr->len = arg.iovs[0].iov_len;
1080  			goto map_ubuf;
1081  		}
1082  		iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
1083  				arg.out_len);
1084  		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
1085  			kmsg->free_iov_nr = ret;
1086  			kmsg->free_iov = arg.iovs;
1087  		}
1088  	} else {
1089  		void __user *buf;
1090  
1091  		*len = sr->len;
1092  		buf = io_buffer_select(req, len, issue_flags);
1093  		if (!buf)
1094  			return -ENOBUFS;
1095  		sr->buf = buf;
1096  		sr->len = *len;
1097  map_ubuf:
1098  		ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
1099  				  &kmsg->msg.msg_iter);
1100  		if (unlikely(ret))
1101  			return ret;
1102  	}
1103  
1104  	return 0;
1105  }
1106  
1107  int io_recv(struct io_kiocb *req, unsigned int issue_flags)
1108  {
1109  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1110  	struct io_async_msghdr *kmsg = req->async_data;
1111  	struct socket *sock;
1112  	unsigned flags;
1113  	int ret, min_ret = 0;
1114  	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1115  	size_t len = sr->len;
1116  
1117  	if (!(req->flags & REQ_F_POLLED) &&
1118  	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1119  		return -EAGAIN;
1120  
1121  	sock = sock_from_file(req->file);
1122  	if (unlikely(!sock))
1123  		return -ENOTSOCK;
1124  
1125  	flags = sr->msg_flags;
1126  	if (force_nonblock)
1127  		flags |= MSG_DONTWAIT;
1128  
1129  retry_multishot:
1130  	kmsg->msg.msg_inq = -1;
1131  	kmsg->msg.msg_flags = 0;
1132  
1133  	if (io_do_buffer_select(req)) {
1134  		ret = io_recv_buf_select(req, kmsg, &len, issue_flags);
1135  		if (unlikely(ret))
1136  			goto out_free;
1137  		sr->buf = NULL;
1138  	}
1139  
1140  	if (flags & MSG_WAITALL)
1141  		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1142  
1143  	ret = sock_recvmsg(sock, &kmsg->msg, flags);
1144  	if (ret < min_ret) {
1145  		if (ret == -EAGAIN && force_nonblock) {
1146  			if (issue_flags & IO_URING_F_MULTISHOT) {
1147  				io_kbuf_recycle(req, issue_flags);
1148  				return IOU_ISSUE_SKIP_COMPLETE;
1149  			}
1150  
1151  			return -EAGAIN;
1152  		}
1153  		if (ret > 0 && io_net_retry(sock, flags)) {
1154  			sr->len -= ret;
1155  			sr->buf += ret;
1156  			sr->done_io += ret;
1157  			req->flags |= REQ_F_BL_NO_RECYCLE;
1158  			return -EAGAIN;
1159  		}
1160  		if (ret == -ERESTARTSYS)
1161  			ret = -EINTR;
1162  		req_set_fail(req);
1163  	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
1164  out_free:
1165  		req_set_fail(req);
1166  	}
1167  
1168  	if (ret > 0)
1169  		ret += sr->done_io;
1170  	else if (sr->done_io)
1171  		ret = sr->done_io;
1172  	else
1173  		io_kbuf_recycle(req, issue_flags);
1174  
1175  	if (!io_recv_finish(req, &ret, kmsg, ret <= 0, issue_flags))
1176  		goto retry_multishot;
1177  
1178  	return ret;
1179  }
1180  
1181  void io_send_zc_cleanup(struct io_kiocb *req)
1182  {
1183  	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1184  	struct io_async_msghdr *io = req->async_data;
1185  
1186  	if (req_has_async_data(req))
1187  		io_netmsg_iovec_free(io);
1188  	if (zc->notif) {
1189  		io_notif_flush(zc->notif);
1190  		zc->notif = NULL;
1191  	}
1192  }
1193  
1194  #define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
1195  #define IO_ZC_FLAGS_VALID  (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE)
1196  
1197  int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1198  {
1199  	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1200  	struct io_ring_ctx *ctx = req->ctx;
1201  	struct io_kiocb *notif;
1202  
1203  	zc->done_io = 0;
1204  	req->flags |= REQ_F_POLL_NO_LAZY;
1205  
1206  	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
1207  		return -EINVAL;
1208  	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
1209  	if (req->flags & REQ_F_CQE_SKIP)
1210  		return -EINVAL;
1211  
1212  	notif = zc->notif = io_alloc_notif(ctx);
1213  	if (!notif)
1214  		return -ENOMEM;
1215  	notif->cqe.user_data = req->cqe.user_data;
1216  	notif->cqe.res = 0;
1217  	notif->cqe.flags = IORING_CQE_F_NOTIF;
1218  	req->flags |= REQ_F_NEED_CLEANUP;
1219  
1220  	zc->flags = READ_ONCE(sqe->ioprio);
1221  	if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
1222  		if (zc->flags & ~IO_ZC_FLAGS_VALID)
1223  			return -EINVAL;
1224  		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
1225  			struct io_notif_data *nd = io_notif_to_data(notif);
1226  
1227  			nd->zc_report = true;
1228  			nd->zc_used = false;
1229  			nd->zc_copied = false;
1230  		}
1231  	}
1232  
1233  	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
1234  		unsigned idx = READ_ONCE(sqe->buf_index);
1235  
1236  		if (unlikely(idx >= ctx->nr_user_bufs))
1237  			return -EFAULT;
1238  		idx = array_index_nospec(idx, ctx->nr_user_bufs);
1239  		req->imu = READ_ONCE(ctx->user_bufs[idx]);
1240  		io_req_set_rsrc_node(notif, ctx, 0);
1241  	}
1242  
1243  	if (req->opcode == IORING_OP_SEND_ZC) {
1244  		if (READ_ONCE(sqe->__pad3[0]))
1245  			return -EINVAL;
1246  		zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
1247  		zc->addr_len = READ_ONCE(sqe->addr_len);
1248  	} else {
1249  		if (unlikely(sqe->addr2 || sqe->file_index))
1250  			return -EINVAL;
1251  		if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF))
1252  			return -EINVAL;
1253  	}
1254  
1255  	zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
1256  	zc->len = READ_ONCE(sqe->len);
1257  	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
1258  	if (zc->msg_flags & MSG_DONTWAIT)
1259  		req->flags |= REQ_F_NOWAIT;
1260  
1261  #ifdef CONFIG_COMPAT
1262  	if (req->ctx->compat)
1263  		zc->msg_flags |= MSG_CMSG_COMPAT;
1264  #endif
1265  	return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG_ZC);
1266  }
1267  
1268  static int io_sg_from_iter_iovec(struct sock *sk, struct sk_buff *skb,
1269  				 struct iov_iter *from, size_t length)
1270  {
1271  	skb_zcopy_downgrade_managed(skb);
1272  	return __zerocopy_sg_from_iter(NULL, sk, skb, from, length);
1273  }
1274  
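/*
 * Attach pages from the bvec iterator directly as skb frags without taking
 * page references; the io_uring notification keeps the buffers alive. Fall
 * back to the generic zerocopy import if the skb's existing frags aren't
 * zerocopy managed.
 */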
1275  static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb,
1276  			   struct iov_iter *from, size_t length)
1277  {
1278  	struct skb_shared_info *shinfo = skb_shinfo(skb);
1279  	int frag = shinfo->nr_frags;
1280  	int ret = 0;
1281  	struct bvec_iter bi;
1282  	ssize_t copied = 0;
1283  	unsigned long truesize = 0;
1284  
1285  	if (!frag)
1286  		shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
1287  	else if (unlikely(!skb_zcopy_managed(skb)))
1288  		return __zerocopy_sg_from_iter(NULL, sk, skb, from, length);
1289  
1290  	bi.bi_size = min(from->count, length);
1291  	bi.bi_bvec_done = from->iov_offset;
1292  	bi.bi_idx = 0;
1293  
1294  	while (bi.bi_size && frag < MAX_SKB_FRAGS) {
1295  		struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);
1296  
1297  		copied += v.bv_len;
1298  		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
1299  		__skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
1300  					   v.bv_offset, v.bv_len);
1301  		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
1302  	}
1303  	if (bi.bi_size)
1304  		ret = -EMSGSIZE;
1305  
1306  	shinfo->nr_frags = frag;
1307  	from->bvec += bi.bi_idx;
1308  	from->nr_segs -= bi.bi_idx;
1309  	from->count -= copied;
1310  	from->iov_offset = bi.bi_bvec_done;
1311  
1312  	skb->data_len += copied;
1313  	skb->len += copied;
1314  	skb->truesize += truesize;
1315  
1316  	if (sk && sk->sk_type == SOCK_STREAM) {
1317  		sk_wmem_queued_add(sk, truesize);
1318  		if (!skb_zcopy_pure(skb))
1319  			sk_mem_charge(sk, truesize);
1320  	} else {
1321  		refcount_add(truesize, &skb->sk->sk_wmem_alloc);
1322  	}
1323  	return ret;
1324  }
1325  
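/*
 * Map the source data for a zerocopy send: either the registered (fixed)
 * buffer selected at prep time, or a plain user buffer, which also needs
 * its memory accounted against the notification.
 */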
1326  static int io_send_zc_import(struct io_kiocb *req, struct io_async_msghdr *kmsg)
1327  {
1328  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1329  	int ret;
1330  
1331  	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
1332  		ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, req->imu,
1333  					(u64)(uintptr_t)sr->buf, sr->len);
1334  		if (unlikely(ret))
1335  			return ret;
1336  		kmsg->msg.sg_from_iter = io_sg_from_iter;
1337  	} else {
1338  		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
1339  		if (unlikely(ret))
1340  			return ret;
1341  		ret = io_notif_account_mem(sr->notif, sr->len);
1342  		if (unlikely(ret))
1343  			return ret;
1344  		kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
1345  	}
1346  
1347  	return ret;
1348  }
1349  
1350  int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
1351  {
1352  	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1353  	struct io_async_msghdr *kmsg = req->async_data;
1354  	struct socket *sock;
1355  	unsigned msg_flags;
1356  	int ret, min_ret = 0;
1357  
1358  	sock = sock_from_file(req->file);
1359  	if (unlikely(!sock))
1360  		return -ENOTSOCK;
1361  	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
1362  		return -EOPNOTSUPP;
1363  
1364  	if (!(req->flags & REQ_F_POLLED) &&
1365  	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
1366  		return -EAGAIN;
1367  
1368  	if (!zc->done_io) {
1369  		ret = io_send_zc_import(req, kmsg);
1370  		if (unlikely(ret))
1371  			return ret;
1372  	}
1373  
1374  	msg_flags = zc->msg_flags;
1375  	if (issue_flags & IO_URING_F_NONBLOCK)
1376  		msg_flags |= MSG_DONTWAIT;
1377  	if (msg_flags & MSG_WAITALL)
1378  		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1379  	msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
1380  
1381  	kmsg->msg.msg_flags = msg_flags;
1382  	kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
1383  	ret = sock_sendmsg(sock, &kmsg->msg);
1384  
1385  	if (unlikely(ret < min_ret)) {
1386  		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1387  			return -EAGAIN;
1388  
1389  		if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) {
1390  			zc->len -= ret;
1391  			zc->buf += ret;
1392  			zc->done_io += ret;
1393  			req->flags |= REQ_F_BL_NO_RECYCLE;
1394  			return -EAGAIN;
1395  		}
1396  		if (ret == -ERESTARTSYS)
1397  			ret = -EINTR;
1398  		req_set_fail(req);
1399  	}
1400  
1401  	if (ret >= 0)
1402  		ret += zc->done_io;
1403  	else if (zc->done_io)
1404  		ret = zc->done_io;
1405  
1406  	/*
1407  	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
1408  	 * flushing notif to io_send_zc_cleanup()
1409  	 */
1410  	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1411  		io_notif_flush(zc->notif);
1412  		io_req_msg_cleanup(req, 0);
1413  	}
1414  	io_req_set_res(req, ret, IORING_CQE_F_MORE);
1415  	return IOU_OK;
1416  }
1417  
1418  int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
1419  {
1420  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1421  	struct io_async_msghdr *kmsg = req->async_data;
1422  	struct socket *sock;
1423  	unsigned flags;
1424  	int ret, min_ret = 0;
1425  
1426  	sock = sock_from_file(req->file);
1427  	if (unlikely(!sock))
1428  		return -ENOTSOCK;
1429  	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
1430  		return -EOPNOTSUPP;
1431  
1432  	if (!(req->flags & REQ_F_POLLED) &&
1433  	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1434  		return -EAGAIN;
1435  
1436  	flags = sr->msg_flags;
1437  	if (issue_flags & IO_URING_F_NONBLOCK)
1438  		flags |= MSG_DONTWAIT;
1439  	if (flags & MSG_WAITALL)
1440  		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1441  
1442  	kmsg->msg.msg_control_user = sr->msg_control;
1443  	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
1444  	kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
1445  	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
1446  
1447  	if (unlikely(ret < min_ret)) {
1448  		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1449  			return -EAGAIN;
1450  
1451  		if (ret > 0 && io_net_retry(sock, flags)) {
1452  			sr->done_io += ret;
1453  			req->flags |= REQ_F_BL_NO_RECYCLE;
1454  			return -EAGAIN;
1455  		}
1456  		if (ret == -ERESTARTSYS)
1457  			ret = -EINTR;
1458  		req_set_fail(req);
1459  	}
1460  
1461  	if (ret >= 0)
1462  		ret += sr->done_io;
1463  	else if (sr->done_io)
1464  		ret = sr->done_io;
1465  
1466  	/*
1467  	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
1468  	 * flushing notif to io_send_zc_cleanup()
1469  	 */
1470  	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1471  		io_notif_flush(sr->notif);
1472  		io_req_msg_cleanup(req, 0);
1473  	}
1474  	io_req_set_res(req, ret, IORING_CQE_F_MORE);
1475  	return IOU_OK;
1476  }
1477  
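/*
 * Failure path shared by the send/recv variants: report any partial
 * progress as the result, and for zerocopy sends keep IORING_CQE_F_MORE set
 * since the notification CQE will still be posted.
 */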
1478  void io_sendrecv_fail(struct io_kiocb *req)
1479  {
1480  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1481  
1482  	if (sr->done_io)
1483  		req->cqe.res = sr->done_io;
1484  
1485  	if ((req->flags & REQ_F_NEED_CLEANUP) &&
1486  	    (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC))
1487  		req->cqe.flags |= IORING_CQE_F_MORE;
1488  }
1489  
1490  #define ACCEPT_FLAGS	(IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \
1491  			 IORING_ACCEPT_POLL_FIRST)
1492  
1493  int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1494  {
1495  	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1496  
1497  	if (sqe->len || sqe->buf_index)
1498  		return -EINVAL;
1499  
1500  	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1501  	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
1502  	accept->flags = READ_ONCE(sqe->accept_flags);
1503  	accept->nofile = rlimit(RLIMIT_NOFILE);
1504  	accept->iou_flags = READ_ONCE(sqe->ioprio);
1505  	if (accept->iou_flags & ~ACCEPT_FLAGS)
1506  		return -EINVAL;
1507  
1508  	accept->file_slot = READ_ONCE(sqe->file_index);
1509  	if (accept->file_slot) {
1510  		if (accept->flags & SOCK_CLOEXEC)
1511  			return -EINVAL;
1512  		if (accept->iou_flags & IORING_ACCEPT_MULTISHOT &&
1513  		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
1514  			return -EINVAL;
1515  	}
1516  	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1517  		return -EINVAL;
1518  	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
1519  		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1520  	if (accept->iou_flags & IORING_ACCEPT_MULTISHOT)
1521  		req->flags |= REQ_F_APOLL_MULTISHOT;
1522  	if (accept->iou_flags & IORING_ACCEPT_DONTWAIT)
1523  		req->flags |= REQ_F_NOWAIT;
1524  	return 0;
1525  }
1526  
1527  int io_accept(struct io_kiocb *req, unsigned int issue_flags)
1528  {
1529  	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1530  	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1531  	bool fixed = !!accept->file_slot;
1532  	struct proto_accept_arg arg = {
1533  		.flags = force_nonblock ? O_NONBLOCK : 0,
1534  	};
1535  	struct file *file;
1536  	unsigned cflags;
1537  	int ret, fd;
1538  
1539  	if (!(req->flags & REQ_F_POLLED) &&
1540  	    accept->iou_flags & IORING_ACCEPT_POLL_FIRST)
1541  		return -EAGAIN;
1542  
1543  retry:
1544  	if (!fixed) {
1545  		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
1546  		if (unlikely(fd < 0))
1547  			return fd;
1548  	}
1549  	arg.err = 0;
1550  	arg.is_empty = -1;
1551  	file = do_accept(req->file, &arg, accept->addr, accept->addr_len,
1552  			 accept->flags);
1553  	if (IS_ERR(file)) {
1554  		if (!fixed)
1555  			put_unused_fd(fd);
1556  		ret = PTR_ERR(file);
1557  		if (ret == -EAGAIN && force_nonblock &&
1558  		    !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) {
1559  			/*
1560  			 * if it's multishot and polled, we don't need to
1561  			 * return EAGAIN to arm the poll infra since it
1562  			 * has already been done
1563  			 */
1564  			if (issue_flags & IO_URING_F_MULTISHOT)
1565  				return IOU_ISSUE_SKIP_COMPLETE;
1566  			return ret;
1567  		}
1568  		if (ret == -ERESTARTSYS)
1569  			ret = -EINTR;
1570  		req_set_fail(req);
1571  	} else if (!fixed) {
1572  		fd_install(fd, file);
1573  		ret = fd;
1574  	} else {
1575  		ret = io_fixed_fd_install(req, issue_flags, file,
1576  						accept->file_slot);
1577  	}
1578  
1579  	cflags = 0;
1580  	if (!arg.is_empty)
1581  		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
1582  
1583  	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
1584  		io_req_set_res(req, ret, cflags);
1585  		return IOU_OK;
1586  	}
1587  
1588  	if (ret < 0)
1589  		return ret;
1590  	if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
1591  		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1)
1592  			goto retry;
1593  		if (issue_flags & IO_URING_F_MULTISHOT)
1594  			return IOU_ISSUE_SKIP_COMPLETE;
1595  		return -EAGAIN;
1596  	}
1597  
1598  	io_req_set_res(req, ret, cflags);
1599  	return IOU_STOP_MULTISHOT;
1600  }
1601  
1602  int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1603  {
1604  	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1605  
1606  	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
1607  		return -EINVAL;
1608  
1609  	sock->domain = READ_ONCE(sqe->fd);
1610  	sock->type = READ_ONCE(sqe->off);
1611  	sock->protocol = READ_ONCE(sqe->len);
1612  	sock->file_slot = READ_ONCE(sqe->file_index);
1613  	sock->nofile = rlimit(RLIMIT_NOFILE);
1614  
1615  	sock->flags = sock->type & ~SOCK_TYPE_MASK;
1616  	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
1617  		return -EINVAL;
1618  	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1619  		return -EINVAL;
1620  	return 0;
1621  }
1622  
1623  int io_socket(struct io_kiocb *req, unsigned int issue_flags)
1624  {
1625  	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1626  	bool fixed = !!sock->file_slot;
1627  	struct file *file;
1628  	int ret, fd;
1629  
1630  	if (!fixed) {
1631  		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
1632  		if (unlikely(fd < 0))
1633  			return fd;
1634  	}
1635  	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
1636  	if (IS_ERR(file)) {
1637  		if (!fixed)
1638  			put_unused_fd(fd);
1639  		ret = PTR_ERR(file);
1640  		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1641  			return -EAGAIN;
1642  		if (ret == -ERESTARTSYS)
1643  			ret = -EINTR;
1644  		req_set_fail(req);
1645  	} else if (!fixed) {
1646  		fd_install(fd, file);
1647  		ret = fd;
1648  	} else {
1649  		ret = io_fixed_fd_install(req, issue_flags, file,
1650  					    sock->file_slot);
1651  	}
1652  	io_req_set_res(req, ret, 0);
1653  	return IOU_OK;
1654  }
1655  
1656  int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1657  {
1658  	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
1659  	struct io_async_msghdr *io;
1660  
1661  	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
1662  		return -EINVAL;
1663  
1664  	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1665  	conn->addr_len =  READ_ONCE(sqe->addr2);
1666  	conn->in_progress = conn->seen_econnaborted = false;
1667  
1668  	io = io_msg_alloc_async(req);
1669  	if (unlikely(!io))
1670  		return -ENOMEM;
1671  
1672  	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr);
1673  }
1674  
1675  int io_connect(struct io_kiocb *req, unsigned int issue_flags)
1676  {
1677  	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
1678  	struct io_async_msghdr *io = req->async_data;
1679  	unsigned file_flags;
1680  	int ret;
1681  	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1682  
1683  	file_flags = force_nonblock ? O_NONBLOCK : 0;
1684  
1685  	ret = __sys_connect_file(req->file, &io->addr, connect->addr_len,
1686  				 file_flags);
1687  	if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
1688  	    && force_nonblock) {
1689  		if (ret == -EINPROGRESS) {
1690  			connect->in_progress = true;
1691  		} else if (ret == -ECONNABORTED) {
1692  			if (connect->seen_econnaborted)
1693  				goto out;
1694  			connect->seen_econnaborted = true;
1695  		}
1696  		return -EAGAIN;
1697  	}
1698  	if (connect->in_progress) {
1699  		/*
1700  		 * At least bluetooth will return -EBADFD on a re-connect
1701  		 * attempt, and it's (supposedly) also valid to get -EISCONN
1702  		 * which means the previous result is good. For both of these,
1703  		 * grab the sock_error() and use that for the completion.
1704  		 */
1705  		if (ret == -EBADFD || ret == -EISCONN)
1706  			ret = sock_error(sock_from_file(req->file)->sk);
1707  	}
1708  	if (ret == -ERESTARTSYS)
1709  		ret = -EINTR;
1710  out:
1711  	if (ret < 0)
1712  		req_set_fail(req);
1713  	io_req_msg_cleanup(req, issue_flags);
1714  	io_req_set_res(req, ret, 0);
1715  	return IOU_OK;
1716  }
1717  
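/*
 * Free one cached async msghdr when the netmsg cache is torn down,
 * unpoisoning the iovec for KASAN before releasing it.
 */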
1718  void io_netmsg_cache_free(const void *entry)
1719  {
1720  	struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
1721  
1722  	if (kmsg->free_iov) {
1723  		kasan_mempool_unpoison_object(kmsg->free_iov,
1724  				kmsg->free_iov_nr * sizeof(struct iovec));
1725  		io_netmsg_iovec_free(kmsg);
1726  	}
1727  	kfree(kmsg);
1728  }
1729  #endif
1730