xref: /linux/io_uring/net.c (revision 5832d26433f2bd0d28f8b12526e3c2fdb203507f)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/kernel.h>
3 #include <linux/errno.h>
4 #include <linux/file.h>
5 #include <linux/slab.h>
6 #include <linux/net.h>
7 #include <linux/compat.h>
8 #include <net/compat.h>
9 #include <linux/io_uring.h>
10 
11 #include <uapi/linux/io_uring.h>
12 
13 #include "filetable.h"
14 #include "io_uring.h"
15 #include "kbuf.h"
16 #include "alloc_cache.h"
17 #include "net.h"
18 #include "notif.h"
19 #include "rsrc.h"
20 #include "zcrx.h"
21 
22 struct io_shutdown {
23 	struct file			*file;
24 	int				how;
25 };
26 
27 struct io_accept {
28 	struct file			*file;
29 	struct sockaddr __user		*addr;
30 	int __user			*addr_len;
31 	int				flags;
32 	int				iou_flags;
33 	u32				file_slot;
34 	unsigned long			nofile;
35 };
36 
37 struct io_socket {
38 	struct file			*file;
39 	int				domain;
40 	int				type;
41 	int				protocol;
42 	int				flags;
43 	u32				file_slot;
44 	unsigned long			nofile;
45 };
46 
47 struct io_connect {
48 	struct file			*file;
49 	struct sockaddr __user		*addr;
50 	int				addr_len;
51 	bool				in_progress;
52 	bool				seen_econnaborted;
53 };
54 
55 struct io_bind {
56 	struct file			*file;
57 	int				addr_len;
58 };
59 
60 struct io_listen {
61 	struct file			*file;
62 	int				backlog;
63 };
64 
65 struct io_sr_msg {
66 	struct file			*file;
67 	union {
68 		struct compat_msghdr __user	*umsg_compat;
69 		struct user_msghdr __user	*umsg;
70 		void __user			*buf;
71 	};
72 	int				len;
73 	unsigned			done_io;
74 	unsigned			msg_flags;
75 	unsigned			nr_multishot_loops;
76 	u16				flags;
77 	/* initialised and used only by !msg send variants */
78 	u16				buf_group;
79 	/* per-invocation mshot limit */
80 	unsigned			mshot_len;
81 	/* overall mshot byte limit */
82 	unsigned			mshot_total_len;
83 	void __user			*msg_control;
84 	/* used only for send zerocopy */
85 	struct io_kiocb 		*notif;
86 };
87 
88 /*
89  * The UAPI flags are the lower 8 bits, as that's all sqe->ioprio will hold
90  * anyway. Use the upper 8 bits for internal uses.
91  */
92 enum sr_retry_flags {
93 	IORING_RECV_RETRY	= (1U << 15),
94 	IORING_RECV_PARTIAL_MAP	= (1U << 14),
95 	IORING_RECV_MSHOT_CAP	= (1U << 13),
96 	IORING_RECV_MSHOT_LIM	= (1U << 12),
97 	IORING_RECV_MSHOT_DONE	= (1U << 11),
98 
99 	IORING_RECV_RETRY_CLEAR	= IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP,
100 	IORING_RECV_NO_RETRY	= IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP |
101 				  IORING_RECV_MSHOT_CAP | IORING_RECV_MSHOT_DONE,
102 };
103 
104 /*
105  * Number of times we'll try to do receives if there's more data. If we
106  * exceed this limit, then add us to the back of the queue and retry from
107  * there. This helps ensure fairness when some clients flood the socket with data.
108  */
109 #define MULTISHOT_MAX_RETRY	32
110 
111 struct io_recvzc {
112 	struct file			*file;
113 	unsigned			msg_flags;
114 	u16				flags;
115 	u32				len;
116 	struct io_zcrx_ifq		*ifq;
117 };
118 
119 static int io_sg_from_iter_iovec(struct sk_buff *skb,
120 				 struct iov_iter *from, size_t length);
121 static int io_sg_from_iter(struct sk_buff *skb,
122 			   struct iov_iter *from, size_t length);
123 
124 int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
125 {
126 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
127 
128 	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
129 		     sqe->buf_index || sqe->splice_fd_in))
130 		return -EINVAL;
131 
132 	shutdown->how = READ_ONCE(sqe->len);
133 	req->flags |= REQ_F_FORCE_ASYNC;
134 	return 0;
135 }
136 
137 int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
138 {
139 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
140 	struct socket *sock;
141 	int ret;
142 
143 	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
144 
145 	sock = sock_from_file(req->file);
146 	if (unlikely(!sock))
147 		return -ENOTSOCK;
148 
149 	ret = __sys_shutdown_sock(sock, shutdown->how);
150 	io_req_set_res(req, ret, 0);
151 	return IOU_COMPLETE;
152 }
153 
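/*
 * Decide whether a short transfer should be retried to satisfy MSG_WAITALL.
 * Only stream and seqpacket sockets keep their place across calls, so only
 * those are worth retrying for the remainder.
 */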
154 static bool io_net_retry(struct socket *sock, int flags)
155 {
156 	if (!(flags & MSG_WAITALL))
157 		return false;
158 	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
159 }
160 
161 static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
162 {
163 	if (kmsg->vec.iovec)
164 		io_vec_free(&kmsg->vec);
165 }
166 
167 static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
168 {
169 	struct io_async_msghdr *hdr = req->async_data;
170 
171 	/* can't recycle, ensure we free the iovec if we have one */
172 	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
173 		io_netmsg_iovec_free(hdr);
174 		return;
175 	}
176 
177 	/* Let normal cleanup path reap it if we fail adding to the cache */
178 	io_alloc_cache_vec_kasan(&hdr->vec);
179 	if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP)
180 		io_vec_free(&hdr->vec);
181 
182 	if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr))
183 		io_req_async_data_clear(req, REQ_F_NEED_CLEANUP);
184 }
185 
186 static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
187 {
188 	struct io_ring_ctx *ctx = req->ctx;
189 	struct io_async_msghdr *hdr;
190 
191 	hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req);
192 	if (!hdr)
193 		return NULL;
194 
195 	/* If the async data was cached, we might have an iov cached inside. */
196 	if (hdr->vec.iovec)
197 		req->flags |= REQ_F_NEED_CLEANUP;
198 	return hdr;
199 }
200 
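/*
 * Reset per-iteration state so a multishot or bundle request can be issued
 * again: clear the buffer-list-empty flag, forget partial progress, drop the
 * internal retry flags, and restore the original per-shot length.
 */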
201 static inline void io_mshot_prep_retry(struct io_kiocb *req,
202 				       struct io_async_msghdr *kmsg)
203 {
204 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
205 
206 	req->flags &= ~REQ_F_BL_EMPTY;
207 	sr->done_io = 0;
208 	sr->flags &= ~IORING_RECV_RETRY_CLEAR;
209 	sr->len = sr->mshot_len;
210 }
211 
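/*
 * Import a user iovec array into the request's msg_iter, reusing a
 * previously cached kernel iovec if the async data holds one, otherwise
 * starting from the single inline fast_iov.
 */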
212 static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg,
213 			     const struct iovec __user *uiov, unsigned uvec_seg,
214 			     int ddir)
215 {
216 	struct iovec *iov;
217 	int ret, nr_segs;
218 
219 	if (iomsg->vec.iovec) {
220 		nr_segs = iomsg->vec.nr;
221 		iov = iomsg->vec.iovec;
222 	} else {
223 		nr_segs = 1;
224 		iov = &iomsg->fast_iov;
225 	}
226 
227 	ret = __import_iovec(ddir, uiov, uvec_seg, nr_segs, &iov,
228 			     &iomsg->msg.msg_iter, io_is_compat(req->ctx));
229 	if (unlikely(ret < 0))
230 		return ret;
231 
232 	if (iov) {
233 		req->flags |= REQ_F_NEED_CLEANUP;
234 		io_vec_reset_iovec(&iomsg->vec, iov, iomsg->msg.msg_iter.nr_segs);
235 	}
236 	return 0;
237 }
238 
239 static int io_compat_msg_copy_hdr(struct io_kiocb *req,
240 				  struct io_async_msghdr *iomsg,
241 				  struct compat_msghdr *msg, int ddir,
242 				  struct sockaddr __user **save_addr)
243 {
244 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
245 	struct compat_iovec __user *uiov;
246 	int ret;
247 
248 	if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
249 		return -EFAULT;
250 
251 	ret = __get_compat_msghdr(&iomsg->msg, msg, save_addr);
252 	if (ret)
253 		return ret;
254 
255 	uiov = compat_ptr(msg->msg_iov);
256 	if (req->flags & REQ_F_BUFFER_SELECT) {
257 		if (msg->msg_iovlen == 0) {
258 			sr->len = 0;
259 		} else if (msg->msg_iovlen > 1) {
260 			return -EINVAL;
261 		} else {
262 			struct compat_iovec tmp_iov;
263 
264 			if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov)))
265 				return -EFAULT;
266 			sr->len = tmp_iov.iov_len;
267 		}
268 	}
269 	return 0;
270 }
271 
272 static int io_copy_msghdr_from_user(struct user_msghdr *msg,
273 				    struct user_msghdr __user *umsg)
274 {
275 	if (!user_access_begin(umsg, sizeof(*umsg)))
276 		return -EFAULT;
277 	unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end);
278 	unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end);
279 	unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end);
280 	unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end);
281 	unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end);
282 	unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end);
283 	user_access_end();
284 	return 0;
285 ua_end:
286 	user_access_end();
287 	return -EFAULT;
288 }
289 
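/*
 * Copy in the user msghdr, handling the compat layout, and prime the kernel
 * msghdr. With provided buffers, at most one iovec may be supplied and its
 * length is captured in sr->len.
 */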
290 static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
291 			   struct user_msghdr *msg, int ddir,
292 			   struct sockaddr __user **save_addr)
293 {
294 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
295 	struct user_msghdr __user *umsg = sr->umsg;
296 	int ret;
297 
298 	iomsg->msg.msg_name = &iomsg->addr;
299 	iomsg->msg.msg_iter.nr_segs = 0;
300 
301 	if (io_is_compat(req->ctx)) {
302 		struct compat_msghdr cmsg;
303 
304 		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ddir, save_addr);
305 		if (ret)
306 			return ret;
307 
308 		memset(msg, 0, sizeof(*msg));
309 		msg->msg_namelen = cmsg.msg_namelen;
310 		msg->msg_controllen = cmsg.msg_controllen;
311 		msg->msg_iov = compat_ptr(cmsg.msg_iov);
312 		msg->msg_iovlen = cmsg.msg_iovlen;
313 		return 0;
314 	}
315 
316 	ret = io_copy_msghdr_from_user(msg, umsg);
317 	if (unlikely(ret))
318 		return ret;
319 
320 	msg->msg_flags = 0;
321 
322 	ret = __copy_msghdr(&iomsg->msg, msg, save_addr);
323 	if (ret)
324 		return ret;
325 
326 	if (req->flags & REQ_F_BUFFER_SELECT) {
327 		if (msg->msg_iovlen == 0) {
328 			sr->len = 0;
329 		} else if (msg->msg_iovlen > 1) {
330 			return -EINVAL;
331 		} else {
332 			struct iovec __user *uiov = msg->msg_iov;
333 			struct iovec tmp_iov;
334 
335 			if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov)))
336 				return -EFAULT;
337 			sr->len = tmp_iov.iov_len;
338 		}
339 	}
340 	return 0;
341 }
342 
343 void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
344 {
345 	struct io_async_msghdr *io = req->async_data;
346 
347 	io_netmsg_iovec_free(io);
348 }
349 
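/*
 * Prep for the non-msghdr send variants: stash the user buffer, copy in an
 * optional destination address from sqe->addr2, and map the data now unless
 * a registered or provided buffer will be imported at issue time.
 */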
350 static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
351 {
352 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
353 	struct io_async_msghdr *kmsg = req->async_data;
354 	void __user *addr;
355 	u16 addr_len;
356 	int ret;
357 
358 	sr->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
359 
360 	if (READ_ONCE(sqe->__pad3[0]))
361 		return -EINVAL;
362 
363 	kmsg->msg.msg_name = NULL;
364 	kmsg->msg.msg_namelen = 0;
365 	kmsg->msg.msg_control = NULL;
366 	kmsg->msg.msg_controllen = 0;
367 	kmsg->msg.msg_ubuf = NULL;
368 
369 	addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
370 	addr_len = READ_ONCE(sqe->addr_len);
371 	if (addr) {
372 		ret = move_addr_to_kernel(addr, addr_len, &kmsg->addr);
373 		if (unlikely(ret < 0))
374 			return ret;
375 		kmsg->msg.msg_name = &kmsg->addr;
376 		kmsg->msg.msg_namelen = addr_len;
377 	}
378 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
379 		req->flags |= REQ_F_IMPORT_BUFFER;
380 		return 0;
381 	}
382 	if (req->flags & REQ_F_BUFFER_SELECT)
383 		return 0;
384 
385 	if (sr->flags & IORING_SEND_VECTORIZED)
386 		return io_net_import_vec(req, kmsg, sr->buf, sr->len, ITER_SOURCE);
387 
388 	return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
389 }
390 
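/*
 * Prep for the msghdr-based send variants: copy in the user msghdr, save
 * msg_control for retries, and import the iovec either as a registered
 * vector or as a plain user iovec.
 */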
391 static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
392 {
393 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
394 	struct io_async_msghdr *kmsg = req->async_data;
395 	struct user_msghdr msg;
396 	int ret;
397 
398 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
399 	ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL);
400 	if (unlikely(ret))
401 		return ret;
402 	/* save msg_control as sys_sendmsg() overwrites it */
403 	sr->msg_control = kmsg->msg.msg_control_user;
404 
405 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
406 		kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen;
407 		return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov,
408 					 msg.msg_iovlen);
409 	}
410 	if (req->flags & REQ_F_BUFFER_SELECT)
411 		return 0;
412 	return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE);
413 }
414 
415 #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE | IORING_SEND_VECTORIZED)
416 
417 int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
418 {
419 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
420 
421 	sr->done_io = 0;
422 	sr->len = READ_ONCE(sqe->len);
423 	sr->flags = READ_ONCE(sqe->ioprio);
424 	if (sr->flags & ~SENDMSG_FLAGS)
425 		return -EINVAL;
426 	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
427 	if (sr->msg_flags & MSG_DONTWAIT)
428 		req->flags |= REQ_F_NOWAIT;
429 	if (req->flags & REQ_F_BUFFER_SELECT)
430 		sr->buf_group = req->buf_index;
431 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
432 		if (req->opcode == IORING_OP_SENDMSG)
433 			return -EINVAL;
434 		sr->msg_flags |= MSG_WAITALL;
435 		req->flags |= REQ_F_MULTISHOT;
436 	}
437 
438 	if (io_is_compat(req->ctx))
439 		sr->msg_flags |= MSG_CMSG_COMPAT;
440 
441 	if (unlikely(!io_msg_alloc_async(req)))
442 		return -ENOMEM;
443 	if (req->opcode != IORING_OP_SENDMSG)
444 		return io_send_setup(req, sqe);
445 	if (unlikely(sqe->addr2 || sqe->file_index))
446 		return -EINVAL;
447 	return io_sendmsg_setup(req, sqe);
448 }
449 
450 static void io_req_msg_cleanup(struct io_kiocb *req,
451 			       unsigned int issue_flags)
452 {
453 	io_netmsg_recycle(req, issue_flags);
454 }
455 
456 /*
457  * For bundle completions, we need to figure out how many segments we consumed.
458  * A bundle could be using a single ITER_UBUF if that's all we mapped, or it
459  * could be using an ITER_IOVEC. If the latter and we consumed all of
460  * the segments, then it's a trivial question to answer. If we have residual
461  * data in the iter, then loop the segments to figure out how much we
462  * transferred.
463  */
464 static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
465 {
466 	struct iovec *iov;
467 	int nbufs;
468 
469 	/* no data is always zero segments, and a ubuf is always 1 segment */
470 	if (ret <= 0)
471 		return 0;
472 	if (iter_is_ubuf(&kmsg->msg.msg_iter))
473 		return 1;
474 
475 	iov = kmsg->vec.iovec;
476 	if (!iov)
477 		iov = &kmsg->fast_iov;
478 
479 	/* if all data was transferred, it's basic pointer math */
480 	if (!iov_iter_count(&kmsg->msg.msg_iter))
481 		return iter_iov(&kmsg->msg.msg_iter) - iov;
482 
483 	/* short transfer, count segments */
484 	nbufs = 0;
485 	do {
486 		int this_len = min_t(int, iov[nbufs].iov_len, ret);
487 
488 		nbufs++;
489 		ret -= this_len;
490 	} while (ret);
491 
492 	return nbufs;
493 }
494 
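/*
 * A transfer using provided buffers made partial progress: commit what was
 * consumed, prevent the buffer list from being recycled, and ask the core
 * to retry the remainder.
 */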
495 static int io_net_kbuf_recyle(struct io_kiocb *req, struct io_buffer_list *bl,
496 			      struct io_async_msghdr *kmsg, int len)
497 {
498 	req->flags |= REQ_F_BL_NO_RECYCLE;
499 	if (req->flags & REQ_F_BUFFERS_COMMIT)
500 		io_kbuf_commit(req, bl, len, io_bundle_nbufs(kmsg, len));
501 	return IOU_RETRY;
502 }
503 
504 static inline bool io_send_finish(struct io_kiocb *req,
505 				  struct io_async_msghdr *kmsg,
506 				  struct io_br_sel *sel)
507 {
508 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
509 	bool bundle_finished = sel->val <= 0;
510 	unsigned int cflags;
511 
512 	if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
513 		cflags = io_put_kbuf(req, sel->val, sel->buf_list);
514 		goto finish;
515 	}
516 
517 	cflags = io_put_kbufs(req, sel->val, sel->buf_list, io_bundle_nbufs(kmsg, sel->val));
518 
519 	if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
520 		goto finish;
521 
522 	/*
523 	 * Fill CQE for this send and see if we should keep trying to
524 	 * send on this socket.
525 	 */
526 	if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) {
527 		io_mshot_prep_retry(req, kmsg);
528 		return false;
529 	}
530 
531 	/* Otherwise stop bundle and use the current result. */
532 finish:
533 	io_req_set_res(req, sel->val, cflags);
534 	sel->val = IOU_COMPLETE;
535 	return true;
536 }
537 
538 int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
539 {
540 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
541 	struct io_async_msghdr *kmsg = req->async_data;
542 	struct socket *sock;
543 	unsigned flags;
544 	int min_ret = 0;
545 	int ret;
546 
547 	sock = sock_from_file(req->file);
548 	if (unlikely(!sock))
549 		return -ENOTSOCK;
550 
551 	if (!(req->flags & REQ_F_POLLED) &&
552 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
553 		return -EAGAIN;
554 
555 	flags = sr->msg_flags;
556 	if (issue_flags & IO_URING_F_NONBLOCK)
557 		flags |= MSG_DONTWAIT;
558 	if (flags & MSG_WAITALL)
559 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
560 
561 	kmsg->msg.msg_control_user = sr->msg_control;
562 
563 	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
564 
565 	if (ret < min_ret) {
566 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
567 			return -EAGAIN;
568 		if (ret > 0 && io_net_retry(sock, flags)) {
569 			kmsg->msg.msg_controllen = 0;
570 			kmsg->msg.msg_control = NULL;
571 			sr->done_io += ret;
572 			return -EAGAIN;
573 		}
574 		if (ret == -ERESTARTSYS)
575 			ret = -EINTR;
576 		req_set_fail(req);
577 	}
578 	io_req_msg_cleanup(req, issue_flags);
579 	if (ret >= 0)
580 		ret += sr->done_io;
581 	else if (sr->done_io)
582 		ret = sr->done_io;
583 	io_req_set_res(req, ret, 0);
584 	return IOU_COMPLETE;
585 }
586 
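/*
 * Pick one or more provided buffers for a send and map them into the
 * message iterator; a single buffer takes the cheaper ITER_UBUF setup,
 * while bundles may expand into multiple iovecs.
 */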
587 static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
588 				 struct io_br_sel *sel, struct io_async_msghdr *kmsg)
589 {
590 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
591 	struct buf_sel_arg arg = {
592 		.iovs = &kmsg->fast_iov,
593 		.max_len = min_not_zero(sr->len, INT_MAX),
594 		.nr_iovs = 1,
595 		.buf_group = sr->buf_group,
596 	};
597 	int ret;
598 
599 	if (kmsg->vec.iovec) {
600 		arg.nr_iovs = kmsg->vec.nr;
601 		arg.iovs = kmsg->vec.iovec;
602 		arg.mode = KBUF_MODE_FREE;
603 	}
604 
605 	if (!(sr->flags & IORING_RECVSEND_BUNDLE))
606 		arg.nr_iovs = 1;
607 	else
608 		arg.mode |= KBUF_MODE_EXPAND;
609 
610 	ret = io_buffers_select(req, &arg, sel, issue_flags);
611 	if (unlikely(ret < 0))
612 		return ret;
613 
614 	if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
615 		kmsg->vec.nr = ret;
616 		kmsg->vec.iovec = arg.iovs;
617 		req->flags |= REQ_F_NEED_CLEANUP;
618 	}
619 	sr->len = arg.out_len;
620 
621 	if (ret == 1) {
622 		sr->buf = arg.iovs[0].iov_base;
623 		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
624 					&kmsg->msg.msg_iter);
625 		if (unlikely(ret))
626 			return ret;
627 	} else {
628 		iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE,
629 				arg.iovs, ret, arg.out_len);
630 	}
631 
632 	return 0;
633 }
634 
635 int io_send(struct io_kiocb *req, unsigned int issue_flags)
636 {
637 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
638 	struct io_async_msghdr *kmsg = req->async_data;
639 	struct io_br_sel sel = { };
640 	struct socket *sock;
641 	unsigned flags;
642 	int min_ret = 0;
643 	int ret;
644 
645 	sock = sock_from_file(req->file);
646 	if (unlikely(!sock))
647 		return -ENOTSOCK;
648 
649 	if (!(req->flags & REQ_F_POLLED) &&
650 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
651 		return -EAGAIN;
652 
653 	flags = sr->msg_flags;
654 	if (issue_flags & IO_URING_F_NONBLOCK)
655 		flags |= MSG_DONTWAIT;
656 
657 retry_bundle:
658 	sel.buf_list = NULL;
659 	if (io_do_buffer_select(req)) {
660 		ret = io_send_select_buffer(req, issue_flags, &sel, kmsg);
661 		if (ret)
662 			return ret;
663 	}
664 
665 	/*
666 	 * If MSG_WAITALL is set, or this is a bundle send, then we need
667 	 * the full amount. If only the bundle flag is set and we do a short
668 	 * send, then we complete the bundle sequence rather than continue on.
669 	 */
670 	if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE)
671 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
672 
673 	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
674 	kmsg->msg.msg_flags = flags;
675 	ret = sock_sendmsg(sock, &kmsg->msg);
676 	if (ret < min_ret) {
677 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
678 			return -EAGAIN;
679 
680 		if (ret > 0 && io_net_retry(sock, flags)) {
681 			sr->len -= ret;
682 			sr->buf += ret;
683 			sr->done_io += ret;
684 			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
685 		}
686 		if (ret == -ERESTARTSYS)
687 			ret = -EINTR;
688 		req_set_fail(req);
689 	}
690 	if (ret >= 0)
691 		ret += sr->done_io;
692 	else if (sr->done_io)
693 		ret = sr->done_io;
694 
695 	sel.val = ret;
696 	if (!io_send_finish(req, kmsg, &sel))
697 		goto retry_bundle;
698 
699 	io_req_msg_cleanup(req, issue_flags);
700 	return sel.val;
701 }
702 
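/*
 * For multishot recvmsg with provided buffers, record how much of each
 * buffer is reserved for the io_uring_recvmsg_out header, name and control
 * data, guarding the size additions against overflow.
 */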
703 static int io_recvmsg_mshot_prep(struct io_kiocb *req,
704 				 struct io_async_msghdr *iomsg,
705 				 int namelen, size_t controllen)
706 {
707 	if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) ==
708 			  (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) {
709 		int hdr;
710 
711 		if (unlikely(namelen < 0))
712 			return -EOVERFLOW;
713 		if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
714 					namelen, &hdr))
715 			return -EOVERFLOW;
716 		if (check_add_overflow(hdr, controllen, &hdr))
717 			return -EOVERFLOW;
718 
719 		iomsg->namelen = namelen;
720 		iomsg->controllen = controllen;
721 		return 0;
722 	}
723 
724 	return 0;
725 }
726 
727 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
728 			       struct io_async_msghdr *iomsg)
729 {
730 	struct user_msghdr msg;
731 	int ret;
732 
733 	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
734 	if (unlikely(ret))
735 		return ret;
736 
737 	if (!(req->flags & REQ_F_BUFFER_SELECT)) {
738 		ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
739 					ITER_DEST);
740 		if (unlikely(ret))
741 			return ret;
742 	}
743 	return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
744 					msg.msg_controllen);
745 }
746 
747 static int io_recvmsg_prep_setup(struct io_kiocb *req)
748 {
749 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
750 	struct io_async_msghdr *kmsg;
751 
752 	kmsg = io_msg_alloc_async(req);
753 	if (unlikely(!kmsg))
754 		return -ENOMEM;
755 
756 	if (req->opcode == IORING_OP_RECV) {
757 		kmsg->msg.msg_name = NULL;
758 		kmsg->msg.msg_namelen = 0;
759 		kmsg->msg.msg_inq = 0;
760 		kmsg->msg.msg_control = NULL;
761 		kmsg->msg.msg_get_inq = 1;
762 		kmsg->msg.msg_controllen = 0;
763 		kmsg->msg.msg_iocb = NULL;
764 		kmsg->msg.msg_ubuf = NULL;
765 
766 		if (req->flags & REQ_F_BUFFER_SELECT)
767 			return 0;
768 		return import_ubuf(ITER_DEST, sr->buf, sr->len,
769 				   &kmsg->msg.msg_iter);
770 	}
771 
772 	return io_recvmsg_copy_hdr(req, kmsg);
773 }
774 
775 #define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \
776 			IORING_RECVSEND_BUNDLE)
777 
778 int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
779 {
780 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
781 
782 	sr->done_io = 0;
783 
784 	if (unlikely(sqe->addr2))
785 		return -EINVAL;
786 
787 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
788 	sr->len = READ_ONCE(sqe->len);
789 	sr->flags = READ_ONCE(sqe->ioprio);
790 	if (sr->flags & ~RECVMSG_FLAGS)
791 		return -EINVAL;
792 	sr->msg_flags = READ_ONCE(sqe->msg_flags);
793 	if (sr->msg_flags & MSG_DONTWAIT)
794 		req->flags |= REQ_F_NOWAIT;
795 	if (sr->msg_flags & MSG_ERRQUEUE)
796 		req->flags |= REQ_F_CLEAR_POLLIN;
797 	if (req->flags & REQ_F_BUFFER_SELECT)
798 		sr->buf_group = req->buf_index;
799 	sr->mshot_total_len = sr->mshot_len = 0;
800 	if (sr->flags & IORING_RECV_MULTISHOT) {
801 		if (!(req->flags & REQ_F_BUFFER_SELECT))
802 			return -EINVAL;
803 		if (sr->msg_flags & MSG_WAITALL)
804 			return -EINVAL;
805 		if (req->opcode == IORING_OP_RECV) {
806 			sr->mshot_len = sr->len;
807 			sr->mshot_total_len = READ_ONCE(sqe->optlen);
808 			if (sr->mshot_total_len)
809 				sr->flags |= IORING_RECV_MSHOT_LIM;
810 		} else if (sqe->optlen) {
811 			return -EINVAL;
812 		}
813 		req->flags |= REQ_F_APOLL_MULTISHOT;
814 	} else if (sqe->optlen) {
815 		return -EINVAL;
816 	}
817 
818 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
819 		if (req->opcode == IORING_OP_RECVMSG)
820 			return -EINVAL;
821 	}
822 
823 	if (io_is_compat(req->ctx))
824 		sr->msg_flags |= MSG_CMSG_COMPAT;
825 
826 	sr->nr_multishot_loops = 0;
827 	return io_recvmsg_prep_setup(req);
828 }
829 
830 /* bits to clear in old and inherit in new cflags on bundle retry */
831 #define CQE_F_MASK	(IORING_CQE_F_SOCK_NONEMPTY|IORING_CQE_F_MORE)
832 
833 /*
834  * Finishes io_recv and io_recvmsg.
835  *
836  * Returns true if it is actually finished, or false if it should run
837  * again (for multishot).
838  */
839 static inline bool io_recv_finish(struct io_kiocb *req,
840 				  struct io_async_msghdr *kmsg,
841 				  struct io_br_sel *sel, bool mshot_finished,
842 				  unsigned issue_flags)
843 {
844 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
845 	unsigned int cflags = 0;
846 
847 	if (kmsg->msg.msg_inq > 0)
848 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
849 
850 	if (sel->val > 0 && sr->flags & IORING_RECV_MSHOT_LIM) {
851 		/*
852 		 * If sr->mshot_total_len hits zero, the limit has been reached. Mark
853 		 * mshot as finished, and flag MSHOT_DONE as well to prevent
854 		 * a potential bundle from being retried.
855 		 */
856 		sr->mshot_total_len -= min_t(int, sel->val, sr->mshot_total_len);
857 		if (!sr->mshot_total_len) {
858 			sr->flags |= IORING_RECV_MSHOT_DONE;
859 			mshot_finished = true;
860 		}
861 	}
862 
863 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
864 		size_t this_ret = sel->val - sr->done_io;
865 
866 		cflags |= io_put_kbufs(req, this_ret, sel->buf_list, io_bundle_nbufs(kmsg, this_ret));
867 		if (sr->flags & IORING_RECV_RETRY)
868 			cflags = req->cqe.flags | (cflags & CQE_F_MASK);
869 		if (sr->mshot_len && sel->val >= sr->mshot_len)
870 			sr->flags |= IORING_RECV_MSHOT_CAP;
871 		/* bundle with no more immediate buffers, we're done */
872 		if (req->flags & REQ_F_BL_EMPTY)
873 			goto finish;
874 		/*
875 		 * If more is available AND it was a full transfer, retry and
876 		 * append to this one
877 		 */
878 		if (!(sr->flags & IORING_RECV_NO_RETRY) &&
879 		    kmsg->msg.msg_inq > 1 && this_ret > 0 &&
880 		    !iov_iter_count(&kmsg->msg.msg_iter)) {
881 			req->cqe.flags = cflags & ~CQE_F_MASK;
882 			sr->len = kmsg->msg.msg_inq;
883 			sr->done_io += this_ret;
884 			sr->flags |= IORING_RECV_RETRY;
885 			return false;
886 		}
887 	} else {
888 		cflags |= io_put_kbuf(req, sel->val, sel->buf_list);
889 	}
890 
891 	/*
892 	 * Fill CQE for this receive and see if we should keep trying to
893 	 * receive from this socket.
894 	 */
895 	if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
896 	    io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) {
897 		sel->val = IOU_RETRY;
898 		io_mshot_prep_retry(req, kmsg);
899 		/* Known not-empty or unknown state, retry */
900 		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
901 			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY &&
902 			    !(sr->flags & IORING_RECV_MSHOT_CAP)) {
903 				return false;
904 			}
905 			/* mshot retries exceeded, force a requeue */
906 			sr->nr_multishot_loops = 0;
907 			sr->flags &= ~IORING_RECV_MSHOT_CAP;
908 			if (issue_flags & IO_URING_F_MULTISHOT)
909 				sel->val = IOU_REQUEUE;
910 		}
911 		return true;
912 	}
913 
914 	/* Finish the request / stop multishot. */
915 finish:
916 	io_req_set_res(req, sel->val, cflags);
917 	sel->val = IOU_COMPLETE;
918 	io_req_msg_cleanup(req, issue_flags);
919 	return true;
920 }
921 
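/*
 * Carve the recvmsg_out header, name and control space out of the front of
 * the selected buffer: point the data portion past it and stash the
 * original buffer address for the final header copy.
 */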
922 static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
923 				     struct io_sr_msg *sr, void __user **buf,
924 				     size_t *len)
925 {
926 	unsigned long ubuf = (unsigned long) *buf;
927 	unsigned long hdr;
928 
929 	hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
930 		kmsg->controllen;
931 	if (*len < hdr)
932 		return -EFAULT;
933 
934 	if (kmsg->controllen) {
935 		unsigned long control = ubuf + hdr - kmsg->controllen;
936 
937 		kmsg->msg.msg_control_user = (void __user *) control;
938 		kmsg->msg.msg_controllen = kmsg->controllen;
939 	}
940 
941 	sr->buf = *buf; /* stash for later copy */
942 	*buf = (void __user *) (ubuf + hdr);
943 	kmsg->payloadlen = *len = *len - hdr;
944 	return 0;
945 }
946 
947 struct io_recvmsg_multishot_hdr {
948 	struct io_uring_recvmsg_out msg;
949 	struct sockaddr_storage addr;
950 };
951 
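/*
 * Multishot recvmsg: receive into the payload area, then copy an
 * io_uring_recvmsg_out header (plus any source address) to the start of the
 * selected buffer, mirroring what recvmsg() would report via the msghdr.
 */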
952 static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
953 				struct io_async_msghdr *kmsg,
954 				unsigned int flags, bool *finished)
955 {
956 	int err;
957 	int copy_len;
958 	struct io_recvmsg_multishot_hdr hdr;
959 
960 	if (kmsg->namelen)
961 		kmsg->msg.msg_name = &hdr.addr;
962 	kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
963 	kmsg->msg.msg_namelen = 0;
964 
965 	if (sock->file->f_flags & O_NONBLOCK)
966 		flags |= MSG_DONTWAIT;
967 
968 	err = sock_recvmsg(sock, &kmsg->msg, flags);
969 	*finished = err <= 0;
970 	if (err < 0)
971 		return err;
972 
973 	hdr.msg = (struct io_uring_recvmsg_out) {
974 		.controllen = kmsg->controllen - kmsg->msg.msg_controllen,
975 		.flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
976 	};
977 
978 	hdr.msg.payloadlen = err;
979 	if (err > kmsg->payloadlen)
980 		err = kmsg->payloadlen;
981 
982 	copy_len = sizeof(struct io_uring_recvmsg_out);
983 	if (kmsg->msg.msg_namelen > kmsg->namelen)
984 		copy_len += kmsg->namelen;
985 	else
986 		copy_len += kmsg->msg.msg_namelen;
987 
988 	/*
989 	 *      "fromlen shall refer to the value before truncation.."
990 	 *                      1003.1g
991 	 */
992 	hdr.msg.namelen = kmsg->msg.msg_namelen;
993 
994 	/* ensure that there is no gap between hdr and sockaddr_storage */
995 	BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
996 		     sizeof(struct io_uring_recvmsg_out));
997 	if (copy_to_user(io->buf, &hdr, copy_len)) {
998 		*finished = true;
999 		return -EFAULT;
1000 	}
1001 
1002 	return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
1003 			kmsg->controllen + err;
1004 }
1005 
1006 int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
1007 {
1008 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1009 	struct io_async_msghdr *kmsg = req->async_data;
1010 	struct io_br_sel sel = { };
1011 	struct socket *sock;
1012 	unsigned flags;
1013 	int ret, min_ret = 0;
1014 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1015 	bool mshot_finished = true;
1016 
1017 	sock = sock_from_file(req->file);
1018 	if (unlikely(!sock))
1019 		return -ENOTSOCK;
1020 
1021 	if (!(req->flags & REQ_F_POLLED) &&
1022 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1023 		return -EAGAIN;
1024 
1025 	flags = sr->msg_flags;
1026 	if (force_nonblock)
1027 		flags |= MSG_DONTWAIT;
1028 
1029 retry_multishot:
1030 	sel.buf_list = NULL;
1031 	if (io_do_buffer_select(req)) {
1032 		size_t len = sr->len;
1033 
1034 		sel = io_buffer_select(req, &len, sr->buf_group, issue_flags);
1035 		if (!sel.addr)
1036 			return -ENOBUFS;
1037 
1038 		if (req->flags & REQ_F_APOLL_MULTISHOT) {
1039 			ret = io_recvmsg_prep_multishot(kmsg, sr, &sel.addr, &len);
1040 			if (ret) {
1041 				io_kbuf_recycle(req, sel.buf_list, issue_flags);
1042 				return ret;
1043 			}
1044 		}
1045 
1046 		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, sel.addr, len);
1047 	}
1048 
1049 	kmsg->msg.msg_get_inq = 1;
1050 	kmsg->msg.msg_inq = -1;
1051 	if (req->flags & REQ_F_APOLL_MULTISHOT) {
1052 		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
1053 					   &mshot_finished);
1054 	} else {
1055 		/* disable partial retry for recvmsg with cmsg attached */
1056 		if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
1057 			min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1058 
1059 		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
1060 					 kmsg->uaddr, flags);
1061 	}
1062 
1063 	if (ret < min_ret) {
1064 		if (ret == -EAGAIN && force_nonblock) {
1065 			io_kbuf_recycle(req, sel.buf_list, issue_flags);
1066 			return IOU_RETRY;
1067 		}
1068 		if (ret > 0 && io_net_retry(sock, flags)) {
1069 			sr->done_io += ret;
1070 			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
1071 		}
1072 		if (ret == -ERESTARTSYS)
1073 			ret = -EINTR;
1074 		req_set_fail(req);
1075 	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
1076 		req_set_fail(req);
1077 	}
1078 
1079 	if (ret > 0)
1080 		ret += sr->done_io;
1081 	else if (sr->done_io)
1082 		ret = sr->done_io;
1083 	else
1084 		io_kbuf_recycle(req, sel.buf_list, issue_flags);
1085 
1086 	sel.val = ret;
1087 	if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags))
1088 		goto retry_multishot;
1089 
1090 	return sel.val;
1091 }
1092 
1093 static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
1094 			      struct io_br_sel *sel, unsigned int issue_flags)
1095 {
1096 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1097 	int ret;
1098 
1099 	/*
1100 	 * If the ring isn't locked, then don't use the peek interface
1101 	 * to grab multiple buffers as we will lock/unlock between
1102 	 * this selection and posting the buffers.
1103 	 */
1104 	if (!(issue_flags & IO_URING_F_UNLOCKED) &&
1105 	    sr->flags & IORING_RECVSEND_BUNDLE) {
1106 		struct buf_sel_arg arg = {
1107 			.iovs = &kmsg->fast_iov,
1108 			.nr_iovs = 1,
1109 			.mode = KBUF_MODE_EXPAND,
1110 			.buf_group = sr->buf_group,
1111 		};
1112 
1113 		if (kmsg->vec.iovec) {
1114 			arg.nr_iovs = kmsg->vec.nr;
1115 			arg.iovs = kmsg->vec.iovec;
1116 			arg.mode |= KBUF_MODE_FREE;
1117 		}
1118 
1119 		if (sel->val)
1120 			arg.max_len = sel->val;
1121 		else if (kmsg->msg.msg_inq > 1)
1122 			arg.max_len = min_not_zero(sel->val, (ssize_t) kmsg->msg.msg_inq);
1123 
1124 		/* if mshot limited, ensure we don't go over */
1125 		if (sr->flags & IORING_RECV_MSHOT_LIM)
1126 			arg.max_len = min_not_zero(arg.max_len, sr->mshot_total_len);
1127 		ret = io_buffers_peek(req, &arg, sel);
1128 		if (unlikely(ret < 0))
1129 			return ret;
1130 
1131 		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
1132 			kmsg->vec.nr = ret;
1133 			kmsg->vec.iovec = arg.iovs;
1134 			req->flags |= REQ_F_NEED_CLEANUP;
1135 		}
1136 		if (arg.partial_map)
1137 			sr->flags |= IORING_RECV_PARTIAL_MAP;
1138 
1139 		/* special case 1 vec, can be a fast path */
1140 		if (ret == 1) {
1141 			sr->buf = arg.iovs[0].iov_base;
1142 			sr->len = arg.iovs[0].iov_len;
1143 			goto map_ubuf;
1144 		}
1145 		iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
1146 				arg.out_len);
1147 	} else {
1148 		size_t len = sel->val;
1149 
1150 		*sel = io_buffer_select(req, &len, sr->buf_group, issue_flags);
1151 		if (!sel->addr)
1152 			return -ENOBUFS;
1153 		sr->buf = sel->addr;
1154 		sr->len = len;
1155 map_ubuf:
1156 		ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
1157 				  &kmsg->msg.msg_iter);
1158 		if (unlikely(ret))
1159 			return ret;
1160 	}
1161 
1162 	return 0;
1163 }
1164 
1165 int io_recv(struct io_kiocb *req, unsigned int issue_flags)
1166 {
1167 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1168 	struct io_async_msghdr *kmsg = req->async_data;
1169 	struct io_br_sel sel;
1170 	struct socket *sock;
1171 	unsigned flags;
1172 	int ret, min_ret = 0;
1173 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1174 	bool mshot_finished;
1175 
1176 	if (!(req->flags & REQ_F_POLLED) &&
1177 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1178 		return -EAGAIN;
1179 
1180 	sock = sock_from_file(req->file);
1181 	if (unlikely(!sock))
1182 		return -ENOTSOCK;
1183 
1184 	flags = sr->msg_flags;
1185 	if (force_nonblock)
1186 		flags |= MSG_DONTWAIT;
1187 
1188 retry_multishot:
1189 	sel.buf_list = NULL;
1190 	if (io_do_buffer_select(req)) {
1191 		sel.val = sr->len;
1192 		ret = io_recv_buf_select(req, kmsg, &sel, issue_flags);
1193 		if (unlikely(ret < 0)) {
1194 			kmsg->msg.msg_inq = -1;
1195 			goto out_free;
1196 		}
1197 		sr->buf = NULL;
1198 	}
1199 
1200 	kmsg->msg.msg_flags = 0;
1201 	kmsg->msg.msg_inq = -1;
1202 
1203 	if (flags & MSG_WAITALL)
1204 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1205 
1206 	ret = sock_recvmsg(sock, &kmsg->msg, flags);
1207 	if (ret < min_ret) {
1208 		if (ret == -EAGAIN && force_nonblock) {
1209 			io_kbuf_recycle(req, sel.buf_list, issue_flags);
1210 			return IOU_RETRY;
1211 		}
1212 		if (ret > 0 && io_net_retry(sock, flags)) {
1213 			sr->len -= ret;
1214 			sr->buf += ret;
1215 			sr->done_io += ret;
1216 			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
1217 		}
1218 		if (ret == -ERESTARTSYS)
1219 			ret = -EINTR;
1220 		req_set_fail(req);
1221 	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
1222 out_free:
1223 		req_set_fail(req);
1224 	}
1225 
1226 	mshot_finished = ret <= 0;
1227 	if (ret > 0)
1228 		ret += sr->done_io;
1229 	else if (sr->done_io)
1230 		ret = sr->done_io;
1231 	else
1232 		io_kbuf_recycle(req, sel.buf_list, issue_flags);
1233 
1234 	sel.val = ret;
1235 	if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags))
1236 		goto retry_multishot;
1237 
1238 	return sel.val;
1239 }
1240 
1241 int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1242 {
1243 	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
1244 	unsigned ifq_idx;
1245 
1246 	if (unlikely(sqe->addr2 || sqe->addr || sqe->addr3))
1247 		return -EINVAL;
1248 
1249 	ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx);
1250 	zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx);
1251 	if (!zc->ifq)
1252 		return -EINVAL;
1253 
1254 	zc->len = READ_ONCE(sqe->len);
1255 	zc->flags = READ_ONCE(sqe->ioprio);
1256 	zc->msg_flags = READ_ONCE(sqe->msg_flags);
1257 	if (zc->msg_flags)
1258 		return -EINVAL;
1259 	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT))
1260 		return -EINVAL;
1261 	/* multishot required */
1262 	if (!(zc->flags & IORING_RECV_MULTISHOT))
1263 		return -EINVAL;
1264 	/* All data completions are posted as aux CQEs. */
1265 	req->flags |= REQ_F_APOLL_MULTISHOT;
1266 
1267 	return 0;
1268 }
1269 
1270 int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
1271 {
1272 	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
1273 	struct socket *sock;
1274 	unsigned int len;
1275 	int ret;
1276 
1277 	if (!(req->flags & REQ_F_POLLED) &&
1278 	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
1279 		return -EAGAIN;
1280 
1281 	sock = sock_from_file(req->file);
1282 	if (unlikely(!sock))
1283 		return -ENOTSOCK;
1284 
1285 	len = zc->len;
1286 	ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT,
1287 			   issue_flags, &zc->len);
1288 	if (len && zc->len == 0) {
1289 		io_req_set_res(req, 0, 0);
1290 
1291 		return IOU_COMPLETE;
1292 	}
1293 	if (unlikely(ret <= 0) && ret != -EAGAIN) {
1294 		if (ret == -ERESTARTSYS)
1295 			ret = -EINTR;
1296 		if (ret == IOU_REQUEUE)
1297 			return IOU_REQUEUE;
1298 
1299 		req_set_fail(req);
1300 		io_req_set_res(req, ret, 0);
1301 		return IOU_COMPLETE;
1302 	}
1303 	return IOU_RETRY;
1304 }
1305 
1306 void io_send_zc_cleanup(struct io_kiocb *req)
1307 {
1308 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1309 	struct io_async_msghdr *io = req->async_data;
1310 
1311 	if (req_has_async_data(req))
1312 		io_netmsg_iovec_free(io);
1313 	if (zc->notif) {
1314 		io_notif_flush(zc->notif);
1315 		zc->notif = NULL;
1316 	}
1317 }
1318 
1319 #define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
1320 #define IO_ZC_FLAGS_VALID  (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE | \
1321 				IORING_SEND_VECTORIZED)
1322 
1323 int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1324 {
1325 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1326 	struct io_ring_ctx *ctx = req->ctx;
1327 	struct io_async_msghdr *iomsg;
1328 	struct io_kiocb *notif;
1329 	int ret;
1330 
1331 	zc->done_io = 0;
1332 
1333 	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
1334 		return -EINVAL;
1335 	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
1336 	if (req->flags & REQ_F_CQE_SKIP)
1337 		return -EINVAL;
1338 
1339 	notif = zc->notif = io_alloc_notif(ctx);
1340 	if (!notif)
1341 		return -ENOMEM;
1342 	notif->cqe.user_data = req->cqe.user_data;
1343 	notif->cqe.res = 0;
1344 	notif->cqe.flags = IORING_CQE_F_NOTIF;
1345 	req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY;
1346 
1347 	zc->flags = READ_ONCE(sqe->ioprio);
1348 	if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
1349 		if (zc->flags & ~IO_ZC_FLAGS_VALID)
1350 			return -EINVAL;
1351 		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
1352 			struct io_notif_data *nd = io_notif_to_data(notif);
1353 
1354 			nd->zc_report = true;
1355 			nd->zc_used = false;
1356 			nd->zc_copied = false;
1357 		}
1358 	}
1359 
1360 	zc->len = READ_ONCE(sqe->len);
1361 	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
1362 	req->buf_index = READ_ONCE(sqe->buf_index);
1363 	if (zc->msg_flags & MSG_DONTWAIT)
1364 		req->flags |= REQ_F_NOWAIT;
1365 
1366 	if (io_is_compat(req->ctx))
1367 		zc->msg_flags |= MSG_CMSG_COMPAT;
1368 
1369 	iomsg = io_msg_alloc_async(req);
1370 	if (unlikely(!iomsg))
1371 		return -ENOMEM;
1372 
1373 	if (req->opcode == IORING_OP_SEND_ZC) {
1374 		ret = io_send_setup(req, sqe);
1375 	} else {
1376 		if (unlikely(sqe->addr2 || sqe->file_index))
1377 			return -EINVAL;
1378 		ret = io_sendmsg_setup(req, sqe);
1379 	}
1380 	if (unlikely(ret))
1381 		return ret;
1382 
1383 	if (!(zc->flags & IORING_RECVSEND_FIXED_BUF)) {
1384 		iomsg->msg.sg_from_iter = io_sg_from_iter_iovec;
1385 		return io_notif_account_mem(zc->notif, iomsg->msg.msg_iter.count);
1386 	}
1387 	iomsg->msg.sg_from_iter = io_sg_from_iter;
1388 	return 0;
1389 }
1390 
1391 static int io_sg_from_iter_iovec(struct sk_buff *skb,
1392 				 struct iov_iter *from, size_t length)
1393 {
1394 	skb_zcopy_downgrade_managed(skb);
1395 	return zerocopy_fill_skb_from_iter(skb, from, length);
1396 }
1397 
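/*
 * Fill skb frags straight from a bvec iterator without taking page
 * references, marking the skb as having managed frags. If the skb already
 * carries unmanaged frags, fall back to the generic fill path. Returns
 * -EMSGSIZE if the frag slots run out before the data does.
 */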
1398 static int io_sg_from_iter(struct sk_buff *skb,
1399 			   struct iov_iter *from, size_t length)
1400 {
1401 	struct skb_shared_info *shinfo = skb_shinfo(skb);
1402 	int frag = shinfo->nr_frags;
1403 	int ret = 0;
1404 	struct bvec_iter bi;
1405 	ssize_t copied = 0;
1406 	unsigned long truesize = 0;
1407 
1408 	if (!frag)
1409 		shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
1410 	else if (unlikely(!skb_zcopy_managed(skb)))
1411 		return zerocopy_fill_skb_from_iter(skb, from, length);
1412 
1413 	bi.bi_size = min(from->count, length);
1414 	bi.bi_bvec_done = from->iov_offset;
1415 	bi.bi_idx = 0;
1416 
1417 	while (bi.bi_size && frag < MAX_SKB_FRAGS) {
1418 		struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);
1419 
1420 		copied += v.bv_len;
1421 		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
1422 		__skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
1423 					   v.bv_offset, v.bv_len);
1424 		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
1425 	}
1426 	if (bi.bi_size)
1427 		ret = -EMSGSIZE;
1428 
1429 	shinfo->nr_frags = frag;
1430 	from->bvec += bi.bi_idx;
1431 	from->nr_segs -= bi.bi_idx;
1432 	from->count -= copied;
1433 	from->iov_offset = bi.bi_bvec_done;
1434 
1435 	skb->data_len += copied;
1436 	skb->len += copied;
1437 	skb->truesize += truesize;
1438 	return ret;
1439 }
1440 
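/*
 * Import the registered buffer for a fixed-buffer zerocopy send. The import
 * is done on behalf of the notification request, tying the buffer's
 * lifetime to the notification completion.
 */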
1441 static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
1442 {
1443 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1444 	struct io_async_msghdr *kmsg = req->async_data;
1445 
1446 	WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF));
1447 
1448 	sr->notif->buf_index = req->buf_index;
1449 	return io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter,
1450 				(u64)(uintptr_t)sr->buf, sr->len,
1451 				ITER_SOURCE, issue_flags);
1452 }
1453 
1454 int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
1455 {
1456 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1457 	struct io_async_msghdr *kmsg = req->async_data;
1458 	struct socket *sock;
1459 	unsigned msg_flags;
1460 	int ret, min_ret = 0;
1461 
1462 	sock = sock_from_file(req->file);
1463 	if (unlikely(!sock))
1464 		return -ENOTSOCK;
1465 	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
1466 		return -EOPNOTSUPP;
1467 
1468 	if (!(req->flags & REQ_F_POLLED) &&
1469 	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
1470 		return -EAGAIN;
1471 
1472 	if (req->flags & REQ_F_IMPORT_BUFFER) {
1473 		req->flags &= ~REQ_F_IMPORT_BUFFER;
1474 		ret = io_send_zc_import(req, issue_flags);
1475 		if (unlikely(ret))
1476 			return ret;
1477 	}
1478 
1479 	msg_flags = zc->msg_flags;
1480 	if (issue_flags & IO_URING_F_NONBLOCK)
1481 		msg_flags |= MSG_DONTWAIT;
1482 	if (msg_flags & MSG_WAITALL)
1483 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1484 	msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
1485 
1486 	kmsg->msg.msg_flags = msg_flags;
1487 	kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
1488 	ret = sock_sendmsg(sock, &kmsg->msg);
1489 
1490 	if (unlikely(ret < min_ret)) {
1491 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1492 			return -EAGAIN;
1493 
1494 		if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) {
1495 			zc->len -= ret;
1496 			zc->buf += ret;
1497 			zc->done_io += ret;
1498 			return -EAGAIN;
1499 		}
1500 		if (ret == -ERESTARTSYS)
1501 			ret = -EINTR;
1502 		req_set_fail(req);
1503 	}
1504 
1505 	if (ret >= 0)
1506 		ret += zc->done_io;
1507 	else if (zc->done_io)
1508 		ret = zc->done_io;
1509 
1510 	/*
1511 	 * If we're in io-wq we can't rely on tw ordering guarantees, so defer
1512 	 * flushing the notif to io_send_zc_cleanup().
1513 	 */
1514 	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1515 		io_notif_flush(zc->notif);
1516 		zc->notif = NULL;
1517 		io_req_msg_cleanup(req, 0);
1518 	}
1519 	io_req_set_res(req, ret, IORING_CQE_F_MORE);
1520 	return IOU_COMPLETE;
1521 }
1522 
1523 int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
1524 {
1525 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1526 	struct io_async_msghdr *kmsg = req->async_data;
1527 	struct socket *sock;
1528 	unsigned flags;
1529 	int ret, min_ret = 0;
1530 
1531 	if (req->flags & REQ_F_IMPORT_BUFFER) {
1532 		unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
1533 		int ret;
1534 
1535 		ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, req,
1536 					&kmsg->vec, uvec_segs, issue_flags);
1537 		if (unlikely(ret))
1538 			return ret;
1539 		req->flags &= ~REQ_F_IMPORT_BUFFER;
1540 	}
1541 
1542 	sock = sock_from_file(req->file);
1543 	if (unlikely(!sock))
1544 		return -ENOTSOCK;
1545 	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
1546 		return -EOPNOTSUPP;
1547 
1548 	if (!(req->flags & REQ_F_POLLED) &&
1549 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1550 		return -EAGAIN;
1551 
1552 	flags = sr->msg_flags;
1553 	if (issue_flags & IO_URING_F_NONBLOCK)
1554 		flags |= MSG_DONTWAIT;
1555 	if (flags & MSG_WAITALL)
1556 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1557 
1558 	kmsg->msg.msg_control_user = sr->msg_control;
1559 	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
1560 	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
1561 
1562 	if (unlikely(ret < min_ret)) {
1563 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1564 			return -EAGAIN;
1565 
1566 		if (ret > 0 && io_net_retry(sock, flags)) {
1567 			sr->done_io += ret;
1568 			return -EAGAIN;
1569 		}
1570 		if (ret == -ERESTARTSYS)
1571 			ret = -EINTR;
1572 		req_set_fail(req);
1573 	}
1574 
1575 	if (ret >= 0)
1576 		ret += sr->done_io;
1577 	else if (sr->done_io)
1578 		ret = sr->done_io;
1579 
1580 	/*
1581 	 * If we're in io-wq we can't rely on tw ordering guarantees, so defer
1582 	 * flushing the notif to io_send_zc_cleanup().
1583 	 */
1584 	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1585 		io_notif_flush(sr->notif);
1586 		sr->notif = NULL;
1587 		io_req_msg_cleanup(req, 0);
1588 	}
1589 	io_req_set_res(req, ret, IORING_CQE_F_MORE);
1590 	return IOU_COMPLETE;
1591 }
1592 
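/*
 * Failure path for send/recv requests: report any partial progress as the
 * result, and for zerocopy sends that still own a notification, flag
 * IORING_CQE_F_MORE so userspace still expects the notification CQE.
 */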
1593 void io_sendrecv_fail(struct io_kiocb *req)
1594 {
1595 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1596 
1597 	if (sr->done_io)
1598 		req->cqe.res = sr->done_io;
1599 
1600 	if ((req->flags & REQ_F_NEED_CLEANUP) &&
1601 	    (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC))
1602 		req->cqe.flags |= IORING_CQE_F_MORE;
1603 }
1604 
1605 #define ACCEPT_FLAGS	(IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \
1606 			 IORING_ACCEPT_POLL_FIRST)
1607 
1608 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1609 {
1610 	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1611 
1612 	if (sqe->len || sqe->buf_index)
1613 		return -EINVAL;
1614 
1615 	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1616 	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
1617 	accept->flags = READ_ONCE(sqe->accept_flags);
1618 	accept->nofile = rlimit(RLIMIT_NOFILE);
1619 	accept->iou_flags = READ_ONCE(sqe->ioprio);
1620 	if (accept->iou_flags & ~ACCEPT_FLAGS)
1621 		return -EINVAL;
1622 
1623 	accept->file_slot = READ_ONCE(sqe->file_index);
1624 	if (accept->file_slot) {
1625 		if (accept->flags & SOCK_CLOEXEC)
1626 			return -EINVAL;
1627 		if (accept->iou_flags & IORING_ACCEPT_MULTISHOT &&
1628 		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
1629 			return -EINVAL;
1630 	}
1631 	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1632 		return -EINVAL;
1633 	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
1634 		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1635 	if (accept->iou_flags & IORING_ACCEPT_MULTISHOT)
1636 		req->flags |= REQ_F_APOLL_MULTISHOT;
1637 	if (accept->iou_flags & IORING_ACCEPT_DONTWAIT)
1638 		req->flags |= REQ_F_NOWAIT;
1639 	return 0;
1640 }
1641 
1642 int io_accept(struct io_kiocb *req, unsigned int issue_flags)
1643 {
1644 	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1645 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1646 	bool fixed = !!accept->file_slot;
1647 	struct proto_accept_arg arg = {
1648 		.flags = force_nonblock ? O_NONBLOCK : 0,
1649 	};
1650 	struct file *file;
1651 	unsigned cflags;
1652 	int ret, fd;
1653 
1654 	if (!(req->flags & REQ_F_POLLED) &&
1655 	    accept->iou_flags & IORING_ACCEPT_POLL_FIRST)
1656 		return -EAGAIN;
1657 
1658 retry:
1659 	if (!fixed) {
1660 		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
1661 		if (unlikely(fd < 0))
1662 			return fd;
1663 	}
1664 	arg.err = 0;
1665 	arg.is_empty = -1;
1666 	file = do_accept(req->file, &arg, accept->addr, accept->addr_len,
1667 			 accept->flags);
1668 	if (IS_ERR(file)) {
1669 		if (!fixed)
1670 			put_unused_fd(fd);
1671 		ret = PTR_ERR(file);
1672 		if (ret == -EAGAIN && force_nonblock &&
1673 		    !(accept->iou_flags & IORING_ACCEPT_DONTWAIT))
1674 			return IOU_RETRY;
1675 
1676 		if (ret == -ERESTARTSYS)
1677 			ret = -EINTR;
1678 	} else if (!fixed) {
1679 		fd_install(fd, file);
1680 		ret = fd;
1681 	} else {
1682 		ret = io_fixed_fd_install(req, issue_flags, file,
1683 						accept->file_slot);
1684 	}
1685 
1686 	cflags = 0;
1687 	if (!arg.is_empty)
1688 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
1689 
1690 	if (ret >= 0 && (req->flags & REQ_F_APOLL_MULTISHOT) &&
1691 	    io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
1692 		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1)
1693 			goto retry;
1694 		return IOU_RETRY;
1695 	}
1696 
1697 	io_req_set_res(req, ret, cflags);
1698 	if (ret < 0)
1699 		req_set_fail(req);
1700 	return IOU_COMPLETE;
1701 }
1702 
1703 int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1704 {
1705 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1706 
1707 	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
1708 		return -EINVAL;
1709 
1710 	sock->domain = READ_ONCE(sqe->fd);
1711 	sock->type = READ_ONCE(sqe->off);
1712 	sock->protocol = READ_ONCE(sqe->len);
1713 	sock->file_slot = READ_ONCE(sqe->file_index);
1714 	sock->nofile = rlimit(RLIMIT_NOFILE);
1715 
1716 	sock->flags = sock->type & ~SOCK_TYPE_MASK;
1717 	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
1718 		return -EINVAL;
1719 	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1720 		return -EINVAL;
1721 	return 0;
1722 }
1723 
1724 int io_socket(struct io_kiocb *req, unsigned int issue_flags)
1725 {
1726 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1727 	bool fixed = !!sock->file_slot;
1728 	struct file *file;
1729 	int ret, fd;
1730 
1731 	if (!fixed) {
1732 		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
1733 		if (unlikely(fd < 0))
1734 			return fd;
1735 	}
1736 	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
1737 	if (IS_ERR(file)) {
1738 		if (!fixed)
1739 			put_unused_fd(fd);
1740 		ret = PTR_ERR(file);
1741 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1742 			return -EAGAIN;
1743 		if (ret == -ERESTARTSYS)
1744 			ret = -EINTR;
1745 		req_set_fail(req);
1746 	} else if (!fixed) {
1747 		fd_install(fd, file);
1748 		ret = fd;
1749 	} else {
1750 		ret = io_fixed_fd_install(req, issue_flags, file,
1751 					    sock->file_slot);
1752 	}
1753 	io_req_set_res(req, ret, 0);
1754 	return IOU_COMPLETE;
1755 }
1756 
1757 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1758 {
1759 	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
1760 	struct io_async_msghdr *io;
1761 
1762 	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
1763 		return -EINVAL;
1764 
1765 	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1766 	conn->addr_len = READ_ONCE(sqe->addr2);
1767 	conn->in_progress = conn->seen_econnaborted = false;
1768 
1769 	io = io_msg_alloc_async(req);
1770 	if (unlikely(!io))
1771 		return -ENOMEM;
1772 
1773 	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr);
1774 }
1775 
1776 int io_connect(struct io_kiocb *req, unsigned int issue_flags)
1777 {
1778 	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
1779 	struct io_async_msghdr *io = req->async_data;
1780 	unsigned file_flags;
1781 	int ret;
1782 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1783 
1784 	if (connect->in_progress) {
1785 		struct poll_table_struct pt = { ._key = EPOLLERR };
1786 
1787 		if (vfs_poll(req->file, &pt) & EPOLLERR)
1788 			goto get_sock_err;
1789 	}
1790 
1791 	file_flags = force_nonblock ? O_NONBLOCK : 0;
1792 
1793 	ret = __sys_connect_file(req->file, &io->addr, connect->addr_len,
1794 				 file_flags);
1795 	if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
1796 	    && force_nonblock) {
1797 		if (ret == -EINPROGRESS) {
1798 			connect->in_progress = true;
1799 		} else if (ret == -ECONNABORTED) {
1800 			if (connect->seen_econnaborted)
1801 				goto out;
1802 			connect->seen_econnaborted = true;
1803 		}
1804 		return -EAGAIN;
1805 	}
1806 	if (connect->in_progress) {
1807 		/*
1808 		 * At least bluetooth will return -EBADFD on a re-connect
1809 		 * attempt, and it's (supposedly) also valid to get -EISCONN
1810 		 * which means the previous result is good. For both of these,
1811 		 * grab the sock_error() and use that for the completion.
1812 		 */
1813 		if (ret == -EBADFD || ret == -EISCONN) {
1814 get_sock_err:
1815 			ret = sock_error(sock_from_file(req->file)->sk);
1816 		}
1817 	}
1818 	if (ret == -ERESTARTSYS)
1819 		ret = -EINTR;
1820 out:
1821 	if (ret < 0)
1822 		req_set_fail(req);
1823 	io_req_msg_cleanup(req, issue_flags);
1824 	io_req_set_res(req, ret, 0);
1825 	return IOU_COMPLETE;
1826 }
1827 
1828 int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1829 {
1830 	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
1831 	struct sockaddr __user *uaddr;
1832 	struct io_async_msghdr *io;
1833 
1834 	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
1835 		return -EINVAL;
1836 
1837 	uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1838 	bind->addr_len = READ_ONCE(sqe->addr2);
1839 
1840 	io = io_msg_alloc_async(req);
1841 	if (unlikely(!io))
1842 		return -ENOMEM;
1843 	return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr);
1844 }
1845 
1846 int io_bind(struct io_kiocb *req, unsigned int issue_flags)
1847 {
1848 	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
1849 	struct io_async_msghdr *io = req->async_data;
1850 	struct socket *sock;
1851 	int ret;
1852 
1853 	sock = sock_from_file(req->file);
1854 	if (unlikely(!sock))
1855 		return -ENOTSOCK;
1856 
1857 	ret = __sys_bind_socket(sock, &io->addr, bind->addr_len);
1858 	if (ret < 0)
1859 		req_set_fail(req);
1860 	io_req_set_res(req, ret, 0);
1861 	return 0;
1862 }
1863 
1864 int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1865 {
1866 	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
1867 
1868 	if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2)
1869 		return -EINVAL;
1870 
1871 	listen->backlog = READ_ONCE(sqe->len);
1872 	return 0;
1873 }
1874 
1875 int io_listen(struct io_kiocb *req, unsigned int issue_flags)
1876 {
1877 	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
1878 	struct socket *sock;
1879 	int ret;
1880 
1881 	sock = sock_from_file(req->file);
1882 	if (unlikely(!sock))
1883 		return -ENOTSOCK;
1884 
1885 	ret = __sys_listen_socket(sock, listen->backlog);
1886 	if (ret < 0)
1887 		req_set_fail(req);
1888 	io_req_set_res(req, ret, 0);
1889 	return 0;
1890 }
1891 
1892 void io_netmsg_cache_free(const void *entry)
1893 {
1894 	struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
1895 
1896 	io_vec_free(&kmsg->vec);
1897 	kfree(kmsg);
1898 }
1899