xref: /linux/io_uring/net.c (revision b948f9d5d3057b01188e36664e7c7604d1c8ecb5)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/kernel.h>
3 #include <linux/errno.h>
4 #include <linux/file.h>
5 #include <linux/slab.h>
6 #include <linux/net.h>
7 #include <linux/compat.h>
8 #include <net/compat.h>
9 #include <linux/io_uring.h>
10 
11 #include <uapi/linux/io_uring.h>
12 
13 #include "filetable.h"
14 #include "io_uring.h"
15 #include "kbuf.h"
16 #include "alloc_cache.h"
17 #include "net.h"
18 #include "notif.h"
19 #include "rsrc.h"
20 #include "zcrx.h"
21 
22 struct io_shutdown {
23 	struct file			*file;
24 	int				how;
25 };
26 
27 struct io_accept {
28 	struct file			*file;
29 	struct sockaddr __user		*addr;
30 	int __user			*addr_len;
31 	int				flags;
32 	int				iou_flags;
33 	u32				file_slot;
34 	unsigned long			nofile;
35 };
36 
37 struct io_socket {
38 	struct file			*file;
39 	int				domain;
40 	int				type;
41 	int				protocol;
42 	int				flags;
43 	u32				file_slot;
44 	unsigned long			nofile;
45 };
46 
47 struct io_connect {
48 	struct file			*file;
49 	struct sockaddr __user		*addr;
50 	int				addr_len;
51 	bool				in_progress;
52 	bool				seen_econnaborted;
53 };
54 
55 struct io_bind {
56 	struct file			*file;
57 	int				addr_len;
58 };
59 
60 struct io_listen {
61 	struct file			*file;
62 	int				backlog;
63 };
64 
65 struct io_sr_msg {
66 	struct file			*file;
67 	union {
68 		struct compat_msghdr __user	*umsg_compat;
69 		struct user_msghdr __user	*umsg;
70 		void __user			*buf;
71 	};
72 	int				len;
73 	unsigned			done_io;
74 	unsigned			msg_flags;
75 	unsigned			nr_multishot_loops;
76 	u16				flags;
77 	/* initialised and used only by !msg send variants */
78 	u16				buf_group;
79 	/* per-invocation mshot limit */
80 	unsigned			mshot_len;
81 	/* overall mshot byte limit */
82 	unsigned			mshot_total_len;
83 	void __user			*msg_control;
84 	/* used only for send zerocopy */
85 	struct io_kiocb 		*notif;
86 };
87 
88 /*
89  * The UAPI flags are the lower 8 bits, as that's all sqe->ioprio will hold
90  * anyway. Use the upper 8 bits for internal uses.
91  */
92 enum sr_retry_flags {
93 	IORING_RECV_RETRY	= (1U << 15),
94 	IORING_RECV_PARTIAL_MAP	= (1U << 14),
95 	IORING_RECV_MSHOT_CAP	= (1U << 13),
96 	IORING_RECV_MSHOT_LIM	= (1U << 12),
97 	IORING_RECV_MSHOT_DONE	= (1U << 11),
98 
99 	IORING_RECV_RETRY_CLEAR	= IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP,
100 	IORING_RECV_NO_RETRY	= IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP |
101 				  IORING_RECV_MSHOT_CAP | IORING_RECV_MSHOT_DONE,
102 };
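
/*
 * Editorial note: with the layout above, bits 11-15 of sr->flags carry
 * kernel-internal retry state, while the low byte carries the UAPI
 * IORING_RECVSEND_* and IORING_RECV_* flags taken verbatim from sqe->ioprio.
 */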
103 
104 /*
105  * Number of times we'll try to do receives if there's more data. If we
106  * exceed this limit, then add us to the back of the queue and retry from
107  * there. This preserves fairness when one connection floods us with data.
108  */
109 #define MULTISHOT_MAX_RETRY	32
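
/*
 * Editorial note: a peer that always has more data queued would otherwise
 * keep a multishot receive looping inline. Once io_recv_finish() has done
 * MULTISHOT_MAX_RETRY back-to-back receives it breaks out of the loop and
 * requeues the request (IOU_REQUEUE), so other requests on the ring get a
 * turn.
 */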
110 
111 struct io_recvzc {
112 	struct file			*file;
113 	u16				flags;
114 	u32				len;
115 	struct io_zcrx_ifq		*ifq;
116 };
117 
118 static int io_sg_from_iter_iovec(struct sk_buff *skb,
119 				 struct iov_iter *from, size_t length);
120 static int io_sg_from_iter(struct sk_buff *skb,
121 			   struct iov_iter *from, size_t length);
122 
123 int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
124 {
125 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
126 
127 	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
128 		     sqe->buf_index || sqe->splice_fd_in))
129 		return -EINVAL;
130 
131 	shutdown->how = READ_ONCE(sqe->len);
132 	req->flags |= REQ_F_FORCE_ASYNC;
133 	return 0;
134 }
135 
136 int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
137 {
138 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
139 	struct socket *sock;
140 	int ret;
141 
142 	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
143 
144 	sock = sock_from_file(req->file);
145 	if (unlikely(!sock))
146 		return -ENOTSOCK;
147 
148 	ret = __sys_shutdown_sock(sock, shutdown->how);
149 	io_req_set_res(req, ret, 0);
150 	return IOU_COMPLETE;
151 }
152 
153 static bool io_net_retry(struct socket *sock, int flags)
154 {
155 	if (!(flags & MSG_WAITALL))
156 		return false;
157 	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
158 }
159 
160 static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
161 {
162 	if (kmsg->vec.iovec)
163 		io_vec_free(&kmsg->vec);
164 }
165 
166 static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
167 {
168 	struct io_async_msghdr *hdr = req->async_data;
169 
170 	/* can't recycle, ensure we free the iovec if we have one */
171 	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
172 		io_netmsg_iovec_free(hdr);
173 		return;
174 	}
175 
176 	/* Let normal cleanup path reap it if we fail adding to the cache */
177 	io_alloc_cache_vec_kasan(&hdr->vec);
178 	if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP)
179 		io_vec_free(&hdr->vec);
180 
181 	if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr))
182 		io_req_async_data_clear(req, REQ_F_NEED_CLEANUP);
183 }
184 
185 static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
186 {
187 	struct io_ring_ctx *ctx = req->ctx;
188 	struct io_async_msghdr *hdr;
189 
190 	hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req);
191 	if (!hdr)
192 		return NULL;
193 
194 	/* If the async data was cached, we might have an iov cached inside. */
195 	if (hdr->vec.iovec)
196 		req->flags |= REQ_F_NEED_CLEANUP;
197 	return hdr;
198 }
199 
200 static inline void io_mshot_prep_retry(struct io_kiocb *req,
201 				       struct io_async_msghdr *kmsg)
202 {
203 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
204 
205 	req->flags &= ~REQ_F_BL_EMPTY;
206 	sr->done_io = 0;
207 	sr->flags &= ~IORING_RECV_RETRY_CLEAR;
208 	sr->len = sr->mshot_len;
209 }
210 
211 static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg,
212 			     const struct iovec __user *uiov, unsigned uvec_seg,
213 			     int ddir)
214 {
215 	struct iovec *iov;
216 	int ret, nr_segs;
217 
218 	if (iomsg->vec.iovec) {
219 		nr_segs = iomsg->vec.nr;
220 		iov = iomsg->vec.iovec;
221 	} else {
222 		nr_segs = 1;
223 		iov = &iomsg->fast_iov;
224 	}
225 
226 	ret = __import_iovec(ddir, uiov, uvec_seg, nr_segs, &iov,
227 			     &iomsg->msg.msg_iter, io_is_compat(req->ctx));
228 	if (unlikely(ret < 0))
229 		return ret;
230 
231 	if (iov) {
232 		req->flags |= REQ_F_NEED_CLEANUP;
233 		io_vec_reset_iovec(&iomsg->vec, iov, iomsg->msg.msg_iter.nr_segs);
234 	}
235 	return 0;
236 }
237 
238 static int io_compat_msg_copy_hdr(struct io_kiocb *req,
239 				  struct io_async_msghdr *iomsg,
240 				  struct compat_msghdr *msg, int ddir,
241 				  struct sockaddr __user **save_addr)
242 {
243 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
244 	struct compat_iovec __user *uiov;
245 	int ret;
246 
247 	if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
248 		return -EFAULT;
249 
250 	ret = __get_compat_msghdr(&iomsg->msg, msg, save_addr);
251 	if (ret)
252 		return ret;
253 
254 	uiov = compat_ptr(msg->msg_iov);
255 	if (req->flags & REQ_F_BUFFER_SELECT) {
256 		if (msg->msg_iovlen == 0) {
257 			sr->len = 0;
258 		} else if (msg->msg_iovlen > 1) {
259 			return -EINVAL;
260 		} else {
261 			struct compat_iovec tmp_iov;
262 
263 			if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov)))
264 				return -EFAULT;
265 			sr->len = tmp_iov.iov_len;
266 		}
267 	}
268 	return 0;
269 }
270 
271 static int io_copy_msghdr_from_user(struct user_msghdr *msg,
272 				    struct user_msghdr __user *umsg)
273 {
274 	if (!user_access_begin(umsg, sizeof(*umsg)))
275 		return -EFAULT;
276 	unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end);
277 	unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end);
278 	unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end);
279 	unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end);
280 	unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end);
281 	unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end);
282 	user_access_end();
283 	return 0;
284 ua_end:
285 	user_access_end();
286 	return -EFAULT;
287 }
288 
289 static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
290 			   struct user_msghdr *msg, int ddir,
291 			   struct sockaddr __user **save_addr)
292 {
293 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
294 	struct user_msghdr __user *umsg = sr->umsg;
295 	int ret;
296 
297 	iomsg->msg.msg_name = &iomsg->addr;
298 	iomsg->msg.msg_iter.nr_segs = 0;
299 
300 	if (io_is_compat(req->ctx)) {
301 		struct compat_msghdr cmsg;
302 
303 		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ddir, save_addr);
304 		if (ret)
305 			return ret;
306 
307 		memset(msg, 0, sizeof(*msg));
308 		msg->msg_namelen = cmsg.msg_namelen;
309 		msg->msg_controllen = cmsg.msg_controllen;
310 		msg->msg_iov = compat_ptr(cmsg.msg_iov);
311 		msg->msg_iovlen = cmsg.msg_iovlen;
312 		return 0;
313 	}
314 
315 	ret = io_copy_msghdr_from_user(msg, umsg);
316 	if (unlikely(ret))
317 		return ret;
318 
319 	msg->msg_flags = 0;
320 
321 	ret = __copy_msghdr(&iomsg->msg, msg, save_addr);
322 	if (ret)
323 		return ret;
324 
325 	if (req->flags & REQ_F_BUFFER_SELECT) {
326 		if (msg->msg_iovlen == 0) {
327 			sr->len = 0;
328 		} else if (msg->msg_iovlen > 1) {
329 			return -EINVAL;
330 		} else {
331 			struct iovec __user *uiov = msg->msg_iov;
332 			struct iovec tmp_iov;
333 
334 			if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov)))
335 				return -EFAULT;
336 			sr->len = tmp_iov.iov_len;
337 		}
338 	}
339 	return 0;
340 }
341 
342 void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
343 {
344 	struct io_async_msghdr *io = req->async_data;
345 
346 	io_netmsg_iovec_free(io);
347 }
348 
349 static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
350 {
351 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
352 	struct io_async_msghdr *kmsg = req->async_data;
353 	void __user *addr;
354 	u16 addr_len;
355 	int ret;
356 
357 	sr->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
358 
359 	if (READ_ONCE(sqe->__pad3[0]))
360 		return -EINVAL;
361 
362 	kmsg->msg.msg_name = NULL;
363 	kmsg->msg.msg_namelen = 0;
364 	kmsg->msg.msg_control = NULL;
365 	kmsg->msg.msg_controllen = 0;
366 	kmsg->msg.msg_ubuf = NULL;
367 
368 	addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
369 	addr_len = READ_ONCE(sqe->addr_len);
370 	if (addr) {
371 		ret = move_addr_to_kernel(addr, addr_len, &kmsg->addr);
372 		if (unlikely(ret < 0))
373 			return ret;
374 		kmsg->msg.msg_name = &kmsg->addr;
375 		kmsg->msg.msg_namelen = addr_len;
376 	}
377 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
378 		if (sr->flags & IORING_SEND_VECTORIZED)
379 			return -EINVAL;
380 		req->flags |= REQ_F_IMPORT_BUFFER;
381 		return 0;
382 	}
383 	if (req->flags & REQ_F_BUFFER_SELECT)
384 		return 0;
385 
386 	if (sr->flags & IORING_SEND_VECTORIZED)
387 		return io_net_import_vec(req, kmsg, sr->buf, sr->len, ITER_SOURCE);
388 
389 	return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
390 }
391 
392 static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
393 {
394 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
395 	struct io_async_msghdr *kmsg = req->async_data;
396 	struct user_msghdr msg;
397 	int ret;
398 
399 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
400 	ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL);
401 	if (unlikely(ret))
402 		return ret;
403 	/* save msg_control as sys_sendmsg() overwrites it */
404 	sr->msg_control = kmsg->msg.msg_control_user;
405 
406 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
407 		kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen;
408 		return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov,
409 					 msg.msg_iovlen);
410 	}
411 	if (req->flags & REQ_F_BUFFER_SELECT)
412 		return 0;
413 	return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE);
414 }
415 
416 #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE | IORING_SEND_VECTORIZED)
417 
418 int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
419 {
420 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
421 
422 	sr->done_io = 0;
423 	sr->len = READ_ONCE(sqe->len);
424 	if (unlikely(sr->len < 0))
425 		return -EINVAL;
426 	sr->flags = READ_ONCE(sqe->ioprio);
427 	if (sr->flags & ~SENDMSG_FLAGS)
428 		return -EINVAL;
429 	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
430 	if (sr->msg_flags & MSG_DONTWAIT)
431 		req->flags |= REQ_F_NOWAIT;
432 	if (req->flags & REQ_F_BUFFER_SELECT)
433 		sr->buf_group = req->buf_index;
434 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
435 		if (req->opcode == IORING_OP_SENDMSG)
436 			return -EINVAL;
437 		sr->msg_flags |= MSG_WAITALL;
438 		req->flags |= REQ_F_MULTISHOT;
439 	}
440 
441 	if (io_is_compat(req->ctx))
442 		sr->msg_flags |= MSG_CMSG_COMPAT;
443 
444 	if (unlikely(!io_msg_alloc_async(req)))
445 		return -ENOMEM;
446 	if (req->opcode != IORING_OP_SENDMSG)
447 		return io_send_setup(req, sqe);
448 	if (unlikely(sqe->addr2 || sqe->file_index))
449 		return -EINVAL;
450 	return io_sendmsg_setup(req, sqe);
451 }
452 
453 static void io_req_msg_cleanup(struct io_kiocb *req,
454 			       unsigned int issue_flags)
455 {
456 	io_netmsg_recycle(req, issue_flags);
457 }
458 
459 /*
460  * For bundle completions, we need to figure out how many segments we consumed.
461  * A bundle could be using a single ITER_UBUF if that's all we mapped, or it
462  * could be using an ITER_IOVEC. If the latter, and we consumed all of the
463  * segments, then it's a trivial question to answer. If we have residual
464  * data in the iter, then loop the segments to figure out how much we
465  * transferred.
466  */
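/*
 * Editorial example: with three iovecs of 8 bytes each and a short transfer
 * of ret == 12, the loop below counts iov[0] in full (8 bytes) and iov[1] in
 * part (4 bytes), so it reports 2 consumed buffers.
 */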
467 static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
468 {
469 	struct iovec *iov;
470 	int nbufs;
471 
472 	/* no data is always zero segments, and a ubuf is always 1 segment */
473 	if (ret <= 0)
474 		return 0;
475 	if (iter_is_ubuf(&kmsg->msg.msg_iter))
476 		return 1;
477 
478 	iov = kmsg->vec.iovec;
479 	if (!iov)
480 		iov = &kmsg->fast_iov;
481 
482 	/* if all data was transferred, it's basic pointer math */
483 	if (!iov_iter_count(&kmsg->msg.msg_iter))
484 		return iter_iov(&kmsg->msg.msg_iter) - iov;
485 
486 	/* short transfer, count segments */
487 	nbufs = 0;
488 	do {
489 		int this_len = min_t(int, iov[nbufs].iov_len, ret);
490 
491 		nbufs++;
492 		ret -= this_len;
493 	} while (ret);
494 
495 	return nbufs;
496 }
497 
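/*
 * Partial progress was made on a provided-buffer request: commit the buffers
 * actually consumed (if a commit is still pending) and flag the request so
 * the partially used selection is not recycled back to the list, then ask
 * the core to retry the remainder.
 */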
498 static int io_net_kbuf_recyle(struct io_kiocb *req, struct io_buffer_list *bl,
499 			      struct io_async_msghdr *kmsg, int len)
500 {
501 	req->flags |= REQ_F_BL_NO_RECYCLE;
502 	if (req->flags & REQ_F_BUFFERS_COMMIT)
503 		io_kbuf_commit(req, bl, len, io_bundle_nbufs(kmsg, len));
504 	return IOU_RETRY;
505 }
506 
507 static inline bool io_send_finish(struct io_kiocb *req,
508 				  struct io_async_msghdr *kmsg,
509 				  struct io_br_sel *sel)
510 {
511 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
512 	bool bundle_finished = sel->val <= 0;
513 	unsigned int cflags;
514 
515 	if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
516 		cflags = io_put_kbuf(req, sel->val, sel->buf_list);
517 		goto finish;
518 	}
519 
520 	cflags = io_put_kbufs(req, sel->val, sel->buf_list, io_bundle_nbufs(kmsg, sel->val));
521 
522 	/*
523 	 * Don't start new bundles if the buffer list is empty, or if the
524 	 * current operation needed to go through polling to complete.
525 	 */
526 	if (bundle_finished || req->flags & (REQ_F_BL_EMPTY | REQ_F_POLLED))
527 		goto finish;
528 
529 	/*
530 	 * Fill CQE for this send and see if we should keep trying to
531 	 * send on this socket.
532 	 */
533 	if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) {
534 		io_mshot_prep_retry(req, kmsg);
535 		return false;
536 	}
537 
538 	/* Otherwise stop bundle and use the current result. */
539 finish:
540 	io_req_set_res(req, sel->val, cflags);
541 	sel->val = IOU_COMPLETE;
542 	return true;
543 }
544 
545 int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
546 {
547 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
548 	struct io_async_msghdr *kmsg = req->async_data;
549 	struct socket *sock;
550 	unsigned flags;
551 	int min_ret = 0;
552 	int ret;
553 
554 	sock = sock_from_file(req->file);
555 	if (unlikely(!sock))
556 		return -ENOTSOCK;
557 
558 	if (!(req->flags & REQ_F_POLLED) &&
559 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
560 		return -EAGAIN;
561 
562 	flags = sr->msg_flags;
563 	if (issue_flags & IO_URING_F_NONBLOCK)
564 		flags |= MSG_DONTWAIT;
565 	if (flags & MSG_WAITALL)
566 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
567 
568 	kmsg->msg.msg_control_user = sr->msg_control;
569 
570 	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
571 
572 	if (ret < min_ret) {
573 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
574 			return -EAGAIN;
575 		if (ret > 0 && io_net_retry(sock, flags)) {
576 			kmsg->msg.msg_controllen = 0;
577 			kmsg->msg.msg_control = NULL;
578 			sr->done_io += ret;
579 			return -EAGAIN;
580 		}
581 		if (ret == -ERESTARTSYS)
582 			ret = -EINTR;
583 		req_set_fail(req);
584 	}
585 	io_req_msg_cleanup(req, issue_flags);
586 	if (ret >= 0)
587 		ret += sr->done_io;
588 	else if (sr->done_io)
589 		ret = sr->done_io;
590 	io_req_set_res(req, ret, 0);
591 	return IOU_COMPLETE;
592 }
593 
594 static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
595 				 struct io_br_sel *sel, struct io_async_msghdr *kmsg)
596 {
597 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
598 	struct buf_sel_arg arg = {
599 		.iovs = &kmsg->fast_iov,
600 		.max_len = min_not_zero(sr->len, INT_MAX),
601 		.nr_iovs = 1,
602 		.buf_group = sr->buf_group,
603 	};
604 	int ret;
605 
606 	if (kmsg->vec.iovec) {
607 		arg.nr_iovs = kmsg->vec.nr;
608 		arg.iovs = kmsg->vec.iovec;
609 		arg.mode = KBUF_MODE_FREE;
610 	}
611 
612 	if (!(sr->flags & IORING_RECVSEND_BUNDLE))
613 		arg.nr_iovs = 1;
614 	else
615 		arg.mode |= KBUF_MODE_EXPAND;
616 
617 	ret = io_buffers_select(req, &arg, sel, issue_flags);
618 	if (unlikely(ret < 0))
619 		return ret;
620 
621 	if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
622 		kmsg->vec.nr = ret;
623 		kmsg->vec.iovec = arg.iovs;
624 		req->flags |= REQ_F_NEED_CLEANUP;
625 	}
626 	sr->len = arg.out_len;
627 
628 	if (ret == 1) {
629 		sr->buf = arg.iovs[0].iov_base;
630 		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
631 					&kmsg->msg.msg_iter);
632 		if (unlikely(ret))
633 			return ret;
634 	} else {
635 		iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE,
636 				arg.iovs, ret, arg.out_len);
637 	}
638 
639 	return 0;
640 }
641 
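/*
 * Editorial sketch (not part of this file; field names as in the io_uring
 * uapi): a bundled, provided-buffer send is prepared from userspace roughly
 * as
 *
 *	sqe->opcode = IORING_OP_SEND;
 *	sqe->fd = sockfd;
 *	sqe->flags = IOSQE_BUFFER_SELECT;
 *	sqe->buf_group = group_id;
 *	sqe->ioprio = IORING_RECVSEND_BUNDLE;	(becomes sr->flags above)
 *	sqe->len = 0;				(0 = no per-send byte cap)
 *
 * io_send() below then selects one or more buffers from the group and pushes
 * them out with a single sock_sendmsg() call.
 */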
642 int io_send(struct io_kiocb *req, unsigned int issue_flags)
643 {
644 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
645 	struct io_async_msghdr *kmsg = req->async_data;
646 	struct io_br_sel sel = { };
647 	struct socket *sock;
648 	unsigned flags;
649 	int min_ret = 0;
650 	int ret;
651 
652 	sock = sock_from_file(req->file);
653 	if (unlikely(!sock))
654 		return -ENOTSOCK;
655 
656 	if (!(req->flags & REQ_F_POLLED) &&
657 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
658 		return -EAGAIN;
659 
660 	flags = sr->msg_flags;
661 	if (issue_flags & IO_URING_F_NONBLOCK)
662 		flags |= MSG_DONTWAIT;
663 
664 retry_bundle:
665 	sel.buf_list = NULL;
666 	if (io_do_buffer_select(req)) {
667 		ret = io_send_select_buffer(req, issue_flags, &sel, kmsg);
668 		if (ret)
669 			return ret;
670 	}
671 
672 	/*
673 	 * If MSG_WAITALL is set, or this is a bundle send, then we need
674 	 * the full amount. If just bundle is set, if we do a short send
675 	 * then we complete the bundle sequence rather than continue on.
676 	 */
677 	if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE)
678 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
679 
680 	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
681 	kmsg->msg.msg_flags = flags;
682 	ret = sock_sendmsg(sock, &kmsg->msg);
683 	if (ret < min_ret) {
684 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
685 			return -EAGAIN;
686 
687 		if (ret > 0 && io_net_retry(sock, flags)) {
688 			sr->len -= ret;
689 			sr->buf += ret;
690 			sr->done_io += ret;
691 			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
692 		}
693 		if (ret == -ERESTARTSYS)
694 			ret = -EINTR;
695 		req_set_fail(req);
696 	}
697 	if (ret >= 0)
698 		ret += sr->done_io;
699 	else if (sr->done_io)
700 		ret = sr->done_io;
701 
702 	sel.val = ret;
703 	if (!io_send_finish(req, kmsg, &sel))
704 		goto retry_bundle;
705 
706 	io_req_msg_cleanup(req, issue_flags);
707 	return sel.val;
708 }
709 
710 static int io_recvmsg_mshot_prep(struct io_kiocb *req,
711 				 struct io_async_msghdr *iomsg,
712 				 int namelen, size_t controllen)
713 {
714 	if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) ==
715 			  (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) {
716 		int hdr;
717 
718 		if (unlikely(namelen < 0))
719 			return -EOVERFLOW;
720 		if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
721 					namelen, &hdr))
722 			return -EOVERFLOW;
723 		if (check_add_overflow(hdr, controllen, &hdr))
724 			return -EOVERFLOW;
725 
726 		iomsg->namelen = namelen;
727 		iomsg->controllen = controllen;
728 		return 0;
729 	}
730 
731 	return 0;
732 }
733 
734 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
735 			       struct io_async_msghdr *iomsg)
736 {
737 	struct user_msghdr msg;
738 	int ret;
739 
740 	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
741 	if (unlikely(ret))
742 		return ret;
743 
744 	if (!(req->flags & REQ_F_BUFFER_SELECT)) {
745 		ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
746 					ITER_DEST);
747 		if (unlikely(ret))
748 			return ret;
749 	}
750 	return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
751 					msg.msg_controllen);
752 }
753 
754 static int io_recvmsg_prep_setup(struct io_kiocb *req)
755 {
756 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
757 	struct io_async_msghdr *kmsg;
758 
759 	kmsg = io_msg_alloc_async(req);
760 	if (unlikely(!kmsg))
761 		return -ENOMEM;
762 
763 	if (req->opcode == IORING_OP_RECV) {
764 		kmsg->msg.msg_name = NULL;
765 		kmsg->msg.msg_namelen = 0;
766 		kmsg->msg.msg_inq = 0;
767 		kmsg->msg.msg_control = NULL;
768 		kmsg->msg.msg_get_inq = 1;
769 		kmsg->msg.msg_controllen = 0;
770 		kmsg->msg.msg_iocb = NULL;
771 		kmsg->msg.msg_ubuf = NULL;
772 
773 		if (req->flags & REQ_F_BUFFER_SELECT)
774 			return 0;
775 		return import_ubuf(ITER_DEST, sr->buf, sr->len,
776 				   &kmsg->msg.msg_iter);
777 	}
778 
779 	return io_recvmsg_copy_hdr(req, kmsg);
780 }
781 
782 #define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \
783 			IORING_RECVSEND_BUNDLE)
784 
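/*
 * Editorial sketch (not part of this file; field names as in the io_uring
 * uapi): a multishot receive requires provided buffers and posts one CQE,
 * flagged IORING_CQE_F_MORE, per chunk received:
 *
 *	sqe->opcode = IORING_OP_RECV;
 *	sqe->fd = sockfd;
 *	sqe->flags = IOSQE_BUFFER_SELECT;
 *	sqe->buf_group = group_id;
 *	sqe->ioprio = IORING_RECV_MULTISHOT;
 *	sqe->len = 0;		(optional per-receive cap, 0 = none)
 *	sqe->optlen = 0;	(optional overall byte limit, 0 = unlimited)
 *
 * The terminal CQE (error, socket closed, or limit reached) is posted
 * without IORING_CQE_F_MORE.
 */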
785 int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
786 {
787 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
788 
789 	sr->done_io = 0;
790 
791 	if (unlikely(sqe->addr2))
792 		return -EINVAL;
793 
794 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
795 	sr->len = READ_ONCE(sqe->len);
796 	if (unlikely(sr->len < 0))
797 		return -EINVAL;
798 	sr->flags = READ_ONCE(sqe->ioprio);
799 	if (sr->flags & ~RECVMSG_FLAGS)
800 		return -EINVAL;
801 	sr->msg_flags = READ_ONCE(sqe->msg_flags);
802 	if (sr->msg_flags & MSG_DONTWAIT)
803 		req->flags |= REQ_F_NOWAIT;
804 	if (sr->msg_flags & MSG_ERRQUEUE)
805 		req->flags |= REQ_F_CLEAR_POLLIN;
806 	if (req->flags & REQ_F_BUFFER_SELECT)
807 		sr->buf_group = req->buf_index;
808 	sr->mshot_total_len = sr->mshot_len = 0;
809 	if (sr->flags & IORING_RECV_MULTISHOT) {
810 		if (!(req->flags & REQ_F_BUFFER_SELECT))
811 			return -EINVAL;
812 		if (sr->msg_flags & MSG_WAITALL)
813 			return -EINVAL;
814 		if (req->opcode == IORING_OP_RECV) {
815 			sr->mshot_len = sr->len;
816 			sr->mshot_total_len = READ_ONCE(sqe->optlen);
817 			if (sr->mshot_total_len)
818 				sr->flags |= IORING_RECV_MSHOT_LIM;
819 		} else if (sqe->optlen) {
820 			return -EINVAL;
821 		}
822 		req->flags |= REQ_F_APOLL_MULTISHOT;
823 	} else if (sqe->optlen) {
824 		return -EINVAL;
825 	}
826 
827 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
828 		if (req->opcode == IORING_OP_RECVMSG)
829 			return -EINVAL;
830 	}
831 
832 	if (io_is_compat(req->ctx))
833 		sr->msg_flags |= MSG_CMSG_COMPAT;
834 
835 	sr->nr_multishot_loops = 0;
836 	return io_recvmsg_prep_setup(req);
837 }
838 
839 /* bits to clear in old and inherit in new cflags on bundle retry */
840 #define CQE_F_MASK	(IORING_CQE_F_SOCK_NONEMPTY|IORING_CQE_F_MORE)
841 
842 /*
843  * Finishes io_recv and io_recvmsg.
844  *
845  * Returns true if it is actually finished, or false if it should run
846  * again (for multishot).
847  */
848 static inline bool io_recv_finish(struct io_kiocb *req,
849 				  struct io_async_msghdr *kmsg,
850 				  struct io_br_sel *sel, bool mshot_finished,
851 				  unsigned issue_flags)
852 {
853 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
854 	unsigned int cflags = 0;
855 
856 	if (kmsg->msg.msg_inq > 0)
857 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
858 
859 	if (sel->val > 0 && sr->flags & IORING_RECV_MSHOT_LIM) {
860 		/*
861 		 * If sr->mshot_total_len hits zero, the limit has been reached. Mark
862 		 * mshot as finished, and flag MSHOT_DONE as well to prevent
863 		 * a potential bundle from being retried.
864 		 */
865 		sr->mshot_total_len -= min_t(int, sel->val, sr->mshot_total_len);
866 		if (!sr->mshot_total_len) {
867 			sr->flags |= IORING_RECV_MSHOT_DONE;
868 			mshot_finished = true;
869 		}
870 	}
871 
872 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
873 		size_t this_ret = sel->val - sr->done_io;
874 
875 		cflags |= io_put_kbufs(req, this_ret, sel->buf_list, io_bundle_nbufs(kmsg, this_ret));
876 		if (sr->flags & IORING_RECV_RETRY)
877 			cflags = req->cqe.flags | (cflags & CQE_F_MASK);
878 		if (sr->mshot_len && sel->val >= sr->mshot_len)
879 			sr->flags |= IORING_RECV_MSHOT_CAP;
880 		/* bundle with no more immediate buffers, we're done */
881 		if (req->flags & REQ_F_BL_EMPTY)
882 			goto finish;
883 		/*
884 		 * If more is available AND it was a full transfer, retry and
885 		 * append to this one
886 		 */
887 		if (!(sr->flags & IORING_RECV_NO_RETRY) &&
888 		    kmsg->msg.msg_inq > 1 && this_ret > 0 &&
889 		    !iov_iter_count(&kmsg->msg.msg_iter)) {
890 			req->cqe.flags = cflags & ~CQE_F_MASK;
891 			sr->len = kmsg->msg.msg_inq;
892 			sr->done_io += this_ret;
893 			sr->flags |= IORING_RECV_RETRY;
894 			return false;
895 		}
896 	} else {
897 		cflags |= io_put_kbuf(req, sel->val, sel->buf_list);
898 	}
899 
900 	/*
901 	 * Fill CQE for this receive and see if we should keep trying to
902 	 * receive from this socket.
903 	 */
904 	if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
905 	    io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) {
906 		sel->val = IOU_RETRY;
907 		io_mshot_prep_retry(req, kmsg);
908 		/* Known not-empty or unknown state, retry */
909 		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
910 			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY &&
911 			    !(sr->flags & IORING_RECV_MSHOT_CAP)) {
912 				return false;
913 			}
914 			/* mshot retries exceeded, force a requeue */
915 			sr->nr_multishot_loops = 0;
916 			sr->flags &= ~IORING_RECV_MSHOT_CAP;
917 			if (issue_flags & IO_URING_F_MULTISHOT)
918 				sel->val = IOU_REQUEUE;
919 		}
920 		return true;
921 	}
922 
923 	/* Finish the request / stop multishot. */
924 finish:
925 	io_req_set_res(req, sel->val, cflags);
926 	sel->val = IOU_COMPLETE;
927 	io_req_msg_cleanup(req, issue_flags);
928 	return true;
929 }
930 
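/*
 * For multishot recvmsg the selected buffer is laid out as
 *
 *	struct io_uring_recvmsg_out | name (namelen) | control (controllen) | payload
 *
 * The helpers below carve that header out of the buffer up front and copy it
 * back to userspace once the receive completes.
 */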
931 static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
932 				     struct io_sr_msg *sr, void __user **buf,
933 				     size_t *len)
934 {
935 	unsigned long ubuf = (unsigned long) *buf;
936 	unsigned long hdr;
937 
938 	hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
939 		kmsg->controllen;
940 	if (*len < hdr)
941 		return -EFAULT;
942 
943 	if (kmsg->controllen) {
944 		unsigned long control = ubuf + hdr - kmsg->controllen;
945 
946 		kmsg->msg.msg_control_user = (void __user *) control;
947 		kmsg->msg.msg_controllen = kmsg->controllen;
948 	}
949 
950 	sr->buf = *buf; /* stash for later copy */
951 	*buf = (void __user *) (ubuf + hdr);
952 	kmsg->payloadlen = *len = *len - hdr;
953 	return 0;
954 }
955 
956 struct io_recvmsg_multishot_hdr {
957 	struct io_uring_recvmsg_out msg;
958 	struct sockaddr_storage addr;
959 };
960 
961 static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
962 				struct io_async_msghdr *kmsg,
963 				unsigned int flags, bool *finished)
964 {
965 	int err;
966 	int copy_len;
967 	struct io_recvmsg_multishot_hdr hdr;
968 
969 	if (kmsg->namelen)
970 		kmsg->msg.msg_name = &hdr.addr;
971 	kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
972 	kmsg->msg.msg_namelen = 0;
973 
974 	if (sock->file->f_flags & O_NONBLOCK)
975 		flags |= MSG_DONTWAIT;
976 
977 	err = sock_recvmsg(sock, &kmsg->msg, flags);
978 	*finished = err <= 0;
979 	if (err < 0)
980 		return err;
981 
982 	hdr.msg = (struct io_uring_recvmsg_out) {
983 		.controllen = kmsg->controllen - kmsg->msg.msg_controllen,
984 		.flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
985 	};
986 
987 	hdr.msg.payloadlen = err;
988 	if (err > kmsg->payloadlen)
989 		err = kmsg->payloadlen;
990 
991 	copy_len = sizeof(struct io_uring_recvmsg_out);
992 	if (kmsg->msg.msg_namelen > kmsg->namelen)
993 		copy_len += kmsg->namelen;
994 	else
995 		copy_len += kmsg->msg.msg_namelen;
996 
997 	/*
998 	 *      "fromlen shall refer to the value before truncation.."
999 	 *                      1003.1g
1000 	 */
1001 	hdr.msg.namelen = kmsg->msg.msg_namelen;
1002 
1003 	/* ensure that there is no gap between hdr and sockaddr_storage */
1004 	BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
1005 		     sizeof(struct io_uring_recvmsg_out));
1006 	if (copy_to_user(io->buf, &hdr, copy_len)) {
1007 		*finished = true;
1008 		return -EFAULT;
1009 	}
1010 
1011 	return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
1012 			kmsg->controllen + err;
1013 }
1014 
1015 int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
1016 {
1017 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1018 	struct io_async_msghdr *kmsg = req->async_data;
1019 	struct io_br_sel sel = { };
1020 	struct socket *sock;
1021 	unsigned flags;
1022 	int ret, min_ret = 0;
1023 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1024 	bool mshot_finished = true;
1025 
1026 	sock = sock_from_file(req->file);
1027 	if (unlikely(!sock))
1028 		return -ENOTSOCK;
1029 
1030 	if (!(req->flags & REQ_F_POLLED) &&
1031 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1032 		return -EAGAIN;
1033 
1034 	flags = sr->msg_flags;
1035 	if (force_nonblock)
1036 		flags |= MSG_DONTWAIT;
1037 
1038 retry_multishot:
1039 	sel.buf_list = NULL;
1040 	if (io_do_buffer_select(req)) {
1041 		size_t len = sr->len;
1042 
1043 		sel = io_buffer_select(req, &len, sr->buf_group, issue_flags);
1044 		if (!sel.addr)
1045 			return -ENOBUFS;
1046 
1047 		if (req->flags & REQ_F_APOLL_MULTISHOT) {
1048 			ret = io_recvmsg_prep_multishot(kmsg, sr, &sel.addr, &len);
1049 			if (ret) {
1050 				io_kbuf_recycle(req, sel.buf_list, issue_flags);
1051 				return ret;
1052 			}
1053 		}
1054 
1055 		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, sel.addr, len);
1056 	}
1057 
1058 	kmsg->msg.msg_get_inq = 1;
1059 	kmsg->msg.msg_inq = -1;
1060 	if (req->flags & REQ_F_APOLL_MULTISHOT) {
1061 		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
1062 					   &mshot_finished);
1063 	} else {
1064 		/* disable partial retry for recvmsg with cmsg attached */
1065 		if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
1066 			min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1067 
1068 		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
1069 					 kmsg->uaddr, flags);
1070 	}
1071 
1072 	if (ret < min_ret) {
1073 		if (ret == -EAGAIN && force_nonblock) {
1074 			io_kbuf_recycle(req, sel.buf_list, issue_flags);
1075 			return IOU_RETRY;
1076 		}
1077 		if (ret > 0 && io_net_retry(sock, flags)) {
1078 			sr->done_io += ret;
1079 			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
1080 		}
1081 		if (ret == -ERESTARTSYS)
1082 			ret = -EINTR;
1083 		req_set_fail(req);
1084 	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
1085 		req_set_fail(req);
1086 	}
1087 
1088 	if (ret > 0)
1089 		ret += sr->done_io;
1090 	else if (sr->done_io)
1091 		ret = sr->done_io;
1092 	else
1093 		io_kbuf_recycle(req, sel.buf_list, issue_flags);
1094 
1095 	sel.val = ret;
1096 	if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags))
1097 		goto retry_multishot;
1098 
1099 	return sel.val;
1100 }
1101 
1102 static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
1103 			      struct io_br_sel *sel, unsigned int issue_flags)
1104 {
1105 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1106 	int ret;
1107 
1108 	/*
1109 	 * If the ring isn't locked, then don't use the peek interface
1110 	 * to grab multiple buffers as we will lock/unlock between
1111 	 * this selection and posting the buffers.
1112 	 */
1113 	if (!(issue_flags & IO_URING_F_UNLOCKED) &&
1114 	    sr->flags & IORING_RECVSEND_BUNDLE) {
1115 		struct buf_sel_arg arg = {
1116 			.iovs = &kmsg->fast_iov,
1117 			.nr_iovs = 1,
1118 			.mode = KBUF_MODE_EXPAND,
1119 			.buf_group = sr->buf_group,
1120 		};
1121 
1122 		if (kmsg->vec.iovec) {
1123 			arg.nr_iovs = kmsg->vec.nr;
1124 			arg.iovs = kmsg->vec.iovec;
1125 			arg.mode |= KBUF_MODE_FREE;
1126 		}
1127 
1128 		if (sel->val)
1129 			arg.max_len = sel->val;
1130 		else if (kmsg->msg.msg_inq > 1)
1131 			arg.max_len = min_not_zero(sel->val, (ssize_t) kmsg->msg.msg_inq);
1132 
1133 		/* if mshot limited, ensure we don't go over */
1134 		if (sr->flags & IORING_RECV_MSHOT_LIM)
1135 			arg.max_len = min_not_zero(arg.max_len, sr->mshot_total_len);
1136 		ret = io_buffers_peek(req, &arg, sel);
1137 		if (unlikely(ret < 0))
1138 			return ret;
1139 
1140 		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
1141 			kmsg->vec.nr = ret;
1142 			kmsg->vec.iovec = arg.iovs;
1143 			req->flags |= REQ_F_NEED_CLEANUP;
1144 		}
1145 		if (arg.partial_map)
1146 			sr->flags |= IORING_RECV_PARTIAL_MAP;
1147 
1148 		/* special case 1 vec, can be a fast path */
1149 		if (ret == 1) {
1150 			sr->buf = arg.iovs[0].iov_base;
1151 			sr->len = arg.iovs[0].iov_len;
1152 			goto map_ubuf;
1153 		}
1154 		iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
1155 				arg.out_len);
1156 	} else {
1157 		size_t len = sel->val;
1158 
1159 		*sel = io_buffer_select(req, &len, sr->buf_group, issue_flags);
1160 		if (!sel->addr)
1161 			return -ENOBUFS;
1162 		sr->buf = sel->addr;
1163 		sr->len = len;
1164 map_ubuf:
1165 		ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
1166 				  &kmsg->msg.msg_iter);
1167 		if (unlikely(ret))
1168 			return ret;
1169 	}
1170 
1171 	return 0;
1172 }
1173 
1174 int io_recv(struct io_kiocb *req, unsigned int issue_flags)
1175 {
1176 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1177 	struct io_async_msghdr *kmsg = req->async_data;
1178 	struct io_br_sel sel;
1179 	struct socket *sock;
1180 	unsigned flags;
1181 	int ret, min_ret = 0;
1182 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1183 	bool mshot_finished;
1184 
1185 	if (!(req->flags & REQ_F_POLLED) &&
1186 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1187 		return -EAGAIN;
1188 
1189 	sock = sock_from_file(req->file);
1190 	if (unlikely(!sock))
1191 		return -ENOTSOCK;
1192 
1193 	flags = sr->msg_flags;
1194 	if (force_nonblock)
1195 		flags |= MSG_DONTWAIT;
1196 
1197 retry_multishot:
1198 	sel.buf_list = NULL;
1199 	if (io_do_buffer_select(req)) {
1200 		sel.val = sr->len;
1201 		ret = io_recv_buf_select(req, kmsg, &sel, issue_flags);
1202 		if (unlikely(ret < 0)) {
1203 			kmsg->msg.msg_inq = -1;
1204 			goto out_free;
1205 		}
1206 		sr->buf = NULL;
1207 	}
1208 
1209 	kmsg->msg.msg_flags = 0;
1210 	kmsg->msg.msg_inq = -1;
1211 
1212 	if (flags & MSG_WAITALL)
1213 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1214 
1215 	ret = sock_recvmsg(sock, &kmsg->msg, flags);
1216 	if (ret < min_ret) {
1217 		if (ret == -EAGAIN && force_nonblock) {
1218 			io_kbuf_recycle(req, sel.buf_list, issue_flags);
1219 			return IOU_RETRY;
1220 		}
1221 		if (ret > 0 && io_net_retry(sock, flags)) {
1222 			sr->len -= ret;
1223 			sr->buf += ret;
1224 			sr->done_io += ret;
1225 			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
1226 		}
1227 		if (ret == -ERESTARTSYS)
1228 			ret = -EINTR;
1229 		req_set_fail(req);
1230 	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
1231 out_free:
1232 		req_set_fail(req);
1233 	}
1234 
1235 	mshot_finished = ret <= 0;
1236 	if (ret > 0)
1237 		ret += sr->done_io;
1238 	else if (sr->done_io)
1239 		ret = sr->done_io;
1240 	else
1241 		io_kbuf_recycle(req, sel.buf_list, issue_flags);
1242 
1243 	sel.val = ret;
1244 	if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags))
1245 		goto retry_multishot;
1246 
1247 	return sel.val;
1248 }
1249 
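/*
 * Zero-copy receive: payload pages are delivered through a previously
 * registered zcrx interface queue (see zcrx.h) rather than copied into a
 * user buffer, so no buffer selection happens here. The SQE's zcrx_ifq_idx
 * must name a registered queue, and the request is always multishot.
 */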
1250 int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1251 {
1252 	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
1253 	unsigned ifq_idx;
1254 
1255 	if (unlikely(sqe->addr2 || sqe->addr || sqe->addr3))
1256 		return -EINVAL;
1257 
1258 	ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx);
1259 	zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx);
1260 	if (!zc->ifq)
1261 		return -EINVAL;
1262 
1263 	zc->len = READ_ONCE(sqe->len);
1264 	zc->flags = READ_ONCE(sqe->ioprio);
1265 	if (READ_ONCE(sqe->msg_flags))
1266 		return -EINVAL;
1267 	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT))
1268 		return -EINVAL;
1269 	/* multishot required */
1270 	if (!(zc->flags & IORING_RECV_MULTISHOT))
1271 		return -EINVAL;
1272 	/* All data completions are posted as aux CQEs. */
1273 	req->flags |= REQ_F_APOLL_MULTISHOT;
1274 
1275 	return 0;
1276 }
1277 
1278 int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
1279 {
1280 	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
1281 	struct socket *sock;
1282 	unsigned int len;
1283 	int ret;
1284 
1285 	if (!(req->flags & REQ_F_POLLED) &&
1286 	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
1287 		return -EAGAIN;
1288 
1289 	sock = sock_from_file(req->file);
1290 	if (unlikely(!sock))
1291 		return -ENOTSOCK;
1292 
1293 	len = zc->len;
1294 	ret = io_zcrx_recv(req, zc->ifq, sock, 0, issue_flags, &zc->len);
1295 	if (len && zc->len == 0) {
1296 		io_req_set_res(req, 0, 0);
1297 
1298 		return IOU_COMPLETE;
1299 	}
1300 	if (unlikely(ret <= 0) && ret != -EAGAIN) {
1301 		if (ret == -ERESTARTSYS)
1302 			ret = -EINTR;
1303 		if (ret == IOU_REQUEUE)
1304 			return IOU_REQUEUE;
1305 
1306 		req_set_fail(req);
1307 		io_req_set_res(req, ret, 0);
1308 		return IOU_COMPLETE;
1309 	}
1310 	return IOU_RETRY;
1311 }
1312 
1313 void io_send_zc_cleanup(struct io_kiocb *req)
1314 {
1315 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1316 	struct io_async_msghdr *io = req->async_data;
1317 
1318 	if (req_has_async_data(req))
1319 		io_netmsg_iovec_free(io);
1320 	if (zc->notif) {
1321 		io_notif_flush(zc->notif);
1322 		zc->notif = NULL;
1323 	}
1324 }
1325 
1326 #define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
1327 #define IO_ZC_FLAGS_VALID  (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE | \
1328 				IORING_SEND_VECTORIZED)
1329 
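/*
 * Zero-copy sends complete in two steps: the request posts a CQE carrying
 * the send result with IORING_CQE_F_MORE set, and the notification allocated
 * below posts a second CQE with IORING_CQE_F_NOTIF (same user_data) once the
 * kernel no longer needs the user pages.
 */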
1330 int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1331 {
1332 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1333 	struct io_ring_ctx *ctx = req->ctx;
1334 	struct io_async_msghdr *iomsg;
1335 	struct io_kiocb *notif;
1336 	int ret;
1337 
1338 	zc->done_io = 0;
1339 
1340 	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
1341 		return -EINVAL;
1342 	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
1343 	if (req->flags & REQ_F_CQE_SKIP)
1344 		return -EINVAL;
1345 
1346 	notif = zc->notif = io_alloc_notif(ctx);
1347 	if (!notif)
1348 		return -ENOMEM;
1349 	notif->cqe.user_data = req->cqe.user_data;
1350 	notif->cqe.res = 0;
1351 	notif->cqe.flags = IORING_CQE_F_NOTIF;
1352 	req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY;
1353 
1354 	zc->flags = READ_ONCE(sqe->ioprio);
1355 	if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
1356 		if (zc->flags & ~IO_ZC_FLAGS_VALID)
1357 			return -EINVAL;
1358 		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
1359 			struct io_notif_data *nd = io_notif_to_data(notif);
1360 
1361 			nd->zc_report = true;
1362 			nd->zc_used = false;
1363 			nd->zc_copied = false;
1364 		}
1365 	}
1366 
1367 	zc->len = READ_ONCE(sqe->len);
1368 	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
1369 	req->buf_index = READ_ONCE(sqe->buf_index);
1370 	if (zc->msg_flags & MSG_DONTWAIT)
1371 		req->flags |= REQ_F_NOWAIT;
1372 
1373 	if (io_is_compat(req->ctx))
1374 		zc->msg_flags |= MSG_CMSG_COMPAT;
1375 
1376 	iomsg = io_msg_alloc_async(req);
1377 	if (unlikely(!iomsg))
1378 		return -ENOMEM;
1379 
1380 	if (req->opcode == IORING_OP_SEND_ZC) {
1381 		ret = io_send_setup(req, sqe);
1382 	} else {
1383 		if (unlikely(sqe->addr2 || sqe->file_index))
1384 			return -EINVAL;
1385 		ret = io_sendmsg_setup(req, sqe);
1386 	}
1387 	if (unlikely(ret))
1388 		return ret;
1389 
1390 	if (!(zc->flags & IORING_RECVSEND_FIXED_BUF)) {
1391 		iomsg->msg.sg_from_iter = io_sg_from_iter_iovec;
1392 		return io_notif_account_mem(zc->notif, iomsg->msg.msg_iter.count);
1393 	}
1394 	iomsg->msg.sg_from_iter = io_sg_from_iter;
1395 	return 0;
1396 }
1397 
1398 static int io_sg_from_iter_iovec(struct sk_buff *skb,
1399 				 struct iov_iter *from, size_t length)
1400 {
1401 	skb_zcopy_downgrade_managed(skb);
1402 	return zerocopy_fill_skb_from_iter(skb, from, length);
1403 }
1404 
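/*
 * Zero-copy fill for bvec-backed (registered buffer) iterators: reference
 * the source pages directly as skb frags instead of copying, falling back
 * to a copy when the skb already holds frags it doesn't manage.
 */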
1405 static int io_sg_from_iter(struct sk_buff *skb,
1406 			   struct iov_iter *from, size_t length)
1407 {
1408 	struct skb_shared_info *shinfo = skb_shinfo(skb);
1409 	int frag = shinfo->nr_frags;
1410 	int ret = 0;
1411 	struct bvec_iter bi;
1412 	ssize_t copied = 0;
1413 	unsigned long truesize = 0;
1414 
1415 	if (!frag)
1416 		shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
1417 	else if (unlikely(!skb_zcopy_managed(skb)))
1418 		return zerocopy_fill_skb_from_iter(skb, from, length);
1419 
1420 	bi.bi_size = min(from->count, length);
1421 	bi.bi_bvec_done = from->iov_offset;
1422 	bi.bi_idx = 0;
1423 
1424 	while (bi.bi_size && frag < MAX_SKB_FRAGS) {
1425 		struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);
1426 
1427 		copied += v.bv_len;
1428 		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
1429 		__skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
1430 					   v.bv_offset, v.bv_len);
1431 		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
1432 	}
1433 	if (bi.bi_size)
1434 		ret = -EMSGSIZE;
1435 
1436 	shinfo->nr_frags = frag;
1437 	from->bvec += bi.bi_idx;
1438 	from->nr_segs -= bi.bi_idx;
1439 	from->count -= copied;
1440 	from->iov_offset = bi.bi_bvec_done;
1441 
1442 	skb->data_len += copied;
1443 	skb->len += copied;
1444 	skb->truesize += truesize;
1445 	return ret;
1446 }
1447 
1448 static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
1449 {
1450 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1451 	struct io_async_msghdr *kmsg = req->async_data;
1452 
1453 	WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF));
1454 
1455 	sr->notif->buf_index = req->buf_index;
1456 	return io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter,
1457 				(u64)(uintptr_t)sr->buf, sr->len,
1458 				ITER_SOURCE, issue_flags);
1459 }
1460 
1461 int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
1462 {
1463 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1464 	struct io_async_msghdr *kmsg = req->async_data;
1465 	struct socket *sock;
1466 	unsigned msg_flags;
1467 	int ret, min_ret = 0;
1468 
1469 	sock = sock_from_file(req->file);
1470 	if (unlikely(!sock))
1471 		return -ENOTSOCK;
1472 	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
1473 		return -EOPNOTSUPP;
1474 
1475 	if (!(req->flags & REQ_F_POLLED) &&
1476 	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
1477 		return -EAGAIN;
1478 
1479 	if (req->flags & REQ_F_IMPORT_BUFFER) {
1480 		req->flags &= ~REQ_F_IMPORT_BUFFER;
1481 		ret = io_send_zc_import(req, issue_flags);
1482 		if (unlikely(ret))
1483 			return ret;
1484 	}
1485 
1486 	msg_flags = zc->msg_flags;
1487 	if (issue_flags & IO_URING_F_NONBLOCK)
1488 		msg_flags |= MSG_DONTWAIT;
1489 	if (msg_flags & MSG_WAITALL)
1490 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1491 	msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
1492 
1493 	kmsg->msg.msg_flags = msg_flags;
1494 	kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
1495 	ret = sock_sendmsg(sock, &kmsg->msg);
1496 
1497 	if (unlikely(ret < min_ret)) {
1498 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1499 			return -EAGAIN;
1500 
1501 		if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) {
1502 			zc->done_io += ret;
1503 			return -EAGAIN;
1504 		}
1505 		if (ret == -ERESTARTSYS)
1506 			ret = -EINTR;
1507 		req_set_fail(req);
1508 	}
1509 
1510 	if (ret >= 0)
1511 		ret += zc->done_io;
1512 	else if (zc->done_io)
1513 		ret = zc->done_io;
1514 
1515 	/*
1516 	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
1517 	 * flushing notif to io_send_zc_cleanup()
1518 	 */
1519 	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1520 		io_notif_flush(zc->notif);
1521 		zc->notif = NULL;
1522 		io_req_msg_cleanup(req, 0);
1523 	}
1524 	io_req_set_res(req, ret, IORING_CQE_F_MORE);
1525 	return IOU_COMPLETE;
1526 }
1527 
1528 int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
1529 {
1530 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1531 	struct io_async_msghdr *kmsg = req->async_data;
1532 	struct socket *sock;
1533 	unsigned flags;
1534 	int ret, min_ret = 0;
1535 
1536 	if (req->flags & REQ_F_IMPORT_BUFFER) {
1537 		unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
1538 		int ret;
1539 
1540 		sr->notif->buf_index = req->buf_index;
1541 		ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter,
1542 					sr->notif, &kmsg->vec, uvec_segs,
1543 					issue_flags);
1544 		if (unlikely(ret))
1545 			return ret;
1546 		req->flags &= ~REQ_F_IMPORT_BUFFER;
1547 	}
1548 
1549 	sock = sock_from_file(req->file);
1550 	if (unlikely(!sock))
1551 		return -ENOTSOCK;
1552 	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
1553 		return -EOPNOTSUPP;
1554 
1555 	if (!(req->flags & REQ_F_POLLED) &&
1556 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1557 		return -EAGAIN;
1558 
1559 	flags = sr->msg_flags;
1560 	if (issue_flags & IO_URING_F_NONBLOCK)
1561 		flags |= MSG_DONTWAIT;
1562 	if (flags & MSG_WAITALL)
1563 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1564 
1565 	kmsg->msg.msg_control_user = sr->msg_control;
1566 	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
1567 	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
1568 
1569 	if (unlikely(ret < min_ret)) {
1570 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1571 			return -EAGAIN;
1572 
1573 		if (ret > 0 && io_net_retry(sock, flags)) {
1574 			sr->done_io += ret;
1575 			return -EAGAIN;
1576 		}
1577 		if (ret == -ERESTARTSYS)
1578 			ret = -EINTR;
1579 		req_set_fail(req);
1580 	}
1581 
1582 	if (ret >= 0)
1583 		ret += sr->done_io;
1584 	else if (sr->done_io)
1585 		ret = sr->done_io;
1586 
1587 	/*
1588 	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
1589 	 * flushing notif to io_send_zc_cleanup()
1590 	 */
1591 	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1592 		io_notif_flush(sr->notif);
1593 		sr->notif = NULL;
1594 		io_req_msg_cleanup(req, 0);
1595 	}
1596 	io_req_set_res(req, ret, IORING_CQE_F_MORE);
1597 	return IOU_COMPLETE;
1598 }
1599 
1600 void io_sendrecv_fail(struct io_kiocb *req)
1601 {
1602 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1603 
1604 	if (sr->done_io)
1605 		req->cqe.res = sr->done_io;
1606 
1607 	if ((req->flags & REQ_F_NEED_CLEANUP) &&
1608 	    (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC))
1609 		req->cqe.flags |= IORING_CQE_F_MORE;
1610 }
1611 
1612 #define ACCEPT_FLAGS	(IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \
1613 			 IORING_ACCEPT_POLL_FIRST)
1614 
1615 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1616 {
1617 	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1618 
1619 	if (sqe->len || sqe->buf_index)
1620 		return -EINVAL;
1621 
1622 	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1623 	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
1624 	accept->flags = READ_ONCE(sqe->accept_flags);
1625 	accept->nofile = rlimit(RLIMIT_NOFILE);
1626 	accept->iou_flags = READ_ONCE(sqe->ioprio);
1627 	if (accept->iou_flags & ~ACCEPT_FLAGS)
1628 		return -EINVAL;
1629 
1630 	accept->file_slot = READ_ONCE(sqe->file_index);
1631 	if (accept->file_slot) {
1632 		if (accept->flags & SOCK_CLOEXEC)
1633 			return -EINVAL;
1634 		if (accept->iou_flags & IORING_ACCEPT_MULTISHOT &&
1635 		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
1636 			return -EINVAL;
1637 	}
1638 	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1639 		return -EINVAL;
1640 	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
1641 		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1642 	if (accept->iou_flags & IORING_ACCEPT_MULTISHOT)
1643 		req->flags |= REQ_F_APOLL_MULTISHOT;
1644 	if (accept->iou_flags & IORING_ACCEPT_DONTWAIT)
1645 		req->flags |= REQ_F_NOWAIT;
1646 	return 0;
1647 }
1648 
1649 int io_accept(struct io_kiocb *req, unsigned int issue_flags)
1650 {
1651 	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1652 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1653 	bool fixed = !!accept->file_slot;
1654 	struct proto_accept_arg arg = {
1655 		.flags = force_nonblock ? O_NONBLOCK : 0,
1656 	};
1657 	struct file *file;
1658 	unsigned cflags;
1659 	int ret, fd;
1660 
1661 	if (!(req->flags & REQ_F_POLLED) &&
1662 	    accept->iou_flags & IORING_ACCEPT_POLL_FIRST)
1663 		return -EAGAIN;
1664 
1665 retry:
1666 	if (!fixed) {
1667 		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
1668 		if (unlikely(fd < 0))
1669 			return fd;
1670 	}
1671 	arg.err = 0;
1672 	arg.is_empty = -1;
1673 	file = do_accept(req->file, &arg, accept->addr, accept->addr_len,
1674 			 accept->flags);
1675 	if (IS_ERR(file)) {
1676 		if (!fixed)
1677 			put_unused_fd(fd);
1678 		ret = PTR_ERR(file);
1679 		if (ret == -EAGAIN && force_nonblock &&
1680 		    !(accept->iou_flags & IORING_ACCEPT_DONTWAIT))
1681 			return IOU_RETRY;
1682 
1683 		if (ret == -ERESTARTSYS)
1684 			ret = -EINTR;
1685 	} else if (!fixed) {
1686 		fd_install(fd, file);
1687 		ret = fd;
1688 	} else {
1689 		ret = io_fixed_fd_install(req, issue_flags, file,
1690 						accept->file_slot);
1691 	}
1692 
1693 	cflags = 0;
1694 	if (!arg.is_empty)
1695 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
1696 
1697 	if (ret >= 0 && (req->flags & REQ_F_APOLL_MULTISHOT) &&
1698 	    io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
1699 		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1)
1700 			goto retry;
1701 		return IOU_RETRY;
1702 	}
1703 
1704 	io_req_set_res(req, ret, cflags);
1705 	if (ret < 0)
1706 		req_set_fail(req);
1707 	return IOU_COMPLETE;
1708 }
1709 
1710 void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req)
1711 {
1712 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1713 
1714 	bctx->socket.family = sock->domain;
1715 	bctx->socket.type = sock->type;
1716 	bctx->socket.protocol = sock->protocol;
1717 }
1718 
1719 int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1720 {
1721 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1722 
1723 	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
1724 		return -EINVAL;
1725 
1726 	sock->domain = READ_ONCE(sqe->fd);
1727 	sock->type = READ_ONCE(sqe->off);
1728 	sock->protocol = READ_ONCE(sqe->len);
1729 	sock->file_slot = READ_ONCE(sqe->file_index);
1730 	sock->nofile = rlimit(RLIMIT_NOFILE);
1731 
1732 	sock->flags = sock->type & ~SOCK_TYPE_MASK;
1733 	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
1734 		return -EINVAL;
1735 	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1736 		return -EINVAL;
1737 	return 0;
1738 }
1739 
1740 int io_socket(struct io_kiocb *req, unsigned int issue_flags)
1741 {
1742 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1743 	bool fixed = !!sock->file_slot;
1744 	struct file *file;
1745 	int ret, fd;
1746 
1747 	if (!fixed) {
1748 		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
1749 		if (unlikely(fd < 0))
1750 			return fd;
1751 	}
1752 	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
1753 	if (IS_ERR(file)) {
1754 		if (!fixed)
1755 			put_unused_fd(fd);
1756 		ret = PTR_ERR(file);
1757 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1758 			return -EAGAIN;
1759 		if (ret == -ERESTARTSYS)
1760 			ret = -EINTR;
1761 		req_set_fail(req);
1762 	} else if (!fixed) {
1763 		fd_install(fd, file);
1764 		ret = fd;
1765 	} else {
1766 		ret = io_fixed_fd_install(req, issue_flags, file,
1767 					    sock->file_slot);
1768 	}
1769 	io_req_set_res(req, ret, 0);
1770 	return IOU_COMPLETE;
1771 }
1772 
1773 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1774 {
1775 	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
1776 	struct io_async_msghdr *io;
1777 
1778 	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
1779 		return -EINVAL;
1780 
1781 	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1782 	conn->addr_len =  READ_ONCE(sqe->addr2);
1783 	conn->in_progress = conn->seen_econnaborted = false;
1784 
1785 	io = io_msg_alloc_async(req);
1786 	if (unlikely(!io))
1787 		return -ENOMEM;
1788 
1789 	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr);
1790 }
1791 
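/*
 * Connect may span several invocations: a nonblocking attempt that returns
 * -EINPROGRESS is marked via ->in_progress, and the next invocation polls
 * the file for EPOLLERR so a failed connect is completed with sock_error()
 * instead of being retried indefinitely.
 */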
1792 int io_connect(struct io_kiocb *req, unsigned int issue_flags)
1793 {
1794 	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
1795 	struct io_async_msghdr *io = req->async_data;
1796 	unsigned file_flags;
1797 	int ret;
1798 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1799 
1800 	if (connect->in_progress) {
1801 		struct poll_table_struct pt = { ._key = EPOLLERR };
1802 
1803 		if (vfs_poll(req->file, &pt) & EPOLLERR)
1804 			goto get_sock_err;
1805 	}
1806 
1807 	file_flags = force_nonblock ? O_NONBLOCK : 0;
1808 
1809 	ret = __sys_connect_file(req->file, &io->addr, connect->addr_len,
1810 				 file_flags);
1811 	if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
1812 	    && force_nonblock) {
1813 		if (ret == -EINPROGRESS) {
1814 			connect->in_progress = true;
1815 		} else if (ret == -ECONNABORTED) {
1816 			if (connect->seen_econnaborted)
1817 				goto out;
1818 			connect->seen_econnaborted = true;
1819 		}
1820 		return -EAGAIN;
1821 	}
1822 	if (connect->in_progress) {
1823 		/*
1824 		 * At least bluetooth will return -EBADFD on a re-connect
1825 		 * attempt, and it's (supposedly) also valid to get -EISCONN
1826 		 * which means the previous result is good. For both of these,
1827 		 * grab the sock_error() and use that for the completion.
1828 		 */
1829 		if (ret == -EBADFD || ret == -EISCONN) {
1830 get_sock_err:
1831 			ret = sock_error(sock_from_file(req->file)->sk);
1832 		}
1833 	}
1834 	if (ret == -ERESTARTSYS)
1835 		ret = -EINTR;
1836 out:
1837 	if (ret < 0)
1838 		req_set_fail(req);
1839 	io_req_msg_cleanup(req, issue_flags);
1840 	io_req_set_res(req, ret, 0);
1841 	return IOU_COMPLETE;
1842 }
1843 
1844 int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1845 {
1846 	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
1847 	struct sockaddr __user *uaddr;
1848 	struct io_async_msghdr *io;
1849 
1850 	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
1851 		return -EINVAL;
1852 
1853 	uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1854 	bind->addr_len =  READ_ONCE(sqe->addr2);
1855 
1856 	io = io_msg_alloc_async(req);
1857 	if (unlikely(!io))
1858 		return -ENOMEM;
1859 	return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr);
1860 }
1861 
1862 int io_bind(struct io_kiocb *req, unsigned int issue_flags)
1863 {
1864 	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
1865 	struct io_async_msghdr *io = req->async_data;
1866 	struct socket *sock;
1867 	int ret;
1868 
1869 	sock = sock_from_file(req->file);
1870 	if (unlikely(!sock))
1871 		return -ENOTSOCK;
1872 
1873 	ret = __sys_bind_socket(sock, &io->addr, bind->addr_len);
1874 	if (ret < 0)
1875 		req_set_fail(req);
1876 	io_req_set_res(req, ret, 0);
1877 	return 0;
1878 }
1879 
1880 int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1881 {
1882 	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
1883 
1884 	if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2)
1885 		return -EINVAL;
1886 
1887 	listen->backlog = READ_ONCE(sqe->len);
1888 	return 0;
1889 }
1890 
1891 int io_listen(struct io_kiocb *req, unsigned int issue_flags)
1892 {
1893 	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
1894 	struct socket *sock;
1895 	int ret;
1896 
1897 	sock = sock_from_file(req->file);
1898 	if (unlikely(!sock))
1899 		return -ENOTSOCK;
1900 
1901 	ret = __sys_listen_socket(sock, listen->backlog);
1902 	if (ret < 0)
1903 		req_set_fail(req);
1904 	io_req_set_res(req, ret, 0);
1905 	return 0;
1906 }
1907 
1908 void io_netmsg_cache_free(const void *entry)
1909 {
1910 	struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
1911 
1912 	io_vec_free(&kmsg->vec);
1913 	kfree(kmsg);
1914 }
1915