xref: /linux/io_uring/net.c (revision 9b40ba14edcdf70240af8114092a76f75f070774)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/kernel.h>
3 #include <linux/errno.h>
4 #include <linux/file.h>
5 #include <linux/slab.h>
6 #include <linux/net.h>
7 #include <linux/un.h>
8 #include <linux/compat.h>
9 #include <net/compat.h>
10 #include <linux/io_uring.h>
11 
12 #include <uapi/linux/io_uring.h>
13 
14 #include "filetable.h"
15 #include "io_uring.h"
16 #include "kbuf.h"
17 #include "alloc_cache.h"
18 #include "net.h"
19 #include "notif.h"
20 #include "rsrc.h"
21 #include "zcrx.h"
22 
/* Per-request command data for the shutdown operation. */
struct io_shutdown {
	struct file	*file;
	/* taken from sqe->len at prep time, passed to __sys_shutdown_sock() */
	int		how;
};
27 
28 struct io_accept {
29 	struct file			*file;
30 	struct sockaddr __user		*addr;
31 	int __user			*addr_len;
32 	int				flags;
33 	int				iou_flags;
34 	u32				file_slot;
35 	unsigned long			nofile;
36 };
37 
38 struct io_socket {
39 	struct file			*file;
40 	int				domain;
41 	int				type;
42 	int				protocol;
43 	int				flags;
44 	u32				file_slot;
45 	unsigned long			nofile;
46 };
47 
48 struct io_connect {
49 	struct file			*file;
50 	struct sockaddr __user		*addr;
51 	int				addr_len;
52 	bool				in_progress;
53 	bool				seen_econnaborted;
54 };
55 
/*
 * Per-request command data for bind. The users of this struct are outside
 * the visible part of this file.
 */
struct io_bind {
	struct file	*file;
	int		addr_len;
};
60 
/*
 * Per-request command data for listen. The users of this struct are outside
 * the visible part of this file.
 */
struct io_listen {
	struct file	*file;
	int		backlog;
};
65 
66 struct io_sr_msg {
67 	struct file			*file;
68 	union {
69 		struct compat_msghdr __user	*umsg_compat;
70 		struct user_msghdr __user	*umsg;
71 		void __user			*buf;
72 	};
73 	int				len;
74 	unsigned			done_io;
75 	unsigned			msg_flags;
76 	unsigned			nr_multishot_loops;
77 	u16				flags;
78 	/* initialised and used only by !msg send variants */
79 	u16				buf_group;
80 	/* per-invocation mshot limit */
81 	unsigned			mshot_len;
82 	/* overall mshot byte limit */
83 	unsigned			mshot_total_len;
84 	void __user			*msg_control;
85 	/* used only for send zerocopy */
86 	struct io_kiocb 		*notif;
87 };
88 
/*
 * The UAPI flags arrive through sqe->ioprio and occupy the lower 8 bits.
 * The upper 8 bits are used for the internal state below.
 */
enum sr_retry_flags {
	IORING_RECV_MSHOT_DONE	= (1U << 11),
	IORING_RECV_MSHOT_LIM	= (1U << 12),
	IORING_RECV_MSHOT_CAP	= (1U << 13),
	IORING_RECV_PARTIAL_MAP	= (1U << 14),
	IORING_RECV_RETRY	= (1U << 15),

	/* bits io_mshot_prep_retry() clears before the next attempt */
	IORING_RECV_RETRY_CLEAR	= IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP,
	/* if any of these is set, io_recv_finish() will not retry a bundle */
	IORING_RECV_NO_RETRY	= IORING_RECV_RETRY_CLEAR |
				  IORING_RECV_MSHOT_CAP | IORING_RECV_MSHOT_DONE,
};
104 
/*
 * Upper bound on back-to-back receive attempts while more data is pending.
 * Once it is exceeded, the request goes to the back of the queue and is
 * retried from there, which keeps flooding clients from starving others.
 */
#define MULTISHOT_MAX_RETRY	32
111 
112 struct io_recvzc {
113 	struct file			*file;
114 	u16				flags;
115 	u32				len;
116 	struct io_zcrx_ifq		*ifq;
117 };
118 
119 static int io_sg_from_iter_iovec(struct sk_buff *skb,
120 				 struct iov_iter *from, size_t length);
121 static int io_sg_from_iter(struct sk_buff *skb,
122 			   struct iov_iter *from, size_t length);
123 
124 int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
125 {
126 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
127 
128 	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
129 		     sqe->buf_index || sqe->splice_fd_in))
130 		return -EINVAL;
131 
132 	shutdown->how = READ_ONCE(sqe->len);
133 	req->flags |= REQ_F_FORCE_ASYNC;
134 	return 0;
135 }
136 
137 int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
138 {
139 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
140 	struct socket *sock;
141 	int ret;
142 
143 	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
144 
145 	sock = sock_from_file(req->file);
146 	if (unlikely(!sock))
147 		return -ENOTSOCK;
148 
149 	ret = __sys_shutdown_sock(sock, shutdown->how);
150 	io_req_set_res(req, ret, 0);
151 	return IOU_COMPLETE;
152 }
153 
154 static bool io_net_retry(struct socket *sock, int flags)
155 {
156 	if (!(flags & MSG_WAITALL))
157 		return false;
158 	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
159 }
160 
161 static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
162 {
163 	if (kmsg->vec.iovec)
164 		io_vec_free(&kmsg->vec);
165 }
166 
167 static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
168 {
169 	struct io_async_msghdr *hdr = req->async_data;
170 
171 	/* can't recycle, ensure we free the iovec if we have one */
172 	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
173 		io_netmsg_iovec_free(hdr);
174 		return;
175 	}
176 
177 	/* Let normal cleanup path reap it if we fail adding to the cache */
178 	io_alloc_cache_vec_kasan(&hdr->vec);
179 	if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP)
180 		io_vec_free(&hdr->vec);
181 
182 	if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr))
183 		io_req_async_data_clear(req, REQ_F_NEED_CLEANUP);
184 }
185 
186 static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
187 {
188 	struct io_ring_ctx *ctx = req->ctx;
189 	struct io_async_msghdr *hdr;
190 
191 	hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req);
192 	if (!hdr)
193 		return NULL;
194 
195 	/* If the async data was cached, we might have an iov cached inside. */
196 	if (hdr->vec.iovec)
197 		req->flags |= REQ_F_NEED_CLEANUP;
198 	return hdr;
199 }
200 
201 static inline void io_mshot_prep_retry(struct io_kiocb *req,
202 				       struct io_async_msghdr *kmsg)
203 {
204 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
205 
206 	req->flags &= ~REQ_F_BL_EMPTY;
207 	sr->done_io = 0;
208 	sr->flags &= ~IORING_RECV_RETRY_CLEAR;
209 	sr->len = sr->mshot_len;
210 }
211 
212 static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg,
213 			     const struct iovec __user *uiov, unsigned uvec_seg,
214 			     int ddir)
215 {
216 	struct iovec *iov;
217 	int ret, nr_segs;
218 
219 	if (iomsg->vec.iovec) {
220 		nr_segs = iomsg->vec.nr;
221 		iov = iomsg->vec.iovec;
222 	} else {
223 		nr_segs = 1;
224 		iov = &iomsg->fast_iov;
225 	}
226 
227 	ret = __import_iovec(ddir, uiov, uvec_seg, nr_segs, &iov,
228 			     &iomsg->msg.msg_iter, io_is_compat(req->ctx));
229 	if (unlikely(ret < 0))
230 		return ret;
231 
232 	if (iov) {
233 		req->flags |= REQ_F_NEED_CLEANUP;
234 		io_vec_reset_iovec(&iomsg->vec, iov, iomsg->msg.msg_iter.nr_segs);
235 	}
236 	return 0;
237 }
238 
239 static int io_compat_msg_copy_hdr(struct io_kiocb *req,
240 				  struct io_async_msghdr *iomsg,
241 				  struct compat_msghdr *msg, int ddir,
242 				  struct sockaddr __user **save_addr)
243 {
244 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
245 	struct compat_iovec __user *uiov;
246 	int ret;
247 
248 	if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
249 		return -EFAULT;
250 
251 	ret = __get_compat_msghdr(&iomsg->msg, msg, save_addr);
252 	if (ret)
253 		return ret;
254 
255 	uiov = compat_ptr(msg->msg_iov);
256 	if (req->flags & REQ_F_BUFFER_SELECT) {
257 		if (msg->msg_iovlen == 0) {
258 			sr->len = 0;
259 		} else if (msg->msg_iovlen > 1) {
260 			return -EINVAL;
261 		} else {
262 			struct compat_iovec tmp_iov;
263 
264 			if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov)))
265 				return -EFAULT;
266 			sr->len = tmp_iov.iov_len;
267 		}
268 	}
269 	return 0;
270 }
271 
272 static int io_copy_msghdr_from_user(struct user_msghdr *msg,
273 				    struct user_msghdr __user *umsg)
274 {
275 	if (!user_access_begin(umsg, sizeof(*umsg)))
276 		return -EFAULT;
277 	unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end);
278 	unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end);
279 	unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end);
280 	unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end);
281 	unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end);
282 	unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end);
283 	user_access_end();
284 	return 0;
285 ua_end:
286 	user_access_end();
287 	return -EFAULT;
288 }
289 
290 static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
291 			   struct user_msghdr *msg, int ddir,
292 			   struct sockaddr __user **save_addr)
293 {
294 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
295 	struct user_msghdr __user *umsg = sr->umsg;
296 	int ret;
297 
298 	iomsg->msg.msg_name = &iomsg->addr;
299 	iomsg->msg.msg_iter.nr_segs = 0;
300 
301 	if (io_is_compat(req->ctx)) {
302 		struct compat_msghdr cmsg;
303 
304 		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ddir, save_addr);
305 		if (ret)
306 			return ret;
307 
308 		memset(msg, 0, sizeof(*msg));
309 		msg->msg_namelen = cmsg.msg_namelen;
310 		msg->msg_controllen = cmsg.msg_controllen;
311 		msg->msg_iov = compat_ptr(cmsg.msg_iov);
312 		msg->msg_iovlen = cmsg.msg_iovlen;
313 		return 0;
314 	}
315 
316 	ret = io_copy_msghdr_from_user(msg, umsg);
317 	if (unlikely(ret))
318 		return ret;
319 
320 	msg->msg_flags = 0;
321 
322 	ret = __copy_msghdr(&iomsg->msg, msg, save_addr);
323 	if (ret)
324 		return ret;
325 
326 	if (req->flags & REQ_F_BUFFER_SELECT) {
327 		if (msg->msg_iovlen == 0) {
328 			sr->len = 0;
329 		} else if (msg->msg_iovlen > 1) {
330 			return -EINVAL;
331 		} else {
332 			struct iovec __user *uiov = msg->msg_iov;
333 			struct iovec tmp_iov;
334 
335 			if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov)))
336 				return -EFAULT;
337 			sr->len = tmp_iov.iov_len;
338 		}
339 	}
340 	return 0;
341 }
342 
343 void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
344 {
345 	struct io_async_msghdr *io = req->async_data;
346 
347 	io_netmsg_iovec_free(io);
348 }
349 
350 static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
351 {
352 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
353 	struct io_async_msghdr *kmsg = req->async_data;
354 	void __user *addr;
355 	u16 addr_len;
356 	int ret;
357 
358 	sr->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
359 
360 	if (READ_ONCE(sqe->__pad3[0]))
361 		return -EINVAL;
362 
363 	kmsg->msg.msg_name = NULL;
364 	kmsg->msg.msg_namelen = 0;
365 	kmsg->msg.msg_control = NULL;
366 	kmsg->msg.msg_controllen = 0;
367 	kmsg->msg.msg_ubuf = NULL;
368 
369 	addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
370 	addr_len = READ_ONCE(sqe->addr_len);
371 	if (addr) {
372 		ret = move_addr_to_kernel(addr, addr_len, &kmsg->addr);
373 		if (unlikely(ret < 0))
374 			return ret;
375 		kmsg->msg.msg_name = &kmsg->addr;
376 		kmsg->msg.msg_namelen = addr_len;
377 	}
378 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
379 		if (!(sr->flags & IORING_SEND_VECTORIZED)) {
380 			req->flags |= REQ_F_IMPORT_BUFFER;
381 			return 0;
382 		}
383 
384 		kmsg->msg.msg_iter.nr_segs = sr->len;
385 		return io_prep_reg_iovec(req, &kmsg->vec, sr->buf, sr->len);
386 	}
387 	if (req->flags & REQ_F_BUFFER_SELECT)
388 		return 0;
389 
390 	if (sr->flags & IORING_SEND_VECTORIZED)
391 		return io_net_import_vec(req, kmsg, sr->buf, sr->len, ITER_SOURCE);
392 
393 	return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
394 }
395 
396 static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
397 {
398 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
399 	struct io_async_msghdr *kmsg = req->async_data;
400 	struct user_msghdr msg;
401 	int ret;
402 
403 	sr->flags |= IORING_SEND_VECTORIZED;
404 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
405 	ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL);
406 	if (unlikely(ret))
407 		return ret;
408 	/* save msg_control as sys_sendmsg() overwrites it */
409 	sr->msg_control = kmsg->msg.msg_control_user;
410 
411 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
412 		kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen;
413 		return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov,
414 					 msg.msg_iovlen);
415 	}
416 	if (req->flags & REQ_F_BUFFER_SELECT)
417 		return 0;
418 	return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE);
419 }
420 
/* sqe->ioprio flags that io_sendmsg_prep() accepts; any other bit is -EINVAL */
#define SENDMSG_FLAGS	(IORING_RECVSEND_POLL_FIRST | \
			 IORING_RECVSEND_BUNDLE | \
			 IORING_SEND_VECTORIZED | \
			 IORING_RECVSEND_FIXED_BUF)
423 
424 int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
425 {
426 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
427 
428 	sr->done_io = 0;
429 	sr->len = READ_ONCE(sqe->len);
430 	if (unlikely(sr->len < 0))
431 		return -EINVAL;
432 	sr->flags = READ_ONCE(sqe->ioprio);
433 	if (sr->flags & ~SENDMSG_FLAGS)
434 		return -EINVAL;
435 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
436 		/* registered buffer send only supported for plain IORING_OP_SEND */
437 		if (req->opcode != IORING_OP_SEND ||
438 		    (req->flags & REQ_F_BUFFER_SELECT) ||
439 		    (sr->flags & (IORING_RECVSEND_BUNDLE|IORING_SEND_VECTORIZED)))
440 			return -EINVAL;
441 		req->buf_index = READ_ONCE(sqe->buf_index);
442 	}
443 	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
444 	if (sr->msg_flags & MSG_DONTWAIT)
445 		req->flags |= REQ_F_NOWAIT;
446 	if (req->flags & REQ_F_BUFFER_SELECT)
447 		sr->buf_group = req->buf_index;
448 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
449 		if (req->opcode == IORING_OP_SENDMSG)
450 			return -EINVAL;
451 		sr->msg_flags |= MSG_WAITALL;
452 		req->flags |= REQ_F_MULTISHOT;
453 	}
454 
455 	if (io_is_compat(req->ctx))
456 		sr->msg_flags |= MSG_CMSG_COMPAT;
457 
458 	if (unlikely(!io_msg_alloc_async(req)))
459 		return -ENOMEM;
460 	if (req->opcode != IORING_OP_SENDMSG)
461 		return io_send_setup(req, sqe);
462 	if (unlikely(sqe->addr2 || sqe->file_index))
463 		return -EINVAL;
464 	return io_sendmsg_setup(req, sqe);
465 }
466 
/* End-of-request cleanup: hands the async msghdr to io_netmsg_recycle(). */
static void io_req_msg_cleanup(struct io_kiocb *req, unsigned int issue_flags)
{
	io_netmsg_recycle(req, issue_flags);
}
472 
473 /*
474  * For bundle completions, we need to figure out how many segments we consumed.
475  * A bundle could be using a single ITER_UBUF if that's all we mapped, or it
476  * could be using an ITER_IOVEC. If the latter, then if we consumed all of
477  * the segments, then it's a trivial questiont o answer. If we have residual
478  * data in the iter, then loop the segments to figure out how much we
479  * transferred.
480  */
481 static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
482 {
483 	struct iovec *iov;
484 	int nbufs;
485 
486 	/* no data is always zero segments, and a ubuf is always 1 segment */
487 	if (ret <= 0)
488 		return 0;
489 	if (iter_is_ubuf(&kmsg->msg.msg_iter))
490 		return 1;
491 
492 	iov = kmsg->vec.iovec;
493 	if (!iov)
494 		iov = &kmsg->fast_iov;
495 
496 	/* if all data was transferred, it's basic pointer math */
497 	if (!iov_iter_count(&kmsg->msg.msg_iter))
498 		return iter_iov(&kmsg->msg.msg_iter) - iov;
499 
500 	/* short transfer, count segments */
501 	nbufs = 0;
502 	do {
503 		int this_len = min_t(int, iov[nbufs].iov_len, ret);
504 
505 		nbufs++;
506 		ret -= this_len;
507 	} while (ret);
508 
509 	return nbufs;
510 }
511 
512 static int io_net_kbuf_recyle(struct io_kiocb *req, struct io_buffer_list *bl,
513 			      struct io_async_msghdr *kmsg, int len)
514 {
515 	req->flags |= REQ_F_BL_NO_RECYCLE;
516 	if (req->flags & REQ_F_BUFFERS_COMMIT)
517 		io_kbuf_commit(req, bl, len, io_bundle_nbufs(kmsg, len));
518 	return IOU_RETRY;
519 }
520 
521 static inline bool io_send_finish(struct io_kiocb *req,
522 				  struct io_async_msghdr *kmsg,
523 				  struct io_br_sel *sel)
524 {
525 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
526 	bool bundle_finished = sel->val <= 0;
527 	unsigned int cflags;
528 
529 	if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
530 		cflags = io_put_kbuf(req, sel->val, sel->buf_list);
531 		goto finish;
532 	}
533 
534 	cflags = io_put_kbufs(req, sel->val, sel->buf_list, io_bundle_nbufs(kmsg, sel->val));
535 
536 	/*
537 	 * Don't start new bundles if the buffer list is empty, or if the
538 	 * current operation needed to go through polling to complete.
539 	 */
540 	if (bundle_finished || req->flags & (REQ_F_BL_EMPTY | REQ_F_POLLED))
541 		goto finish;
542 
543 	/*
544 	 * Fill CQE for this receive and see if we should keep trying to
545 	 * receive from this socket.
546 	 */
547 	if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) {
548 		io_mshot_prep_retry(req, kmsg);
549 		return false;
550 	}
551 
552 	/* Otherwise stop bundle and use the current result. */
553 finish:
554 	io_req_set_res(req, sel->val, cflags);
555 	sel->val = IOU_COMPLETE;
556 	return true;
557 }
558 
559 int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
560 {
561 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
562 	struct io_async_msghdr *kmsg = req->async_data;
563 	struct socket *sock;
564 	unsigned flags;
565 	int min_ret = 0;
566 	int ret;
567 
568 	sock = sock_from_file(req->file);
569 	if (unlikely(!sock))
570 		return -ENOTSOCK;
571 
572 	if (!(req->flags & REQ_F_POLLED) &&
573 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
574 		return -EAGAIN;
575 
576 	flags = sr->msg_flags;
577 	if (issue_flags & IO_URING_F_NONBLOCK)
578 		flags |= MSG_DONTWAIT;
579 	if (flags & MSG_WAITALL)
580 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
581 
582 	kmsg->msg.msg_control_user = sr->msg_control;
583 
584 	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
585 
586 	if (ret < min_ret) {
587 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
588 			return -EAGAIN;
589 		if (ret > 0 && io_net_retry(sock, flags)) {
590 			kmsg->msg.msg_controllen = 0;
591 			kmsg->msg.msg_control = NULL;
592 			sr->done_io += ret;
593 			return -EAGAIN;
594 		}
595 		if (ret == -ERESTARTSYS)
596 			ret = -EINTR;
597 		req_set_fail(req);
598 	}
599 	io_req_msg_cleanup(req, issue_flags);
600 	if (ret >= 0)
601 		ret += sr->done_io;
602 	else if (sr->done_io)
603 		ret = sr->done_io;
604 	io_req_set_res(req, ret, 0);
605 	return IOU_COMPLETE;
606 }
607 
608 static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
609 				 struct io_br_sel *sel, struct io_async_msghdr *kmsg)
610 {
611 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
612 	struct buf_sel_arg arg = {
613 		.iovs = &kmsg->fast_iov,
614 		.max_len = min_not_zero(sr->len, INT_MAX),
615 		.nr_iovs = 1,
616 		.buf_group = sr->buf_group,
617 	};
618 	int ret;
619 
620 	if (kmsg->vec.iovec) {
621 		arg.nr_iovs = kmsg->vec.nr;
622 		arg.iovs = kmsg->vec.iovec;
623 		arg.mode = KBUF_MODE_FREE;
624 	}
625 
626 	if (!(sr->flags & IORING_RECVSEND_BUNDLE))
627 		arg.nr_iovs = 1;
628 	else
629 		arg.mode |= KBUF_MODE_EXPAND;
630 
631 	ret = io_buffers_select(req, &arg, sel, issue_flags);
632 	if (unlikely(ret < 0))
633 		return ret;
634 
635 	if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
636 		kmsg->vec.nr = ret;
637 		kmsg->vec.iovec = arg.iovs;
638 		req->flags |= REQ_F_NEED_CLEANUP;
639 	}
640 	sr->len = arg.out_len;
641 
642 	if (ret == 1) {
643 		sr->buf = arg.iovs[0].iov_base;
644 		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
645 					&kmsg->msg.msg_iter);
646 		if (unlikely(ret))
647 			return ret;
648 	} else {
649 		iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE,
650 				arg.iovs, ret, arg.out_len);
651 	}
652 
653 	return 0;
654 }
655 
656 int io_send(struct io_kiocb *req, unsigned int issue_flags)
657 {
658 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
659 	struct io_async_msghdr *kmsg = req->async_data;
660 	struct io_br_sel sel = { };
661 	struct socket *sock;
662 	unsigned flags;
663 	int min_ret = 0;
664 	int ret;
665 
666 	sock = sock_from_file(req->file);
667 	if (unlikely(!sock))
668 		return -ENOTSOCK;
669 
670 	if (!(req->flags & REQ_F_POLLED) &&
671 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
672 		return -EAGAIN;
673 
674 	if (req->flags & REQ_F_IMPORT_BUFFER) {
675 		ret = io_import_reg_buf(req, &kmsg->msg.msg_iter,
676 					(u64)(uintptr_t)sr->buf, sr->len,
677 					ITER_SOURCE, issue_flags);
678 		if (unlikely(ret))
679 			return ret;
680 		req->flags &= ~REQ_F_IMPORT_BUFFER;
681 	}
682 
683 	flags = sr->msg_flags;
684 	if (issue_flags & IO_URING_F_NONBLOCK)
685 		flags |= MSG_DONTWAIT;
686 
687 retry_bundle:
688 	sel.buf_list = NULL;
689 	if (io_do_buffer_select(req)) {
690 		ret = io_send_select_buffer(req, issue_flags, &sel, kmsg);
691 		if (ret)
692 			return ret;
693 	}
694 
695 	/*
696 	 * If MSG_WAITALL is set, or this is a bundle send, then we need
697 	 * the full amount. If just bundle is set, if we do a short send
698 	 * then we complete the bundle sequence rather than continue on.
699 	 */
700 	if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE)
701 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
702 
703 	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
704 	kmsg->msg.msg_flags = flags;
705 	ret = sock_sendmsg(sock, &kmsg->msg);
706 	if (ret < min_ret) {
707 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
708 			return -EAGAIN;
709 
710 		if (ret > 0 && io_net_retry(sock, flags)) {
711 			sr->len -= ret;
712 			sr->buf += ret;
713 			sr->done_io += ret;
714 			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
715 		}
716 		if (ret == -ERESTARTSYS)
717 			ret = -EINTR;
718 		req_set_fail(req);
719 	}
720 	if (ret >= 0)
721 		ret += sr->done_io;
722 	else if (sr->done_io)
723 		ret = sr->done_io;
724 
725 	sel.val = ret;
726 	if (!io_send_finish(req, kmsg, &sel))
727 		goto retry_bundle;
728 
729 	io_req_msg_cleanup(req, issue_flags);
730 	return sel.val;
731 }
732 
733 static int io_recvmsg_mshot_prep(struct io_kiocb *req,
734 				 struct io_async_msghdr *iomsg,
735 				 int namelen, size_t controllen)
736 {
737 	if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) ==
738 			  (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) {
739 		int hdr;
740 
741 		if (unlikely(namelen < 0))
742 			return -EOVERFLOW;
743 		if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
744 					namelen, &hdr))
745 			return -EOVERFLOW;
746 		if (check_add_overflow(hdr, controllen, &hdr))
747 			return -EOVERFLOW;
748 
749 		iomsg->namelen = namelen;
750 		iomsg->controllen = controllen;
751 		return 0;
752 	}
753 
754 	return 0;
755 }
756 
757 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
758 			       struct io_async_msghdr *iomsg)
759 {
760 	struct user_msghdr msg;
761 	int ret;
762 
763 	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
764 	if (unlikely(ret))
765 		return ret;
766 
767 	if (!(req->flags & REQ_F_BUFFER_SELECT)) {
768 		ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
769 					ITER_DEST);
770 		if (unlikely(ret))
771 			return ret;
772 	}
773 	return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
774 					msg.msg_controllen);
775 }
776 
777 static int io_recvmsg_prep_setup(struct io_kiocb *req)
778 {
779 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
780 	struct io_async_msghdr *kmsg;
781 
782 	kmsg = io_msg_alloc_async(req);
783 	if (unlikely(!kmsg))
784 		return -ENOMEM;
785 
786 	if (req->opcode == IORING_OP_RECV) {
787 		kmsg->msg.msg_name = NULL;
788 		kmsg->msg.msg_namelen = 0;
789 		kmsg->msg.msg_inq = 0;
790 		kmsg->msg.msg_control = NULL;
791 		kmsg->msg.msg_get_inq = 1;
792 		kmsg->msg.msg_controllen = 0;
793 		kmsg->msg.msg_ubuf = NULL;
794 
795 		if (req->flags & REQ_F_BUFFER_SELECT)
796 			return 0;
797 		if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
798 			req->flags |= REQ_F_IMPORT_BUFFER;
799 			return 0;
800 		}
801 		return import_ubuf(ITER_DEST, sr->buf, sr->len,
802 				   &kmsg->msg.msg_iter);
803 	}
804 
805 	return io_recvmsg_copy_hdr(req, kmsg);
806 }
807 
/* sqe->ioprio flags that io_recvmsg_prep() accepts; any other bit is -EINVAL */
#define RECVMSG_FLAGS	(IORING_RECVSEND_POLL_FIRST | \
			 IORING_RECV_MULTISHOT | \
			 IORING_RECVSEND_BUNDLE | \
			 IORING_RECVSEND_FIXED_BUF)
810 
811 int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
812 {
813 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
814 
815 	sr->done_io = 0;
816 
817 	if (unlikely(sqe->addr2))
818 		return -EINVAL;
819 
820 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
821 	sr->len = READ_ONCE(sqe->len);
822 	if (unlikely(sr->len < 0))
823 		return -EINVAL;
824 	sr->flags = READ_ONCE(sqe->ioprio);
825 	if (sr->flags & ~RECVMSG_FLAGS)
826 		return -EINVAL;
827 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
828 		/* registered buffer recv only for plain IORING_OP_RECV */
829 		if (req->opcode != IORING_OP_RECV ||
830 		    (req->flags & REQ_F_BUFFER_SELECT) ||
831 		    (sr->flags & (IORING_RECV_MULTISHOT | IORING_RECVSEND_BUNDLE)))
832 			return -EINVAL;
833 		req->buf_index = READ_ONCE(sqe->buf_index);
834 	}
835 	sr->msg_flags = READ_ONCE(sqe->msg_flags);
836 	if (sr->msg_flags & MSG_DONTWAIT)
837 		req->flags |= REQ_F_NOWAIT;
838 	if (sr->msg_flags & MSG_ERRQUEUE)
839 		req->flags |= REQ_F_CLEAR_POLLIN;
840 	if (req->flags & REQ_F_BUFFER_SELECT)
841 		sr->buf_group = req->buf_index;
842 	sr->mshot_total_len = sr->mshot_len = 0;
843 	if (sr->flags & IORING_RECV_MULTISHOT) {
844 		if (!(req->flags & REQ_F_BUFFER_SELECT))
845 			return -EINVAL;
846 		if (sr->msg_flags & MSG_WAITALL)
847 			return -EINVAL;
848 		if (req->opcode == IORING_OP_RECV) {
849 			sr->mshot_len = sr->len;
850 			sr->mshot_total_len = READ_ONCE(sqe->optlen);
851 			if (sr->mshot_total_len)
852 				sr->flags |= IORING_RECV_MSHOT_LIM;
853 		} else if (sqe->optlen) {
854 			return -EINVAL;
855 		}
856 		req->flags |= REQ_F_APOLL_MULTISHOT;
857 	} else if (sqe->optlen) {
858 		return -EINVAL;
859 	}
860 
861 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
862 		if (req->opcode == IORING_OP_RECVMSG)
863 			return -EINVAL;
864 	}
865 
866 	if (io_is_compat(req->ctx))
867 		sr->msg_flags |= MSG_CMSG_COMPAT;
868 
869 	sr->nr_multishot_loops = 0;
870 	return io_recvmsg_prep_setup(req);
871 }
872 
/*
 * On a bundle retry these CQE flag bits are cleared from the saved flags
 * and taken from the new attempt instead.
 */
#define CQE_F_MASK	(IORING_CQE_F_SOCK_NONEMPTY | \
			 IORING_CQE_F_MORE | \
			 IORING_CQE_F_BUF_MORE)
876 
877 /*
878  * Finishes io_recv and io_recvmsg.
879  *
880  * Returns true if it is actually finished, or false if it should run
881  * again (for multishot).
882  */
883 static inline bool io_recv_finish(struct io_kiocb *req,
884 				  struct io_async_msghdr *kmsg,
885 				  struct io_br_sel *sel, bool mshot_finished,
886 				  unsigned issue_flags)
887 {
888 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
889 	unsigned int cflags = 0;
890 
891 	if (kmsg->msg.msg_inq > 0)
892 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
893 
894 	if (sel->val > 0 && sr->flags & IORING_RECV_MSHOT_LIM) {
895 		/*
896 		 * If sr->len hits zero, the limit has been reached. Mark
897 		 * mshot as finished, and flag MSHOT_DONE as well to prevent
898 		 * a potential bundle from being retried.
899 		 */
900 		sr->mshot_total_len -= min_t(int, sel->val, sr->mshot_total_len);
901 		if (!sr->mshot_total_len) {
902 			sr->flags |= IORING_RECV_MSHOT_DONE;
903 			mshot_finished = true;
904 		}
905 	}
906 
907 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
908 		size_t this_ret = sel->val - sr->done_io;
909 
910 		cflags |= io_put_kbufs(req, this_ret, sel->buf_list, io_bundle_nbufs(kmsg, this_ret));
911 		if (sr->flags & IORING_RECV_RETRY)
912 			cflags = req->cqe.flags | (cflags & CQE_F_MASK);
913 		if (sr->mshot_len && sel->val >= sr->mshot_len)
914 			sr->flags |= IORING_RECV_MSHOT_CAP;
915 		/* bundle with no more immediate buffers, we're done */
916 		if (req->flags & REQ_F_BL_EMPTY)
917 			goto finish;
918 		/*
919 		 * If more is available AND it was a full transfer, retry and
920 		 * append to this one
921 		 */
922 		if (!(sr->flags & IORING_RECV_NO_RETRY) &&
923 		    kmsg->msg.msg_inq > 1 && this_ret > 0 &&
924 		    !iov_iter_count(&kmsg->msg.msg_iter)) {
925 			req->cqe.flags = cflags & ~CQE_F_MASK;
926 			sr->len = kmsg->msg.msg_inq;
927 			sr->done_io += this_ret;
928 			sr->flags |= IORING_RECV_RETRY;
929 			return false;
930 		}
931 	} else {
932 		cflags |= io_put_kbuf(req, sel->val, sel->buf_list);
933 	}
934 
935 	/*
936 	 * Fill CQE for this receive and see if we should keep trying to
937 	 * receive from this socket.
938 	 */
939 	if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
940 	    io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) {
941 		sel->val = IOU_RETRY;
942 		io_mshot_prep_retry(req, kmsg);
943 		/* Known not-empty or unknown state, retry */
944 		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
945 			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY &&
946 			    !(sr->flags & IORING_RECV_MSHOT_CAP)) {
947 				return false;
948 			}
949 			/* mshot retries exceeded, force a requeue */
950 			sr->nr_multishot_loops = 0;
951 			sr->flags &= ~IORING_RECV_MSHOT_CAP;
952 			if (issue_flags & IO_URING_F_MULTISHOT)
953 				sel->val = IOU_REQUEUE;
954 		}
955 		return true;
956 	}
957 
958 	/* Finish the request / stop multishot. */
959 finish:
960 	io_req_set_res(req, sel->val, cflags);
961 	sel->val = IOU_COMPLETE;
962 	io_req_msg_cleanup(req, issue_flags);
963 	return true;
964 }
965 
966 static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
967 				     struct io_sr_msg *sr, void __user **buf,
968 				     size_t *len)
969 {
970 	unsigned long ubuf = (unsigned long) *buf;
971 	unsigned long hdr;
972 
973 	hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
974 		kmsg->controllen;
975 	if (*len < hdr)
976 		return -EFAULT;
977 
978 	if (kmsg->controllen) {
979 		unsigned long control = ubuf + hdr - kmsg->controllen;
980 
981 		kmsg->msg.msg_control_user = (void __user *) control;
982 		kmsg->msg.msg_controllen = kmsg->controllen;
983 	}
984 
985 	sr->buf = *buf; /* stash for later copy */
986 	*buf = (void __user *) (ubuf + hdr);
987 	kmsg->payloadlen = *len = *len - hdr;
988 	return 0;
989 }
990 
991 struct io_recvmsg_multishot_hdr {
992 	struct io_uring_recvmsg_out msg;
993 	struct sockaddr_storage addr;
994 };
995 
996 static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
997 				struct io_async_msghdr *kmsg,
998 				unsigned int flags, bool *finished)
999 {
1000 	int err;
1001 	int copy_len;
1002 	struct io_recvmsg_multishot_hdr hdr;
1003 
1004 	if (kmsg->namelen)
1005 		kmsg->msg.msg_name = &hdr.addr;
1006 	kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
1007 	kmsg->msg.msg_namelen = 0;
1008 
1009 	if (sock->file->f_flags & O_NONBLOCK)
1010 		flags |= MSG_DONTWAIT;
1011 
1012 	err = sock_recvmsg(sock, &kmsg->msg, flags);
1013 	*finished = err <= 0;
1014 	if (err < 0)
1015 		return err;
1016 
1017 	hdr.msg = (struct io_uring_recvmsg_out) {
1018 		.controllen = kmsg->controllen - kmsg->msg.msg_controllen,
1019 		.flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
1020 	};
1021 
1022 	hdr.msg.payloadlen = err;
1023 	if (err > kmsg->payloadlen)
1024 		err = kmsg->payloadlen;
1025 
1026 	copy_len = sizeof(struct io_uring_recvmsg_out);
1027 	if (kmsg->msg.msg_namelen > kmsg->namelen)
1028 		copy_len += kmsg->namelen;
1029 	else
1030 		copy_len += kmsg->msg.msg_namelen;
1031 
1032 	/*
1033 	 *      "fromlen shall refer to the value before truncation.."
1034 	 *                      1003.1g
1035 	 */
1036 	hdr.msg.namelen = kmsg->msg.msg_namelen;
1037 
1038 	/* ensure that there is no gap between hdr and sockaddr_storage */
1039 	BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
1040 		     sizeof(struct io_uring_recvmsg_out));
1041 	if (copy_to_user(io->buf, &hdr, copy_len)) {
1042 		*finished = true;
1043 		return -EFAULT;
1044 	}
1045 
1046 	return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
1047 			kmsg->controllen + err;
1048 }
1049 
1050 int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
1051 {
1052 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1053 	struct io_async_msghdr *kmsg = req->async_data;
1054 	struct io_br_sel sel = { };
1055 	struct socket *sock;
1056 	unsigned flags;
1057 	int ret, min_ret = 0;
1058 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1059 	bool mshot_finished = true;
1060 
1061 	sock = sock_from_file(req->file);
1062 	if (unlikely(!sock))
1063 		return -ENOTSOCK;
1064 
1065 	if (!(req->flags & REQ_F_POLLED) &&
1066 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1067 		return -EAGAIN;
1068 
1069 	flags = sr->msg_flags;
1070 	if (force_nonblock)
1071 		flags |= MSG_DONTWAIT;
1072 
1073 retry_multishot:
1074 	sel.buf_list = NULL;
1075 	if (io_do_buffer_select(req)) {
1076 		size_t len = sr->len;
1077 
1078 		sel = io_buffer_select(req, &len, sr->buf_group, issue_flags);
1079 		if (!sel.addr)
1080 			return -ENOBUFS;
1081 
1082 		if (req->flags & REQ_F_APOLL_MULTISHOT) {
1083 			ret = io_recvmsg_prep_multishot(kmsg, sr, &sel.addr, &len);
1084 			if (ret) {
1085 				io_kbuf_recycle(req, sel.buf_list, issue_flags);
1086 				return ret;
1087 			}
1088 		}
1089 
1090 		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, sel.addr, len);
1091 	}
1092 
1093 	kmsg->msg.msg_get_inq = 1;
1094 	kmsg->msg.msg_inq = -1;
1095 	if (req->flags & REQ_F_APOLL_MULTISHOT) {
1096 		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
1097 					   &mshot_finished);
1098 	} else {
1099 		/* disable partial retry for recvmsg with cmsg attached */
1100 		if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
1101 			min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1102 
1103 		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
1104 					 kmsg->uaddr, flags);
1105 	}
1106 
1107 	if (ret < min_ret) {
1108 		if (ret == -EAGAIN && force_nonblock) {
1109 			io_kbuf_recycle(req, sel.buf_list, issue_flags);
1110 			return IOU_RETRY;
1111 		}
1112 		if (ret > 0 && io_net_retry(sock, flags)) {
1113 			sr->done_io += ret;
1114 			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
1115 		}
1116 		if (ret == -ERESTARTSYS)
1117 			ret = -EINTR;
1118 		req_set_fail(req);
1119 	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
1120 		req_set_fail(req);
1121 	}
1122 
1123 	if (ret > 0)
1124 		ret += sr->done_io;
1125 	else if (sr->done_io)
1126 		ret = sr->done_io;
1127 	else
1128 		io_kbuf_recycle(req, sel.buf_list, issue_flags);
1129 
1130 	sel.val = ret;
1131 	if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags))
1132 		goto retry_multishot;
1133 
1134 	return sel.val;
1135 }
1136 
1137 static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
1138 			      struct io_br_sel *sel, unsigned int issue_flags)
1139 {
1140 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1141 	int ret;
1142 
1143 	/*
1144 	 * If the ring isn't locked, then don't use the peek interface
1145 	 * to grab multiple buffers as we will lock/unlock between
1146 	 * this selection and posting the buffers.
1147 	 */
1148 	if (!(issue_flags & IO_URING_F_UNLOCKED) &&
1149 	    sr->flags & IORING_RECVSEND_BUNDLE) {
1150 		struct buf_sel_arg arg = {
1151 			.iovs = &kmsg->fast_iov,
1152 			.nr_iovs = 1,
1153 			.mode = KBUF_MODE_EXPAND,
1154 			.buf_group = sr->buf_group,
1155 		};
1156 
1157 		if (kmsg->vec.iovec) {
1158 			arg.nr_iovs = kmsg->vec.nr;
1159 			arg.iovs = kmsg->vec.iovec;
1160 			arg.mode |= KBUF_MODE_FREE;
1161 		}
1162 
1163 		if (sel->val)
1164 			arg.max_len = sel->val;
1165 		else if (kmsg->msg.msg_inq > 1)
1166 			arg.max_len = min_not_zero(sel->val, (ssize_t) kmsg->msg.msg_inq);
1167 
1168 		/* if mshot limited, ensure we don't go over */
1169 		if (sr->flags & IORING_RECV_MSHOT_LIM)
1170 			arg.max_len = min_not_zero(arg.max_len, sr->mshot_total_len);
1171 		ret = io_buffers_peek(req, &arg, sel);
1172 		if (unlikely(ret < 0))
1173 			return ret;
1174 
1175 		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
1176 			kmsg->vec.nr = ret;
1177 			kmsg->vec.iovec = arg.iovs;
1178 			req->flags |= REQ_F_NEED_CLEANUP;
1179 		}
1180 		if (arg.partial_map)
1181 			sr->flags |= IORING_RECV_PARTIAL_MAP;
1182 
1183 		/* special case 1 vec, can be a fast path */
1184 		if (ret == 1) {
1185 			sr->buf = arg.iovs[0].iov_base;
1186 			sr->len = arg.iovs[0].iov_len;
1187 			goto map_ubuf;
1188 		}
1189 		iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
1190 				arg.out_len);
1191 	} else {
1192 		size_t len = sel->val;
1193 
1194 		*sel = io_buffer_select(req, &len, sr->buf_group, issue_flags);
1195 		if (!sel->addr)
1196 			return -ENOBUFS;
1197 		sr->buf = sel->addr;
1198 		sr->len = len;
1199 map_ubuf:
1200 		ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
1201 				  &kmsg->msg.msg_iter);
1202 		if (unlikely(ret))
1203 			return ret;
1204 	}
1205 
1206 	return 0;
1207 }
1208 
1209 int io_recv(struct io_kiocb *req, unsigned int issue_flags)
1210 {
1211 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1212 	struct io_async_msghdr *kmsg = req->async_data;
1213 	struct io_br_sel sel;
1214 	struct socket *sock;
1215 	unsigned flags;
1216 	int ret, min_ret = 0;
1217 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1218 	bool mshot_finished;
1219 
1220 	sock = sock_from_file(req->file);
1221 	if (unlikely(!sock))
1222 		return -ENOTSOCK;
1223 
1224 	if (!(req->flags & REQ_F_POLLED) &&
1225 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1226 		return -EAGAIN;
1227 
1228 	flags = sr->msg_flags;
1229 	if (force_nonblock)
1230 		flags |= MSG_DONTWAIT;
1231 
1232 	if (req->flags & REQ_F_IMPORT_BUFFER) {
1233 		ret = io_import_reg_buf(req, &kmsg->msg.msg_iter,
1234 					(u64)(uintptr_t)sr->buf, sr->len,
1235 					ITER_DEST, issue_flags);
1236 		if (unlikely(ret)) {
1237 			kmsg->msg.msg_inq = -1;
1238 			sel.buf_list = NULL;
1239 			goto out_free;
1240 		}
1241 		req->flags &= ~REQ_F_IMPORT_BUFFER;
1242 	}
1243 
1244 retry_multishot:
1245 	sel.buf_list = NULL;
1246 	if (io_do_buffer_select(req)) {
1247 		sel.val = sr->len;
1248 		ret = io_recv_buf_select(req, kmsg, &sel, issue_flags);
1249 		if (unlikely(ret < 0)) {
1250 			kmsg->msg.msg_inq = -1;
1251 			goto out_free;
1252 		}
1253 		sr->buf = NULL;
1254 	}
1255 
1256 	kmsg->msg.msg_flags = 0;
1257 	kmsg->msg.msg_inq = -1;
1258 
1259 	if (flags & MSG_WAITALL)
1260 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1261 
1262 	ret = sock_recvmsg(sock, &kmsg->msg, flags);
1263 	if (ret < min_ret) {
1264 		if (ret == -EAGAIN && force_nonblock) {
1265 			io_kbuf_recycle(req, sel.buf_list, issue_flags);
1266 			return IOU_RETRY;
1267 		}
1268 		if (ret > 0 && io_net_retry(sock, flags)) {
1269 			sr->len -= ret;
1270 			sr->buf += ret;
1271 			sr->done_io += ret;
1272 			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
1273 		}
1274 		if (ret == -ERESTARTSYS)
1275 			ret = -EINTR;
1276 		req_set_fail(req);
1277 	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
1278 out_free:
1279 		req_set_fail(req);
1280 	}
1281 
1282 	mshot_finished = ret <= 0;
1283 	if (ret > 0)
1284 		ret += sr->done_io;
1285 	else if (sr->done_io)
1286 		ret = sr->done_io;
1287 	else
1288 		io_kbuf_recycle(req, sel.buf_list, issue_flags);
1289 
1290 	sel.val = ret;
1291 	if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags))
1292 		goto retry_multishot;
1293 
1294 	return sel.val;
1295 }
1296 
1297 int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1298 {
1299 	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
1300 	unsigned ifq_idx;
1301 
1302 	if (unlikely(sqe->addr2 || sqe->addr || sqe->addr3))
1303 		return -EINVAL;
1304 
1305 	ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx);
1306 	zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx);
1307 	if (!zc->ifq)
1308 		return -EINVAL;
1309 
1310 	zc->len = READ_ONCE(sqe->len);
1311 	zc->flags = READ_ONCE(sqe->ioprio);
1312 	if (READ_ONCE(sqe->msg_flags))
1313 		return -EINVAL;
1314 	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT))
1315 		return -EINVAL;
1316 	/* multishot required */
1317 	if (!(zc->flags & IORING_RECV_MULTISHOT))
1318 		return -EINVAL;
1319 	/* All data completions are posted as aux CQEs. */
1320 	req->flags |= REQ_F_APOLL_MULTISHOT;
1321 
1322 	return 0;
1323 }
1324 
1325 int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
1326 {
1327 	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
1328 	struct socket *sock;
1329 	unsigned int len;
1330 	int ret;
1331 
1332 	sock = sock_from_file(req->file);
1333 	if (unlikely(!sock))
1334 		return -ENOTSOCK;
1335 
1336 	if (!(req->flags & REQ_F_POLLED) &&
1337 	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
1338 		return -EAGAIN;
1339 
1340 	len = zc->len;
1341 	ret = io_zcrx_recv(req, zc->ifq, sock, 0, issue_flags, &zc->len);
1342 	if (len && zc->len == 0) {
1343 		io_req_set_res(req, 0, 0);
1344 
1345 		return IOU_COMPLETE;
1346 	}
1347 	if (unlikely(ret <= 0) && ret != -EAGAIN) {
1348 		if (ret == -ERESTARTSYS)
1349 			ret = -EINTR;
1350 		if (ret == IOU_REQUEUE)
1351 			return IOU_REQUEUE;
1352 
1353 		req_set_fail(req);
1354 		io_req_set_res(req, ret, 0);
1355 		return IOU_COMPLETE;
1356 	}
1357 	return IOU_RETRY;
1358 }
1359 
1360 void io_send_zc_cleanup(struct io_kiocb *req)
1361 {
1362 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1363 	struct io_async_msghdr *io = req->async_data;
1364 
1365 	if (req_has_async_data(req))
1366 		io_netmsg_iovec_free(io);
1367 	if (zc->notif) {
1368 		io_notif_flush(zc->notif);
1369 		zc->notif = NULL;
1370 	}
1371 }
1372 
/* ioprio flags io_send_zc_prep() lets through without a closer look */
#define IO_ZC_FLAGS_COMMON \
	(IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
/* every ioprio flag io_send_zc_prep() accepts */
#define IO_ZC_FLAGS_VALID \
	(IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE | \
	 IORING_SEND_VECTORIZED)
1376 
1377 int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1378 {
1379 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1380 	struct io_ring_ctx *ctx = req->ctx;
1381 	struct io_async_msghdr *iomsg;
1382 	struct io_kiocb *notif;
1383 	u64 user_data;
1384 	int ret;
1385 
1386 	zc->done_io = 0;
1387 
1388 	if (unlikely(READ_ONCE(sqe->__pad2[0])))
1389 		return -EINVAL;
1390 	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
1391 	if (req->flags & REQ_F_CQE_SKIP)
1392 		return -EINVAL;
1393 
1394 	notif = zc->notif = io_alloc_notif(ctx);
1395 	if (!notif)
1396 		return -ENOMEM;
1397 	user_data = READ_ONCE(sqe->addr3);
1398 	if (!user_data)
1399 		user_data = req->cqe.user_data;
1400 
1401 	notif->cqe.user_data = user_data;
1402 	notif->cqe.res = 0;
1403 	notif->cqe.flags = IORING_CQE_F_NOTIF;
1404 	req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY;
1405 
1406 	zc->flags = READ_ONCE(sqe->ioprio);
1407 	if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
1408 		if (zc->flags & ~IO_ZC_FLAGS_VALID)
1409 			return -EINVAL;
1410 		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
1411 			struct io_notif_data *nd = io_notif_to_data(notif);
1412 
1413 			nd->zc_report = true;
1414 			nd->zc_used = false;
1415 			nd->zc_copied = false;
1416 		}
1417 	}
1418 
1419 	zc->len = READ_ONCE(sqe->len);
1420 	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
1421 	req->buf_index = READ_ONCE(sqe->buf_index);
1422 	if (zc->msg_flags & MSG_DONTWAIT)
1423 		req->flags |= REQ_F_NOWAIT;
1424 
1425 	if (io_is_compat(ctx))
1426 		zc->msg_flags |= MSG_CMSG_COMPAT;
1427 
1428 	iomsg = io_msg_alloc_async(req);
1429 	if (unlikely(!iomsg))
1430 		return -ENOMEM;
1431 
1432 	if (req->opcode == IORING_OP_SEND_ZC) {
1433 		ret = io_send_setup(req, sqe);
1434 	} else {
1435 		if (unlikely(sqe->addr2 || sqe->file_index))
1436 			return -EINVAL;
1437 		ret = io_sendmsg_setup(req, sqe);
1438 	}
1439 	if (unlikely(ret))
1440 		return ret;
1441 
1442 	if (!(zc->flags & IORING_RECVSEND_FIXED_BUF)) {
1443 		iomsg->msg.sg_from_iter = io_sg_from_iter_iovec;
1444 		return io_notif_account_mem(zc->notif, iomsg->msg.msg_iter.count);
1445 	}
1446 	iomsg->msg.sg_from_iter = io_sg_from_iter;
1447 	return 0;
1448 }
1449 
1450 static int io_sg_from_iter_iovec(struct sk_buff *skb,
1451 				 struct iov_iter *from, size_t length)
1452 {
1453 	skb_zcopy_downgrade_managed(skb);
1454 	return zerocopy_fill_skb_from_iter(skb, from, length);
1455 }
1456 
1457 static int io_sg_from_iter(struct sk_buff *skb,
1458 			   struct iov_iter *from, size_t length)
1459 {
1460 	struct skb_shared_info *shinfo = skb_shinfo(skb);
1461 	int frag = shinfo->nr_frags;
1462 	int ret = 0;
1463 	struct bvec_iter bi;
1464 	ssize_t copied = 0;
1465 	unsigned long truesize = 0;
1466 
1467 	if (!frag)
1468 		shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
1469 	else if (unlikely(!skb_zcopy_managed(skb)))
1470 		return zerocopy_fill_skb_from_iter(skb, from, length);
1471 
1472 	bi.bi_size = min(from->count, length);
1473 	bi.bi_bvec_done = from->iov_offset;
1474 	bi.bi_idx = 0;
1475 
1476 	while (bi.bi_size && frag < MAX_SKB_FRAGS) {
1477 		struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);
1478 
1479 		copied += v.bv_len;
1480 		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
1481 		__skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
1482 					   v.bv_offset, v.bv_len);
1483 		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
1484 	}
1485 	if (bi.bi_size)
1486 		ret = -EMSGSIZE;
1487 
1488 	shinfo->nr_frags = frag;
1489 	from->bvec += bi.bi_idx;
1490 	from->nr_segs -= bi.bi_idx;
1491 	from->count -= copied;
1492 	from->iov_offset = bi.bi_bvec_done;
1493 
1494 	skb->data_len += copied;
1495 	skb->len += copied;
1496 	skb->truesize += truesize;
1497 	return ret;
1498 }
1499 
1500 static int io_send_zc_import(struct io_kiocb *req,
1501 			     struct io_async_msghdr *kmsg,
1502 			     unsigned int issue_flags)
1503 {
1504 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1505 	struct io_kiocb *notif = sr->notif;
1506 	int ret;
1507 
1508 	WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF));
1509 
1510 	notif->buf_index = req->buf_index;
1511 
1512 	if (!(sr->flags & IORING_SEND_VECTORIZED)) {
1513 		ret = io_import_reg_buf(notif, &kmsg->msg.msg_iter,
1514 					(u64)(uintptr_t)sr->buf, sr->len,
1515 					ITER_SOURCE, issue_flags);
1516 	} else {
1517 		unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
1518 
1519 		ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter,
1520 					notif, &kmsg->vec, uvec_segs,
1521 					issue_flags);
1522 	}
1523 
1524 	if (unlikely(ret))
1525 		return ret;
1526 	req->flags &= ~REQ_F_IMPORT_BUFFER;
1527 	return 0;
1528 }
1529 
1530 int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
1531 {
1532 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1533 	struct io_async_msghdr *kmsg = req->async_data;
1534 	struct socket *sock;
1535 	unsigned msg_flags;
1536 	int ret, min_ret = 0;
1537 
1538 	sock = sock_from_file(req->file);
1539 	if (unlikely(!sock))
1540 		return -ENOTSOCK;
1541 	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
1542 		return -EOPNOTSUPP;
1543 	if (!(req->flags & REQ_F_POLLED) &&
1544 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1545 		return -EAGAIN;
1546 
1547 	if (req->flags & REQ_F_IMPORT_BUFFER) {
1548 		ret = io_send_zc_import(req, kmsg, issue_flags);
1549 		if (unlikely(ret))
1550 			return ret;
1551 	}
1552 
1553 	msg_flags = sr->msg_flags;
1554 	if (issue_flags & IO_URING_F_NONBLOCK)
1555 		msg_flags |= MSG_DONTWAIT;
1556 	if (msg_flags & MSG_WAITALL)
1557 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1558 
1559 	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
1560 
1561 	if (req->opcode == IORING_OP_SEND_ZC) {
1562 		msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
1563 		kmsg->msg.msg_flags = msg_flags;
1564 		ret = sock_sendmsg(sock, &kmsg->msg);
1565 	} else {
1566 		kmsg->msg.msg_control_user = sr->msg_control;
1567 		ret = __sys_sendmsg_sock(sock, &kmsg->msg, msg_flags);
1568 	}
1569 
1570 	if (unlikely(ret < min_ret)) {
1571 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1572 			return -EAGAIN;
1573 
1574 		if (ret > 0 && io_net_retry(sock, sr->msg_flags)) {
1575 			sr->done_io += ret;
1576 			return -EAGAIN;
1577 		}
1578 		if (ret == -ERESTARTSYS)
1579 			ret = -EINTR;
1580 		req_set_fail(req);
1581 	}
1582 
1583 	if (ret >= 0)
1584 		ret += sr->done_io;
1585 	else if (sr->done_io)
1586 		ret = sr->done_io;
1587 
1588 	/*
1589 	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
1590 	 * flushing notif to io_send_zc_cleanup()
1591 	 */
1592 	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1593 		io_notif_flush(sr->notif);
1594 		sr->notif = NULL;
1595 		io_req_msg_cleanup(req, 0);
1596 	}
1597 	io_req_set_res(req, ret, IORING_CQE_F_MORE);
1598 	return IOU_COMPLETE;
1599 }
1600 
1601 void io_sendrecv_fail(struct io_kiocb *req)
1602 {
1603 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1604 
1605 	if (sr->done_io)
1606 		req->cqe.res = sr->done_io;
1607 
1608 	if ((req->flags & REQ_F_NEED_CLEANUP) &&
1609 	    (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC))
1610 		req->cqe.flags |= IORING_CQE_F_MORE;
1611 }
1612 
/* every ioprio flag io_accept_prep() accepts */
#define ACCEPT_FLAGS \
	(IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \
	 IORING_ACCEPT_POLL_FIRST)
1615 
1616 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1617 {
1618 	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1619 
1620 	if (sqe->len || sqe->buf_index)
1621 		return -EINVAL;
1622 
1623 	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1624 	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
1625 	accept->flags = READ_ONCE(sqe->accept_flags);
1626 	accept->nofile = rlimit(RLIMIT_NOFILE);
1627 	accept->iou_flags = READ_ONCE(sqe->ioprio);
1628 	if (accept->iou_flags & ~ACCEPT_FLAGS)
1629 		return -EINVAL;
1630 
1631 	accept->file_slot = READ_ONCE(sqe->file_index);
1632 	if (accept->file_slot) {
1633 		if (accept->flags & SOCK_CLOEXEC)
1634 			return -EINVAL;
1635 		if (accept->iou_flags & IORING_ACCEPT_MULTISHOT &&
1636 		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
1637 			return -EINVAL;
1638 	}
1639 	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1640 		return -EINVAL;
1641 	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
1642 		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1643 	if (accept->iou_flags & IORING_ACCEPT_MULTISHOT)
1644 		req->flags |= REQ_F_APOLL_MULTISHOT;
1645 	if (accept->iou_flags & IORING_ACCEPT_DONTWAIT)
1646 		req->flags |= REQ_F_NOWAIT;
1647 	return 0;
1648 }
1649 
1650 int io_accept(struct io_kiocb *req, unsigned int issue_flags)
1651 {
1652 	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1653 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1654 	bool fixed = !!accept->file_slot;
1655 	struct proto_accept_arg arg = {
1656 		.flags = force_nonblock ? O_NONBLOCK : 0,
1657 	};
1658 	struct file *file;
1659 	unsigned cflags;
1660 	int ret, fd;
1661 
1662 	if (!(req->flags & REQ_F_POLLED) &&
1663 	    accept->iou_flags & IORING_ACCEPT_POLL_FIRST)
1664 		return -EAGAIN;
1665 
1666 retry:
1667 	if (!fixed) {
1668 		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
1669 		if (unlikely(fd < 0))
1670 			return fd;
1671 	}
1672 	arg.err = 0;
1673 	arg.is_empty = -1;
1674 	file = do_accept(req->file, &arg, accept->addr, accept->addr_len,
1675 			 accept->flags);
1676 	if (IS_ERR(file)) {
1677 		if (!fixed)
1678 			put_unused_fd(fd);
1679 		ret = PTR_ERR(file);
1680 		if (ret == -EAGAIN && force_nonblock &&
1681 		    !(accept->iou_flags & IORING_ACCEPT_DONTWAIT))
1682 			return IOU_RETRY;
1683 
1684 		if (ret == -ERESTARTSYS)
1685 			ret = -EINTR;
1686 	} else if (!fixed) {
1687 		fd_install(fd, file);
1688 		ret = fd;
1689 	} else {
1690 		ret = io_fixed_fd_install(req, issue_flags, file,
1691 						accept->file_slot);
1692 	}
1693 
1694 	cflags = 0;
1695 	if (!arg.is_empty)
1696 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
1697 
1698 	if (ret >= 0 && (req->flags & REQ_F_APOLL_MULTISHOT) &&
1699 	    io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
1700 		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1)
1701 			goto retry;
1702 		return IOU_RETRY;
1703 	}
1704 
1705 	io_req_set_res(req, ret, cflags);
1706 	if (ret < 0)
1707 		req_set_fail(req);
1708 	return IOU_COMPLETE;
1709 }
1710 
1711 void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req)
1712 {
1713 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1714 
1715 	bctx->socket.family = sock->domain;
1716 	bctx->socket.type = sock->type;
1717 	bctx->socket.protocol = sock->protocol;
1718 }
1719 
1720 void io_connect_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req)
1721 {
1722 	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
1723 	struct sockaddr_storage *ss = req->async_data;
1724 
1725 	/*
1726 	 * move_addr_to_kernel() skips the copy for addr_len == 0, so
1727 	 * iomsg->addr may hold stale data from a prior CONNECT. Bail
1728 	 * unless addr_len covers the family discriminator.
1729 	 */
1730 	if (conn->addr_len < (int)sizeof(sa_family_t))
1731 		return;
1732 
1733 	bctx->connect.family = ss->ss_family;
1734 	switch (ss->ss_family) {
1735 	case AF_INET: {
1736 		struct sockaddr_in *sin = (struct sockaddr_in *)ss;
1737 
1738 		if (conn->addr_len < (int)sizeof(*sin))
1739 			break;
1740 		bctx->connect.port = sin->sin_port;
1741 		bctx->connect.v4_addr = sin->sin_addr.s_addr;
1742 		break;
1743 	}
1744 	case AF_INET6: {
1745 		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss;
1746 
1747 		if (conn->addr_len < (int)sizeof(*sin6))
1748 			break;
1749 		bctx->connect.port = sin6->sin6_port;
1750 		memcpy(bctx->connect.v6_addr, &sin6->sin6_addr,
1751 		       sizeof(bctx->connect.v6_addr));
1752 		break;
1753 	}
1754 	default:
1755 		/* family is set; per-family fields stay zero - family-only filtering */
1756 		break;
1757 	}
1758 }
1759 
1760 int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1761 {
1762 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1763 
1764 	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
1765 		return -EINVAL;
1766 
1767 	sock->domain = READ_ONCE(sqe->fd);
1768 	sock->type = READ_ONCE(sqe->off);
1769 	sock->protocol = READ_ONCE(sqe->len);
1770 	sock->file_slot = READ_ONCE(sqe->file_index);
1771 	sock->nofile = rlimit(RLIMIT_NOFILE);
1772 
1773 	sock->flags = sock->type & ~SOCK_TYPE_MASK;
1774 	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
1775 		return -EINVAL;
1776 	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1777 		return -EINVAL;
1778 	return 0;
1779 }
1780 
1781 int io_socket(struct io_kiocb *req, unsigned int issue_flags)
1782 {
1783 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1784 	bool fixed = !!sock->file_slot;
1785 	struct file *file;
1786 	int ret, fd;
1787 
1788 	if (!fixed) {
1789 		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
1790 		if (unlikely(fd < 0))
1791 			return fd;
1792 	}
1793 	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
1794 	if (IS_ERR(file)) {
1795 		if (!fixed)
1796 			put_unused_fd(fd);
1797 		ret = PTR_ERR(file);
1798 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1799 			return -EAGAIN;
1800 		if (ret == -ERESTARTSYS)
1801 			ret = -EINTR;
1802 		req_set_fail(req);
1803 	} else if (!fixed) {
1804 		fd_install(fd, file);
1805 		ret = fd;
1806 	} else {
1807 		ret = io_fixed_fd_install(req, issue_flags, file,
1808 					    sock->file_slot);
1809 	}
1810 	io_req_set_res(req, ret, 0);
1811 	return IOU_COMPLETE;
1812 }
1813 
1814 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1815 {
1816 	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
1817 	struct sockaddr_storage *addr;
1818 
1819 	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
1820 		return -EINVAL;
1821 
1822 	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1823 	conn->addr_len =  READ_ONCE(sqe->addr2);
1824 	conn->in_progress = conn->seen_econnaborted = false;
1825 
1826 	addr = io_uring_alloc_async_data(NULL, req);
1827 	if (unlikely(!addr))
1828 		return -ENOMEM;
1829 
1830 	return move_addr_to_kernel(conn->addr, conn->addr_len, addr);
1831 }
1832 
1833 int io_connect(struct io_kiocb *req, unsigned int issue_flags)
1834 {
1835 	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
1836 	struct sockaddr_storage *addr = req->async_data;
1837 	unsigned file_flags;
1838 	int ret;
1839 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1840 
1841 	if (connect->in_progress) {
1842 		struct poll_table_struct pt = { ._key = EPOLLERR };
1843 
1844 		if (vfs_poll(req->file, &pt) & EPOLLERR)
1845 			goto get_sock_err;
1846 	}
1847 
1848 	file_flags = force_nonblock ? O_NONBLOCK : 0;
1849 
1850 	ret = __sys_connect_file(req->file, addr, connect->addr_len, file_flags);
1851 	if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
1852 	    && force_nonblock) {
1853 		if (ret == -EINPROGRESS) {
1854 			connect->in_progress = true;
1855 		} else if (ret == -ECONNABORTED) {
1856 			if (connect->seen_econnaborted)
1857 				goto out;
1858 			connect->seen_econnaborted = true;
1859 		}
1860 		return -EAGAIN;
1861 	}
1862 	if (connect->in_progress) {
1863 		/*
1864 		 * At least bluetooth will return -EBADFD on a re-connect
1865 		 * attempt, and it's (supposedly) also valid to get -EISCONN
1866 		 * which means the previous result is good. For both of these,
1867 		 * grab the sock_error() and use that for the completion.
1868 		 */
1869 		if (ret == -EBADFD || ret == -EISCONN) {
1870 get_sock_err:
1871 			ret = sock_error(sock_from_file(req->file)->sk);
1872 		}
1873 	}
1874 	if (ret == -ERESTARTSYS)
1875 		ret = -EINTR;
1876 out:
1877 	if (ret < 0)
1878 		req_set_fail(req);
1879 	io_req_set_res(req, ret, 0);
1880 	return IOU_COMPLETE;
1881 }
1882 
/*
 * Tell whether a bind request could end up in filename_create(), which in
 * turn ends up in mnt_want_write() and grabs the fs percpu start write
 * sem. This can trigger a lockdep warning.
 *
 * Returns non-zero only for an AF_UNIX address whose length extends past
 * the start of sun_path and whose sun_path does not begin with a NUL byte.
 */
static int io_bind_file_create(const struct sockaddr_storage *addr, int addr_len)
{
	const struct sockaddr_un *un_addr = (const struct sockaddr_un *) addr;

	if (addr->ss_family == AF_UNIX &&
	    addr_len > offsetof(struct sockaddr_un, sun_path))
		return un_addr->sun_path[0] != '\0';

	return 0;
}
1899 
1900 int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1901 {
1902 	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
1903 	struct sockaddr __user *uaddr;
1904 	struct sockaddr_storage *addr;
1905 	int ret;
1906 
1907 	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
1908 		return -EINVAL;
1909 
1910 	uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1911 	bind->addr_len =  READ_ONCE(sqe->addr2);
1912 
1913 	addr = io_uring_alloc_async_data(NULL, req);
1914 	if (unlikely(!addr))
1915 		return -ENOMEM;
1916 	ret = move_addr_to_kernel(uaddr, bind->addr_len, addr);
1917 	if (unlikely(ret))
1918 		return ret;
1919 	if (io_bind_file_create(addr, bind->addr_len))
1920 		req->flags |= REQ_F_FORCE_ASYNC;
1921 	return 0;
1922 }
1923 
1924 
1925 int io_bind(struct io_kiocb *req, unsigned int issue_flags)
1926 {
1927 	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
1928 	struct sockaddr_storage *addr = req->async_data;
1929 	struct socket *sock;
1930 	int ret;
1931 
1932 	sock = sock_from_file(req->file);
1933 	if (unlikely(!sock))
1934 		return -ENOTSOCK;
1935 
1936 	ret = __sys_bind_socket(sock, addr, bind->addr_len);
1937 	if (ret < 0)
1938 		req_set_fail(req);
1939 	io_req_set_res(req, ret, 0);
1940 	return 0;
1941 }
1942 
1943 int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1944 {
1945 	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
1946 
1947 	if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2)
1948 		return -EINVAL;
1949 
1950 	listen->backlog = READ_ONCE(sqe->len);
1951 	return 0;
1952 }
1953 
1954 int io_listen(struct io_kiocb *req, unsigned int issue_flags)
1955 {
1956 	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
1957 	struct socket *sock;
1958 	int ret;
1959 
1960 	sock = sock_from_file(req->file);
1961 	if (unlikely(!sock))
1962 		return -ENOTSOCK;
1963 
1964 	ret = __sys_listen_socket(sock, listen->backlog);
1965 	if (ret < 0)
1966 		req_set_fail(req);
1967 	io_req_set_res(req, ret, 0);
1968 	return 0;
1969 }
1970 
1971 void io_netmsg_cache_free(const void *entry)
1972 {
1973 	struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
1974 
1975 	io_vec_free(&kmsg->vec);
1976 	kfree(kmsg);
1977 }
1978