xref: /linux/io_uring/net.c (revision 3f1c07fc21c68bd3bd2df9d2c9441f6485e934d9)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/kernel.h>
3 #include <linux/errno.h>
4 #include <linux/file.h>
5 #include <linux/slab.h>
6 #include <linux/net.h>
7 #include <linux/compat.h>
8 #include <net/compat.h>
9 #include <linux/io_uring.h>
10 
11 #include <uapi/linux/io_uring.h>
12 
13 #include "filetable.h"
14 #include "io_uring.h"
15 #include "kbuf.h"
16 #include "alloc_cache.h"
17 #include "net.h"
18 #include "notif.h"
19 #include "rsrc.h"
20 #include "zcrx.h"
21 
22 struct io_shutdown {
23 	struct file			*file;
24 	int				how;
25 };
26 
27 struct io_accept {
28 	struct file			*file;
29 	struct sockaddr __user		*addr;
30 	int __user			*addr_len;
31 	int				flags;
32 	int				iou_flags;
33 	u32				file_slot;
34 	unsigned long			nofile;
35 };
36 
37 struct io_socket {
38 	struct file			*file;
39 	int				domain;
40 	int				type;
41 	int				protocol;
42 	int				flags;
43 	u32				file_slot;
44 	unsigned long			nofile;
45 };
46 
47 struct io_connect {
48 	struct file			*file;
49 	struct sockaddr __user		*addr;
50 	int				addr_len;
51 	bool				in_progress;
52 	bool				seen_econnaborted;
53 };
54 
55 struct io_bind {
56 	struct file			*file;
57 	int				addr_len;
58 };
59 
60 struct io_listen {
61 	struct file			*file;
62 	int				backlog;
63 };
64 
65 struct io_sr_msg {
66 	struct file			*file;
67 	union {
68 		struct compat_msghdr __user	*umsg_compat;
69 		struct user_msghdr __user	*umsg;
70 		void __user			*buf;
71 	};
72 	int				len;
73 	unsigned			done_io;
74 	unsigned			msg_flags;
75 	unsigned			nr_multishot_loops;
76 	u16				flags;
77 	/* initialised and used only by !msg send variants */
78 	u16				buf_group;
79 	/* per-invocation mshot limit */
80 	unsigned			mshot_len;
81 	/* overall mshot byte limit */
82 	unsigned			mshot_total_len;
83 	void __user			*msg_control;
84 	/* used only for send zerocopy */
85 	struct io_kiocb 		*notif;
86 };
87 
88 /*
89  * The UAPI flags are the lower 8 bits, as that's all sqe->ioprio will hold
90  * anyway. Use the upper 8 bits for internal use.
91  */
92 enum sr_retry_flags {
93 	IORING_RECV_RETRY	= (1U << 15),
94 	IORING_RECV_PARTIAL_MAP	= (1U << 14),
95 	IORING_RECV_MSHOT_CAP	= (1U << 13),
96 	IORING_RECV_MSHOT_LIM	= (1U << 12),
97 	IORING_RECV_MSHOT_DONE	= (1U << 11),
98 
99 	IORING_RECV_RETRY_CLEAR	= IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP,
100 	IORING_RECV_NO_RETRY	= IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP |
101 				  IORING_RECV_MSHOT_CAP | IORING_RECV_MSHOT_DONE,
102 };
103 
104 /*
105  * Number of times we'll try to do receives if there's more data. If we
106  * exceed this limit, then add us to the back of the queue and retry from
107  * there. This helps maintain fairness between flooding clients.
108  */
109 #define MULTISHOT_MAX_RETRY	32
110 
111 struct io_recvzc {
112 	struct file			*file;
113 	u16				flags;
114 	u32				len;
115 	struct io_zcrx_ifq		*ifq;
116 };
117 
118 static int io_sg_from_iter_iovec(struct sk_buff *skb,
119 				 struct iov_iter *from, size_t length);
120 static int io_sg_from_iter(struct sk_buff *skb,
121 			   struct iov_iter *from, size_t length);
122 
123 int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
124 {
125 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
126 
127 	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
128 		     sqe->buf_index || sqe->splice_fd_in))
129 		return -EINVAL;
130 
131 	shutdown->how = READ_ONCE(sqe->len);
132 	req->flags |= REQ_F_FORCE_ASYNC;
133 	return 0;
134 }
135 
136 int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
137 {
138 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
139 	struct socket *sock;
140 	int ret;
141 
142 	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
143 
144 	sock = sock_from_file(req->file);
145 	if (unlikely(!sock))
146 		return -ENOTSOCK;
147 
148 	ret = __sys_shutdown_sock(sock, shutdown->how);
149 	io_req_set_res(req, ret, 0);
150 	return IOU_COMPLETE;
151 }
152 
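/*
 * Whether a short transfer is worth retrying: only with MSG_WAITALL, and
 * only on stream/seqpacket sockets where the remainder can still arrive.
 */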
153 static bool io_net_retry(struct socket *sock, int flags)
154 {
155 	if (!(flags & MSG_WAITALL))
156 		return false;
157 	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
158 }
159 
160 static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
161 {
162 	if (kmsg->vec.iovec)
163 		io_vec_free(&kmsg->vec);
164 }
165 
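/*
 * Return the async msghdr to the per-ring cache when the ring is locked;
 * otherwise just free any attached iovec and leave the rest to the normal
 * request cleanup path.
 */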
166 static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
167 {
168 	struct io_async_msghdr *hdr = req->async_data;
169 
170 	/* can't recycle, ensure we free the iovec if we have one */
171 	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
172 		io_netmsg_iovec_free(hdr);
173 		return;
174 	}
175 
176 	/* Let normal cleanup path reap it if we fail adding to the cache */
177 	io_alloc_cache_vec_kasan(&hdr->vec);
178 	if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP)
179 		io_vec_free(&hdr->vec);
180 
181 	if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr))
182 		io_req_async_data_clear(req, REQ_F_NEED_CLEANUP);
183 }
184 
185 static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
186 {
187 	struct io_ring_ctx *ctx = req->ctx;
188 	struct io_async_msghdr *hdr;
189 
190 	hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req);
191 	if (!hdr)
192 		return NULL;
193 
194 	/* If the async data was cached, we might have an iov cached inside. */
195 	if (hdr->vec.iovec)
196 		req->flags |= REQ_F_NEED_CLEANUP;
197 	return hdr;
198 }
199 
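/*
 * Reset per-iteration state so a multishot or bundle request can be issued
 * again: clear the buffer-list-empty hint and the internal retry flags,
 * reset progress, and restore the original per-invocation length.
 */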
200 static inline void io_mshot_prep_retry(struct io_kiocb *req,
201 				       struct io_async_msghdr *kmsg)
202 {
203 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
204 
205 	req->flags &= ~REQ_F_BL_EMPTY;
206 	sr->done_io = 0;
207 	sr->flags &= ~IORING_RECV_RETRY_CLEAR;
208 	sr->len = sr->mshot_len;
209 }
210 
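/*
 * Import a user iovec array into the msg_iter, using the inline fast_iov
 * for the common single-segment case and the (possibly cached) vec
 * allocation otherwise.
 */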
211 static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg,
212 			     const struct iovec __user *uiov, unsigned uvec_seg,
213 			     int ddir)
214 {
215 	struct iovec *iov;
216 	int ret, nr_segs;
217 
218 	if (iomsg->vec.iovec) {
219 		nr_segs = iomsg->vec.nr;
220 		iov = iomsg->vec.iovec;
221 	} else {
222 		nr_segs = 1;
223 		iov = &iomsg->fast_iov;
224 	}
225 
226 	ret = __import_iovec(ddir, uiov, uvec_seg, nr_segs, &iov,
227 			     &iomsg->msg.msg_iter, io_is_compat(req->ctx));
228 	if (unlikely(ret < 0))
229 		return ret;
230 
231 	if (iov) {
232 		req->flags |= REQ_F_NEED_CLEANUP;
233 		io_vec_reset_iovec(&iomsg->vec, iov, iomsg->msg.msg_iter.nr_segs);
234 	}
235 	return 0;
236 }
237 
238 static int io_compat_msg_copy_hdr(struct io_kiocb *req,
239 				  struct io_async_msghdr *iomsg,
240 				  struct compat_msghdr *msg, int ddir,
241 				  struct sockaddr __user **save_addr)
242 {
243 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
244 	struct compat_iovec __user *uiov;
245 	int ret;
246 
247 	if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
248 		return -EFAULT;
249 
250 	ret = __get_compat_msghdr(&iomsg->msg, msg, save_addr);
251 	if (ret)
252 		return ret;
253 
254 	uiov = compat_ptr(msg->msg_iov);
255 	if (req->flags & REQ_F_BUFFER_SELECT) {
256 		if (msg->msg_iovlen == 0) {
257 			sr->len = 0;
258 		} else if (msg->msg_iovlen > 1) {
259 			return -EINVAL;
260 		} else {
261 			struct compat_iovec tmp_iov;
262 
263 			if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov)))
264 				return -EFAULT;
265 			sr->len = tmp_iov.iov_len;
266 		}
267 	}
268 	return 0;
269 }
270 
271 static int io_copy_msghdr_from_user(struct user_msghdr *msg,
272 				    struct user_msghdr __user *umsg)
273 {
274 	if (!user_access_begin(umsg, sizeof(*umsg)))
275 		return -EFAULT;
276 	unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end);
277 	unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end);
278 	unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end);
279 	unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end);
280 	unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end);
281 	unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end);
282 	user_access_end();
283 	return 0;
284 ua_end:
285 	user_access_end();
286 	return -EFAULT;
287 }
288 
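/*
 * Copy the user_msghdr (native or compat) and prime iomsg->msg. With
 * provided buffers, at most one iovec is allowed and its length seeds
 * sr->len; the buffer itself is picked at issue time.
 */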
289 static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
290 			   struct user_msghdr *msg, int ddir,
291 			   struct sockaddr __user **save_addr)
292 {
293 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
294 	struct user_msghdr __user *umsg = sr->umsg;
295 	int ret;
296 
297 	iomsg->msg.msg_name = &iomsg->addr;
298 	iomsg->msg.msg_iter.nr_segs = 0;
299 
300 	if (io_is_compat(req->ctx)) {
301 		struct compat_msghdr cmsg;
302 
303 		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ddir, save_addr);
304 		if (ret)
305 			return ret;
306 
307 		memset(msg, 0, sizeof(*msg));
308 		msg->msg_namelen = cmsg.msg_namelen;
309 		msg->msg_controllen = cmsg.msg_controllen;
310 		msg->msg_iov = compat_ptr(cmsg.msg_iov);
311 		msg->msg_iovlen = cmsg.msg_iovlen;
312 		return 0;
313 	}
314 
315 	ret = io_copy_msghdr_from_user(msg, umsg);
316 	if (unlikely(ret))
317 		return ret;
318 
319 	msg->msg_flags = 0;
320 
321 	ret = __copy_msghdr(&iomsg->msg, msg, save_addr);
322 	if (ret)
323 		return ret;
324 
325 	if (req->flags & REQ_F_BUFFER_SELECT) {
326 		if (msg->msg_iovlen == 0) {
327 			sr->len = 0;
328 		} else if (msg->msg_iovlen > 1) {
329 			return -EINVAL;
330 		} else {
331 			struct iovec __user *uiov = msg->msg_iov;
332 			struct iovec tmp_iov;
333 
334 			if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov)))
335 				return -EFAULT;
336 			sr->len = tmp_iov.iov_len;
337 		}
338 	}
339 	return 0;
340 }
341 
342 void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
343 {
344 	struct io_async_msghdr *io = req->async_data;
345 
346 	io_netmsg_iovec_free(io);
347 }
348 
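/*
 * Setup for the non-msghdr send variants: copy in an optional destination
 * address and map the source buffer now, unless a fixed or provided buffer
 * defers the import to issue time.
 */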
349 static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
350 {
351 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
352 	struct io_async_msghdr *kmsg = req->async_data;
353 	void __user *addr;
354 	u16 addr_len;
355 	int ret;
356 
357 	sr->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
358 
359 	if (READ_ONCE(sqe->__pad3[0]))
360 		return -EINVAL;
361 
362 	kmsg->msg.msg_name = NULL;
363 	kmsg->msg.msg_namelen = 0;
364 	kmsg->msg.msg_control = NULL;
365 	kmsg->msg.msg_controllen = 0;
366 	kmsg->msg.msg_ubuf = NULL;
367 
368 	addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
369 	addr_len = READ_ONCE(sqe->addr_len);
370 	if (addr) {
371 		ret = move_addr_to_kernel(addr, addr_len, &kmsg->addr);
372 		if (unlikely(ret < 0))
373 			return ret;
374 		kmsg->msg.msg_name = &kmsg->addr;
375 		kmsg->msg.msg_namelen = addr_len;
376 	}
377 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
378 		req->flags |= REQ_F_IMPORT_BUFFER;
379 		return 0;
380 	}
381 	if (req->flags & REQ_F_BUFFER_SELECT)
382 		return 0;
383 
384 	if (sr->flags & IORING_SEND_VECTORIZED)
385 		return io_net_import_vec(req, kmsg, sr->buf, sr->len, ITER_SOURCE);
386 
387 	return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
388 }
389 
390 static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
391 {
392 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
393 	struct io_async_msghdr *kmsg = req->async_data;
394 	struct user_msghdr msg;
395 	int ret;
396 
397 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
398 	ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL);
399 	if (unlikely(ret))
400 		return ret;
401 	/* save msg_control as sys_sendmsg() overwrites it */
402 	sr->msg_control = kmsg->msg.msg_control_user;
403 
404 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
405 		kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen;
406 		return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov,
407 					 msg.msg_iovlen);
408 	}
409 	if (req->flags & REQ_F_BUFFER_SELECT)
410 		return 0;
411 	return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE);
412 }
413 
414 #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE | IORING_SEND_VECTORIZED)
415 
416 int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
417 {
418 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
419 
420 	sr->done_io = 0;
421 	sr->len = READ_ONCE(sqe->len);
422 	sr->flags = READ_ONCE(sqe->ioprio);
423 	if (sr->flags & ~SENDMSG_FLAGS)
424 		return -EINVAL;
425 	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
426 	if (sr->msg_flags & MSG_DONTWAIT)
427 		req->flags |= REQ_F_NOWAIT;
428 	if (req->flags & REQ_F_BUFFER_SELECT)
429 		sr->buf_group = req->buf_index;
430 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
431 		if (req->opcode == IORING_OP_SENDMSG)
432 			return -EINVAL;
433 		sr->msg_flags |= MSG_WAITALL;
434 		req->flags |= REQ_F_MULTISHOT;
435 	}
436 
437 	if (io_is_compat(req->ctx))
438 		sr->msg_flags |= MSG_CMSG_COMPAT;
439 
440 	if (unlikely(!io_msg_alloc_async(req)))
441 		return -ENOMEM;
442 	if (req->opcode != IORING_OP_SENDMSG)
443 		return io_send_setup(req, sqe);
444 	if (unlikely(sqe->addr2 || sqe->file_index))
445 		return -EINVAL;
446 	return io_sendmsg_setup(req, sqe);
447 }
448 
449 static void io_req_msg_cleanup(struct io_kiocb *req,
450 			       unsigned int issue_flags)
451 {
452 	io_netmsg_recycle(req, issue_flags);
453 }
454 
455 /*
456  * For bundle completions, we need to figure out how many segments we consumed.
457  * A bundle could be using a single ITER_UBUF if that's all we mapped, or it
458  * could be using an ITER_IOVEC. If the latter, and we consumed all of the
459  * segments, then it's a trivial question to answer. If we have residual
460  * data in the iter, then loop over the segments to figure out how much we
461  * transferred.
462  */
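/*
 * As a rough example: three 4k iovecs and a 6k transfer leaves residual
 * data, so the loop below counts iov[0] in full and iov[1] partially and
 * reports two buffers consumed.
 */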
463 static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
464 {
465 	struct iovec *iov;
466 	int nbufs;
467 
468 	/* no data is always zero segments, and a ubuf is always 1 segment */
469 	if (ret <= 0)
470 		return 0;
471 	if (iter_is_ubuf(&kmsg->msg.msg_iter))
472 		return 1;
473 
474 	iov = kmsg->vec.iovec;
475 	if (!iov)
476 		iov = &kmsg->fast_iov;
477 
478 	/* if all data was transferred, it's basic pointer math */
479 	if (!iov_iter_count(&kmsg->msg.msg_iter))
480 		return iter_iov(&kmsg->msg.msg_iter) - iov;
481 
482 	/* short transfer, count segments */
483 	nbufs = 0;
484 	do {
485 		int this_len = min_t(int, iov[nbufs].iov_len, ret);
486 
487 		nbufs++;
488 		ret -= this_len;
489 	} while (ret);
490 
491 	return nbufs;
492 }
493 
494 static int io_net_kbuf_recyle(struct io_kiocb *req, struct io_buffer_list *bl,
495 			      struct io_async_msghdr *kmsg, int len)
496 {
497 	req->flags |= REQ_F_BL_NO_RECYCLE;
498 	if (req->flags & REQ_F_BUFFERS_COMMIT)
499 		io_kbuf_commit(req, bl, len, io_bundle_nbufs(kmsg, len));
500 	return IOU_RETRY;
501 }
502 
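/*
 * Complete a send. For bundles, post an aux CQE and return false to ask the
 * caller to loop and send again while provided buffers remain.
 */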
503 static inline bool io_send_finish(struct io_kiocb *req,
504 				  struct io_async_msghdr *kmsg,
505 				  struct io_br_sel *sel)
506 {
507 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
508 	bool bundle_finished = sel->val <= 0;
509 	unsigned int cflags;
510 
511 	if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
512 		cflags = io_put_kbuf(req, sel->val, sel->buf_list);
513 		goto finish;
514 	}
515 
516 	cflags = io_put_kbufs(req, sel->val, sel->buf_list, io_bundle_nbufs(kmsg, sel->val));
517 
518 	if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
519 		goto finish;
520 
521 	/*
522 	 * Fill CQE for this send and see if we should keep trying to
523 	 * send on this socket.
524 	 */
525 	if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) {
526 		io_mshot_prep_retry(req, kmsg);
527 		return false;
528 	}
529 
530 	/* Otherwise stop bundle and use the current result. */
531 finish:
532 	io_req_set_res(req, sel->val, cflags);
533 	sel->val = IOU_COMPLETE;
534 	return true;
535 }
536 
537 int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
538 {
539 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
540 	struct io_async_msghdr *kmsg = req->async_data;
541 	struct socket *sock;
542 	unsigned flags;
543 	int min_ret = 0;
544 	int ret;
545 
546 	sock = sock_from_file(req->file);
547 	if (unlikely(!sock))
548 		return -ENOTSOCK;
549 
550 	if (!(req->flags & REQ_F_POLLED) &&
551 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
552 		return -EAGAIN;
553 
554 	flags = sr->msg_flags;
555 	if (issue_flags & IO_URING_F_NONBLOCK)
556 		flags |= MSG_DONTWAIT;
557 	if (flags & MSG_WAITALL)
558 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
559 
560 	kmsg->msg.msg_control_user = sr->msg_control;
561 
562 	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
563 
564 	if (ret < min_ret) {
565 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
566 			return -EAGAIN;
567 		if (ret > 0 && io_net_retry(sock, flags)) {
568 			kmsg->msg.msg_controllen = 0;
569 			kmsg->msg.msg_control = NULL;
570 			sr->done_io += ret;
571 			return -EAGAIN;
572 		}
573 		if (ret == -ERESTARTSYS)
574 			ret = -EINTR;
575 		req_set_fail(req);
576 	}
577 	io_req_msg_cleanup(req, issue_flags);
578 	if (ret >= 0)
579 		ret += sr->done_io;
580 	else if (sr->done_io)
581 		ret = sr->done_io;
582 	io_req_set_res(req, ret, 0);
583 	return IOU_COMPLETE;
584 }
585 
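/*
 * Pick provided buffer(s) for a plain send: a single selected buffer maps
 * to an ITER_UBUF, a bundle selection builds an ITER_IOVEC over all of the
 * picked buffers.
 */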
586 static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
587 				 struct io_br_sel *sel, struct io_async_msghdr *kmsg)
588 {
589 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
590 	struct buf_sel_arg arg = {
591 		.iovs = &kmsg->fast_iov,
592 		.max_len = min_not_zero(sr->len, INT_MAX),
593 		.nr_iovs = 1,
594 		.buf_group = sr->buf_group,
595 	};
596 	int ret;
597 
598 	if (kmsg->vec.iovec) {
599 		arg.nr_iovs = kmsg->vec.nr;
600 		arg.iovs = kmsg->vec.iovec;
601 		arg.mode = KBUF_MODE_FREE;
602 	}
603 
604 	if (!(sr->flags & IORING_RECVSEND_BUNDLE))
605 		arg.nr_iovs = 1;
606 	else
607 		arg.mode |= KBUF_MODE_EXPAND;
608 
609 	ret = io_buffers_select(req, &arg, sel, issue_flags);
610 	if (unlikely(ret < 0))
611 		return ret;
612 
613 	if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
614 		kmsg->vec.nr = ret;
615 		kmsg->vec.iovec = arg.iovs;
616 		req->flags |= REQ_F_NEED_CLEANUP;
617 	}
618 	sr->len = arg.out_len;
619 
620 	if (ret == 1) {
621 		sr->buf = arg.iovs[0].iov_base;
622 		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
623 					&kmsg->msg.msg_iter);
624 		if (unlikely(ret))
625 			return ret;
626 	} else {
627 		iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE,
628 				arg.iovs, ret, arg.out_len);
629 	}
630 
631 	return 0;
632 }
633 
634 int io_send(struct io_kiocb *req, unsigned int issue_flags)
635 {
636 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
637 	struct io_async_msghdr *kmsg = req->async_data;
638 	struct io_br_sel sel = { };
639 	struct socket *sock;
640 	unsigned flags;
641 	int min_ret = 0;
642 	int ret;
643 
644 	sock = sock_from_file(req->file);
645 	if (unlikely(!sock))
646 		return -ENOTSOCK;
647 
648 	if (!(req->flags & REQ_F_POLLED) &&
649 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
650 		return -EAGAIN;
651 
652 	flags = sr->msg_flags;
653 	if (issue_flags & IO_URING_F_NONBLOCK)
654 		flags |= MSG_DONTWAIT;
655 
656 retry_bundle:
657 	sel.buf_list = NULL;
658 	if (io_do_buffer_select(req)) {
659 		ret = io_send_select_buffer(req, issue_flags, &sel, kmsg);
660 		if (ret)
661 			return ret;
662 	}
663 
664 	/*
665 	 * If MSG_WAITALL is set, or this is a bundle send, then we need
666 	 * the full amount. If just the bundle flag is set and we do a short
667 	 * send, then we complete the bundle sequence rather than continue on.
668 	 */
669 	if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE)
670 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
671 
672 	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
673 	kmsg->msg.msg_flags = flags;
674 	ret = sock_sendmsg(sock, &kmsg->msg);
675 	if (ret < min_ret) {
676 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
677 			return -EAGAIN;
678 
679 		if (ret > 0 && io_net_retry(sock, flags)) {
680 			sr->len -= ret;
681 			sr->buf += ret;
682 			sr->done_io += ret;
683 			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
684 		}
685 		if (ret == -ERESTARTSYS)
686 			ret = -EINTR;
687 		req_set_fail(req);
688 	}
689 	if (ret >= 0)
690 		ret += sr->done_io;
691 	else if (sr->done_io)
692 		ret = sr->done_io;
693 
694 	sel.val = ret;
695 	if (!io_send_finish(req, kmsg, &sel))
696 		goto retry_bundle;
697 
698 	io_req_msg_cleanup(req, issue_flags);
699 	return sel.val;
700 }
701 
702 static int io_recvmsg_mshot_prep(struct io_kiocb *req,
703 				 struct io_async_msghdr *iomsg,
704 				 int namelen, size_t controllen)
705 {
706 	if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) ==
707 			  (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) {
708 		int hdr;
709 
710 		if (unlikely(namelen < 0))
711 			return -EOVERFLOW;
712 		if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
713 					namelen, &hdr))
714 			return -EOVERFLOW;
715 		if (check_add_overflow(hdr, controllen, &hdr))
716 			return -EOVERFLOW;
717 
718 		iomsg->namelen = namelen;
719 		iomsg->controllen = controllen;
720 		return 0;
721 	}
722 
723 	return 0;
724 }
725 
726 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
727 			       struct io_async_msghdr *iomsg)
728 {
729 	struct user_msghdr msg;
730 	int ret;
731 
732 	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
733 	if (unlikely(ret))
734 		return ret;
735 
736 	if (!(req->flags & REQ_F_BUFFER_SELECT)) {
737 		ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
738 					ITER_DEST);
739 		if (unlikely(ret))
740 			return ret;
741 	}
742 	return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
743 					msg.msg_controllen);
744 }
745 
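/*
 * Allocate the async msghdr. IORING_OP_RECV only needs a plain ubuf-backed
 * iter (or a provided buffer picked later); the msghdr variants copy the
 * full header from userspace here.
 */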
746 static int io_recvmsg_prep_setup(struct io_kiocb *req)
747 {
748 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
749 	struct io_async_msghdr *kmsg;
750 
751 	kmsg = io_msg_alloc_async(req);
752 	if (unlikely(!kmsg))
753 		return -ENOMEM;
754 
755 	if (req->opcode == IORING_OP_RECV) {
756 		kmsg->msg.msg_name = NULL;
757 		kmsg->msg.msg_namelen = 0;
758 		kmsg->msg.msg_inq = 0;
759 		kmsg->msg.msg_control = NULL;
760 		kmsg->msg.msg_get_inq = 1;
761 		kmsg->msg.msg_controllen = 0;
762 		kmsg->msg.msg_iocb = NULL;
763 		kmsg->msg.msg_ubuf = NULL;
764 
765 		if (req->flags & REQ_F_BUFFER_SELECT)
766 			return 0;
767 		return import_ubuf(ITER_DEST, sr->buf, sr->len,
768 				   &kmsg->msg.msg_iter);
769 	}
770 
771 	return io_recvmsg_copy_hdr(req, kmsg);
772 }
773 
774 #define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \
775 			IORING_RECVSEND_BUNDLE)
776 
777 int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
778 {
779 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
780 
781 	sr->done_io = 0;
782 
783 	if (unlikely(sqe->addr2))
784 		return -EINVAL;
785 
786 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
787 	sr->len = READ_ONCE(sqe->len);
788 	sr->flags = READ_ONCE(sqe->ioprio);
789 	if (sr->flags & ~RECVMSG_FLAGS)
790 		return -EINVAL;
791 	sr->msg_flags = READ_ONCE(sqe->msg_flags);
792 	if (sr->msg_flags & MSG_DONTWAIT)
793 		req->flags |= REQ_F_NOWAIT;
794 	if (sr->msg_flags & MSG_ERRQUEUE)
795 		req->flags |= REQ_F_CLEAR_POLLIN;
796 	if (req->flags & REQ_F_BUFFER_SELECT)
797 		sr->buf_group = req->buf_index;
798 	sr->mshot_total_len = sr->mshot_len = 0;
799 	if (sr->flags & IORING_RECV_MULTISHOT) {
800 		if (!(req->flags & REQ_F_BUFFER_SELECT))
801 			return -EINVAL;
802 		if (sr->msg_flags & MSG_WAITALL)
803 			return -EINVAL;
804 		if (req->opcode == IORING_OP_RECV) {
805 			sr->mshot_len = sr->len;
806 			sr->mshot_total_len = READ_ONCE(sqe->optlen);
807 			if (sr->mshot_total_len)
808 				sr->flags |= IORING_RECV_MSHOT_LIM;
809 		} else if (sqe->optlen) {
810 			return -EINVAL;
811 		}
812 		req->flags |= REQ_F_APOLL_MULTISHOT;
813 	} else if (sqe->optlen) {
814 		return -EINVAL;
815 	}
816 
817 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
818 		if (req->opcode == IORING_OP_RECVMSG)
819 			return -EINVAL;
820 	}
821 
822 	if (io_is_compat(req->ctx))
823 		sr->msg_flags |= MSG_CMSG_COMPAT;
824 
825 	sr->nr_multishot_loops = 0;
826 	return io_recvmsg_prep_setup(req);
827 }
828 
829 /* bits to clear in old and inherit in new cflags on bundle retry */
830 #define CQE_F_MASK	(IORING_CQE_F_SOCK_NONEMPTY|IORING_CQE_F_MORE)
831 
832 /*
833  * Finishes io_recv and io_recvmsg.
834  *
835  * Returns true if it is actually finished, or false if it should run
836  * again (for multishot).
837  */
838 static inline bool io_recv_finish(struct io_kiocb *req,
839 				  struct io_async_msghdr *kmsg,
840 				  struct io_br_sel *sel, bool mshot_finished,
841 				  unsigned issue_flags)
842 {
843 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
844 	unsigned int cflags = 0;
845 
846 	if (kmsg->msg.msg_inq > 0)
847 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
848 
849 	if (sel->val > 0 && sr->flags & IORING_RECV_MSHOT_LIM) {
850 		/*
851 		 * If mshot_total_len hits zero, the limit has been reached. Mark
852 		 * mshot as finished, and flag MSHOT_DONE as well to prevent
853 		 * a potential bundle from being retried.
854 		 */
855 		sr->mshot_total_len -= min_t(int, sel->val, sr->mshot_total_len);
856 		if (!sr->mshot_total_len) {
857 			sr->flags |= IORING_RECV_MSHOT_DONE;
858 			mshot_finished = true;
859 		}
860 	}
861 
862 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
863 		size_t this_ret = sel->val - sr->done_io;
864 
865 		cflags |= io_put_kbufs(req, this_ret, sel->buf_list, io_bundle_nbufs(kmsg, this_ret));
866 		if (sr->flags & IORING_RECV_RETRY)
867 			cflags = req->cqe.flags | (cflags & CQE_F_MASK);
868 		if (sr->mshot_len && sel->val >= sr->mshot_len)
869 			sr->flags |= IORING_RECV_MSHOT_CAP;
870 		/* bundle with no more immediate buffers, we're done */
871 		if (req->flags & REQ_F_BL_EMPTY)
872 			goto finish;
873 		/*
874 		 * If more is available AND it was a full transfer, retry and
875 		 * append to this one
876 		 */
877 		if (!(sr->flags & IORING_RECV_NO_RETRY) &&
878 		    kmsg->msg.msg_inq > 1 && this_ret > 0 &&
879 		    !iov_iter_count(&kmsg->msg.msg_iter)) {
880 			req->cqe.flags = cflags & ~CQE_F_MASK;
881 			sr->len = kmsg->msg.msg_inq;
882 			sr->done_io += this_ret;
883 			sr->flags |= IORING_RECV_RETRY;
884 			return false;
885 		}
886 	} else {
887 		cflags |= io_put_kbuf(req, sel->val, sel->buf_list);
888 	}
889 
890 	/*
891 	 * Fill CQE for this receive and see if we should keep trying to
892 	 * receive from this socket.
893 	 */
894 	if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
895 	    io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) {
896 		sel->val = IOU_RETRY;
897 		io_mshot_prep_retry(req, kmsg);
898 		/* Known not-empty or unknown state, retry */
899 		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
900 			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY &&
901 			    !(sr->flags & IORING_RECV_MSHOT_CAP)) {
902 				return false;
903 			}
904 			/* mshot retries exceeded, force a requeue */
905 			sr->nr_multishot_loops = 0;
906 			sr->flags &= ~IORING_RECV_MSHOT_CAP;
907 			if (issue_flags & IO_URING_F_MULTISHOT)
908 				sel->val = IOU_REQUEUE;
909 		}
910 		return true;
911 	}
912 
913 	/* Finish the request / stop multishot. */
914 finish:
915 	io_req_set_res(req, sel->val, cflags);
916 	sel->val = IOU_COMPLETE;
917 	io_req_msg_cleanup(req, issue_flags);
918 	return true;
919 }
920 
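/*
 * Multishot recvmsg lays the selected buffer out as
 * [io_uring_recvmsg_out][name][control][payload]; reserve the header space
 * and advance the user pointer to where the payload should land.
 */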
921 static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
922 				     struct io_sr_msg *sr, void __user **buf,
923 				     size_t *len)
924 {
925 	unsigned long ubuf = (unsigned long) *buf;
926 	unsigned long hdr;
927 
928 	hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
929 		kmsg->controllen;
930 	if (*len < hdr)
931 		return -EFAULT;
932 
933 	if (kmsg->controllen) {
934 		unsigned long control = ubuf + hdr - kmsg->controllen;
935 
936 		kmsg->msg.msg_control_user = (void __user *) control;
937 		kmsg->msg.msg_controllen = kmsg->controllen;
938 	}
939 
940 	sr->buf = *buf; /* stash for later copy */
941 	*buf = (void __user *) (ubuf + hdr);
942 	kmsg->payloadlen = *len = *len - hdr;
943 	return 0;
944 }
945 
946 struct io_recvmsg_multishot_hdr {
947 	struct io_uring_recvmsg_out msg;
948 	struct sockaddr_storage addr;
949 };
950 
951 static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
952 				struct io_async_msghdr *kmsg,
953 				unsigned int flags, bool *finished)
954 {
955 	int err;
956 	int copy_len;
957 	struct io_recvmsg_multishot_hdr hdr;
958 
959 	if (kmsg->namelen)
960 		kmsg->msg.msg_name = &hdr.addr;
961 	kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
962 	kmsg->msg.msg_namelen = 0;
963 
964 	if (sock->file->f_flags & O_NONBLOCK)
965 		flags |= MSG_DONTWAIT;
966 
967 	err = sock_recvmsg(sock, &kmsg->msg, flags);
968 	*finished = err <= 0;
969 	if (err < 0)
970 		return err;
971 
972 	hdr.msg = (struct io_uring_recvmsg_out) {
973 		.controllen = kmsg->controllen - kmsg->msg.msg_controllen,
974 		.flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
975 	};
976 
977 	hdr.msg.payloadlen = err;
978 	if (err > kmsg->payloadlen)
979 		err = kmsg->payloadlen;
980 
981 	copy_len = sizeof(struct io_uring_recvmsg_out);
982 	if (kmsg->msg.msg_namelen > kmsg->namelen)
983 		copy_len += kmsg->namelen;
984 	else
985 		copy_len += kmsg->msg.msg_namelen;
986 
987 	/*
988 	 *      "fromlen shall refer to the value before truncation.."
989 	 *                      1003.1g
990 	 */
991 	hdr.msg.namelen = kmsg->msg.msg_namelen;
992 
993 	/* ensure that there is no gap between hdr and sockaddr_storage */
994 	BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
995 		     sizeof(struct io_uring_recvmsg_out));
996 	if (copy_to_user(io->buf, &hdr, copy_len)) {
997 		*finished = true;
998 		return -EFAULT;
999 	}
1000 
1001 	return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
1002 			kmsg->controllen + err;
1003 }
1004 
1005 int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
1006 {
1007 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1008 	struct io_async_msghdr *kmsg = req->async_data;
1009 	struct io_br_sel sel = { };
1010 	struct socket *sock;
1011 	unsigned flags;
1012 	int ret, min_ret = 0;
1013 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1014 	bool mshot_finished = true;
1015 
1016 	sock = sock_from_file(req->file);
1017 	if (unlikely(!sock))
1018 		return -ENOTSOCK;
1019 
1020 	if (!(req->flags & REQ_F_POLLED) &&
1021 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1022 		return -EAGAIN;
1023 
1024 	flags = sr->msg_flags;
1025 	if (force_nonblock)
1026 		flags |= MSG_DONTWAIT;
1027 
1028 retry_multishot:
1029 	sel.buf_list = NULL;
1030 	if (io_do_buffer_select(req)) {
1031 		size_t len = sr->len;
1032 
1033 		sel = io_buffer_select(req, &len, sr->buf_group, issue_flags);
1034 		if (!sel.addr)
1035 			return -ENOBUFS;
1036 
1037 		if (req->flags & REQ_F_APOLL_MULTISHOT) {
1038 			ret = io_recvmsg_prep_multishot(kmsg, sr, &sel.addr, &len);
1039 			if (ret) {
1040 				io_kbuf_recycle(req, sel.buf_list, issue_flags);
1041 				return ret;
1042 			}
1043 		}
1044 
1045 		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, sel.addr, len);
1046 	}
1047 
1048 	kmsg->msg.msg_get_inq = 1;
1049 	kmsg->msg.msg_inq = -1;
1050 	if (req->flags & REQ_F_APOLL_MULTISHOT) {
1051 		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
1052 					   &mshot_finished);
1053 	} else {
1054 		/* disable partial retry for recvmsg with cmsg attached */
1055 		if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
1056 			min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1057 
1058 		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
1059 					 kmsg->uaddr, flags);
1060 	}
1061 
1062 	if (ret < min_ret) {
1063 		if (ret == -EAGAIN && force_nonblock) {
1064 			io_kbuf_recycle(req, sel.buf_list, issue_flags);
1065 			return IOU_RETRY;
1066 		}
1067 		if (ret > 0 && io_net_retry(sock, flags)) {
1068 			sr->done_io += ret;
1069 			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
1070 		}
1071 		if (ret == -ERESTARTSYS)
1072 			ret = -EINTR;
1073 		req_set_fail(req);
1074 	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
1075 		req_set_fail(req);
1076 	}
1077 
1078 	if (ret > 0)
1079 		ret += sr->done_io;
1080 	else if (sr->done_io)
1081 		ret = sr->done_io;
1082 	else
1083 		io_kbuf_recycle(req, sel.buf_list, issue_flags);
1084 
1085 	sel.val = ret;
1086 	if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags))
1087 		goto retry_multishot;
1088 
1089 	return sel.val;
1090 }
1091 
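/*
 * Select receive buffer(s). A bundle may peek several provided buffers at
 * once, bounded by msg_inq and any multishot byte limit; the plain path
 * picks a single buffer and maps it as an ITER_UBUF.
 */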
1092 static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
1093 			      struct io_br_sel *sel, unsigned int issue_flags)
1094 {
1095 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1096 	int ret;
1097 
1098 	/*
1099 	 * If the ring isn't locked, then don't use the peek interface
1100 	 * to grab multiple buffers as we will lock/unlock between
1101 	 * this selection and posting the buffers.
1102 	 */
1103 	if (!(issue_flags & IO_URING_F_UNLOCKED) &&
1104 	    sr->flags & IORING_RECVSEND_BUNDLE) {
1105 		struct buf_sel_arg arg = {
1106 			.iovs = &kmsg->fast_iov,
1107 			.nr_iovs = 1,
1108 			.mode = KBUF_MODE_EXPAND,
1109 			.buf_group = sr->buf_group,
1110 		};
1111 
1112 		if (kmsg->vec.iovec) {
1113 			arg.nr_iovs = kmsg->vec.nr;
1114 			arg.iovs = kmsg->vec.iovec;
1115 			arg.mode |= KBUF_MODE_FREE;
1116 		}
1117 
1118 		if (sel->val)
1119 			arg.max_len = sel->val;
1120 		else if (kmsg->msg.msg_inq > 1)
1121 			arg.max_len = min_not_zero(sel->val, (ssize_t) kmsg->msg.msg_inq);
1122 
1123 		/* if mshot limited, ensure we don't go over */
1124 		if (sr->flags & IORING_RECV_MSHOT_LIM)
1125 			arg.max_len = min_not_zero(arg.max_len, sr->mshot_total_len);
1126 		ret = io_buffers_peek(req, &arg, sel);
1127 		if (unlikely(ret < 0))
1128 			return ret;
1129 
1130 		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
1131 			kmsg->vec.nr = ret;
1132 			kmsg->vec.iovec = arg.iovs;
1133 			req->flags |= REQ_F_NEED_CLEANUP;
1134 		}
1135 		if (arg.partial_map)
1136 			sr->flags |= IORING_RECV_PARTIAL_MAP;
1137 
1138 		/* special case 1 vec, can be a fast path */
1139 		if (ret == 1) {
1140 			sr->buf = arg.iovs[0].iov_base;
1141 			sr->len = arg.iovs[0].iov_len;
1142 			goto map_ubuf;
1143 		}
1144 		iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
1145 				arg.out_len);
1146 	} else {
1147 		size_t len = sel->val;
1148 
1149 		*sel = io_buffer_select(req, &len, sr->buf_group, issue_flags);
1150 		if (!sel->addr)
1151 			return -ENOBUFS;
1152 		sr->buf = sel->addr;
1153 		sr->len = len;
1154 map_ubuf:
1155 		ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
1156 				  &kmsg->msg.msg_iter);
1157 		if (unlikely(ret))
1158 			return ret;
1159 	}
1160 
1161 	return 0;
1162 }
1163 
1164 int io_recv(struct io_kiocb *req, unsigned int issue_flags)
1165 {
1166 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1167 	struct io_async_msghdr *kmsg = req->async_data;
1168 	struct io_br_sel sel;
1169 	struct socket *sock;
1170 	unsigned flags;
1171 	int ret, min_ret = 0;
1172 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1173 	bool mshot_finished;
1174 
1175 	if (!(req->flags & REQ_F_POLLED) &&
1176 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1177 		return -EAGAIN;
1178 
1179 	sock = sock_from_file(req->file);
1180 	if (unlikely(!sock))
1181 		return -ENOTSOCK;
1182 
1183 	flags = sr->msg_flags;
1184 	if (force_nonblock)
1185 		flags |= MSG_DONTWAIT;
1186 
1187 retry_multishot:
1188 	sel.buf_list = NULL;
1189 	if (io_do_buffer_select(req)) {
1190 		sel.val = sr->len;
1191 		ret = io_recv_buf_select(req, kmsg, &sel, issue_flags);
1192 		if (unlikely(ret < 0)) {
1193 			kmsg->msg.msg_inq = -1;
1194 			goto out_free;
1195 		}
1196 		sr->buf = NULL;
1197 	}
1198 
1199 	kmsg->msg.msg_flags = 0;
1200 	kmsg->msg.msg_inq = -1;
1201 
1202 	if (flags & MSG_WAITALL)
1203 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1204 
1205 	ret = sock_recvmsg(sock, &kmsg->msg, flags);
1206 	if (ret < min_ret) {
1207 		if (ret == -EAGAIN && force_nonblock) {
1208 			io_kbuf_recycle(req, sel.buf_list, issue_flags);
1209 			return IOU_RETRY;
1210 		}
1211 		if (ret > 0 && io_net_retry(sock, flags)) {
1212 			sr->len -= ret;
1213 			sr->buf += ret;
1214 			sr->done_io += ret;
1215 			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
1216 		}
1217 		if (ret == -ERESTARTSYS)
1218 			ret = -EINTR;
1219 		req_set_fail(req);
1220 	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
1221 out_free:
1222 		req_set_fail(req);
1223 	}
1224 
1225 	mshot_finished = ret <= 0;
1226 	if (ret > 0)
1227 		ret += sr->done_io;
1228 	else if (sr->done_io)
1229 		ret = sr->done_io;
1230 	else
1231 		io_kbuf_recycle(req, sel.buf_list, issue_flags);
1232 
1233 	sel.val = ret;
1234 	if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags))
1235 		goto retry_multishot;
1236 
1237 	return sel.val;
1238 }
1239 
1240 int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1241 {
1242 	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
1243 	unsigned ifq_idx;
1244 
1245 	if (unlikely(sqe->addr2 || sqe->addr || sqe->addr3))
1246 		return -EINVAL;
1247 
1248 	ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx);
1249 	zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx);
1250 	if (!zc->ifq)
1251 		return -EINVAL;
1252 
1253 	zc->len = READ_ONCE(sqe->len);
1254 	zc->flags = READ_ONCE(sqe->ioprio);
1255 	if (READ_ONCE(sqe->msg_flags))
1256 		return -EINVAL;
1257 	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT))
1258 		return -EINVAL;
1259 	/* multishot required */
1260 	if (!(zc->flags & IORING_RECV_MULTISHOT))
1261 		return -EINVAL;
1262 	/* All data completions are posted as aux CQEs. */
1263 	req->flags |= REQ_F_APOLL_MULTISHOT;
1264 
1265 	return 0;
1266 }
1267 
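/*
 * Zero-copy receive into a registered ifq. Data completions are posted as
 * aux CQEs; the request itself only completes on error or once the optional
 * length cap is exhausted.
 */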
1268 int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
1269 {
1270 	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
1271 	struct socket *sock;
1272 	unsigned int len;
1273 	int ret;
1274 
1275 	if (!(req->flags & REQ_F_POLLED) &&
1276 	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
1277 		return -EAGAIN;
1278 
1279 	sock = sock_from_file(req->file);
1280 	if (unlikely(!sock))
1281 		return -ENOTSOCK;
1282 
1283 	len = zc->len;
1284 	ret = io_zcrx_recv(req, zc->ifq, sock, 0, issue_flags, &zc->len);
1285 	if (len && zc->len == 0) {
1286 		io_req_set_res(req, 0, 0);
1287 
1288 		return IOU_COMPLETE;
1289 	}
1290 	if (unlikely(ret <= 0) && ret != -EAGAIN) {
1291 		if (ret == -ERESTARTSYS)
1292 			ret = -EINTR;
1293 		if (ret == IOU_REQUEUE)
1294 			return IOU_REQUEUE;
1295 
1296 		req_set_fail(req);
1297 		io_req_set_res(req, ret, 0);
1298 		return IOU_COMPLETE;
1299 	}
1300 	return IOU_RETRY;
1301 }
1302 
1303 void io_send_zc_cleanup(struct io_kiocb *req)
1304 {
1305 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1306 	struct io_async_msghdr *io = req->async_data;
1307 
1308 	if (req_has_async_data(req))
1309 		io_netmsg_iovec_free(io);
1310 	if (zc->notif) {
1311 		io_notif_flush(zc->notif);
1312 		zc->notif = NULL;
1313 	}
1314 }
1315 
1316 #define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
1317 #define IO_ZC_FLAGS_VALID  (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE | \
1318 				IORING_SEND_VECTORIZED)
1319 
1320 int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1321 {
1322 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1323 	struct io_ring_ctx *ctx = req->ctx;
1324 	struct io_async_msghdr *iomsg;
1325 	struct io_kiocb *notif;
1326 	int ret;
1327 
1328 	zc->done_io = 0;
1329 
1330 	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
1331 		return -EINVAL;
1332 	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
1333 	if (req->flags & REQ_F_CQE_SKIP)
1334 		return -EINVAL;
1335 
1336 	notif = zc->notif = io_alloc_notif(ctx);
1337 	if (!notif)
1338 		return -ENOMEM;
1339 	notif->cqe.user_data = req->cqe.user_data;
1340 	notif->cqe.res = 0;
1341 	notif->cqe.flags = IORING_CQE_F_NOTIF;
1342 	req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY;
1343 
1344 	zc->flags = READ_ONCE(sqe->ioprio);
1345 	if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
1346 		if (zc->flags & ~IO_ZC_FLAGS_VALID)
1347 			return -EINVAL;
1348 		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
1349 			struct io_notif_data *nd = io_notif_to_data(notif);
1350 
1351 			nd->zc_report = true;
1352 			nd->zc_used = false;
1353 			nd->zc_copied = false;
1354 		}
1355 	}
1356 
1357 	zc->len = READ_ONCE(sqe->len);
1358 	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
1359 	req->buf_index = READ_ONCE(sqe->buf_index);
1360 	if (zc->msg_flags & MSG_DONTWAIT)
1361 		req->flags |= REQ_F_NOWAIT;
1362 
1363 	if (io_is_compat(req->ctx))
1364 		zc->msg_flags |= MSG_CMSG_COMPAT;
1365 
1366 	iomsg = io_msg_alloc_async(req);
1367 	if (unlikely(!iomsg))
1368 		return -ENOMEM;
1369 
1370 	if (req->opcode == IORING_OP_SEND_ZC) {
1371 		ret = io_send_setup(req, sqe);
1372 	} else {
1373 		if (unlikely(sqe->addr2 || sqe->file_index))
1374 			return -EINVAL;
1375 		ret = io_sendmsg_setup(req, sqe);
1376 	}
1377 	if (unlikely(ret))
1378 		return ret;
1379 
1380 	if (!(zc->flags & IORING_RECVSEND_FIXED_BUF)) {
1381 		iomsg->msg.sg_from_iter = io_sg_from_iter_iovec;
1382 		return io_notif_account_mem(zc->notif, iomsg->msg.msg_iter.count);
1383 	}
1384 	iomsg->msg.sg_from_iter = io_sg_from_iter;
1385 	return 0;
1386 }
1387 
1388 static int io_sg_from_iter_iovec(struct sk_buff *skb,
1389 				 struct iov_iter *from, size_t length)
1390 {
1391 	skb_zcopy_downgrade_managed(skb);
1392 	return zerocopy_fill_skb_from_iter(skb, from, length);
1393 }
1394 
1395 static int io_sg_from_iter(struct sk_buff *skb,
1396 			   struct iov_iter *from, size_t length)
1397 {
1398 	struct skb_shared_info *shinfo = skb_shinfo(skb);
1399 	int frag = shinfo->nr_frags;
1400 	int ret = 0;
1401 	struct bvec_iter bi;
1402 	ssize_t copied = 0;
1403 	unsigned long truesize = 0;
1404 
1405 	if (!frag)
1406 		shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
1407 	else if (unlikely(!skb_zcopy_managed(skb)))
1408 		return zerocopy_fill_skb_from_iter(skb, from, length);
1409 
1410 	bi.bi_size = min(from->count, length);
1411 	bi.bi_bvec_done = from->iov_offset;
1412 	bi.bi_idx = 0;
1413 
1414 	while (bi.bi_size && frag < MAX_SKB_FRAGS) {
1415 		struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);
1416 
1417 		copied += v.bv_len;
1418 		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
1419 		__skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
1420 					   v.bv_offset, v.bv_len);
1421 		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
1422 	}
1423 	if (bi.bi_size)
1424 		ret = -EMSGSIZE;
1425 
1426 	shinfo->nr_frags = frag;
1427 	from->bvec += bi.bi_idx;
1428 	from->nr_segs -= bi.bi_idx;
1429 	from->count -= copied;
1430 	from->iov_offset = bi.bi_bvec_done;
1431 
1432 	skb->data_len += copied;
1433 	skb->len += copied;
1434 	skb->truesize += truesize;
1435 	return ret;
1436 }
1437 
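/*
 * Deferred import for zerocopy sends using a registered buffer: resolve the
 * fixed buffer against the notification request at issue time.
 */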
1438 static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
1439 {
1440 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1441 	struct io_async_msghdr *kmsg = req->async_data;
1442 
1443 	WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF));
1444 
1445 	sr->notif->buf_index = req->buf_index;
1446 	return io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter,
1447 				(u64)(uintptr_t)sr->buf, sr->len,
1448 				ITER_SOURCE, issue_flags);
1449 }
1450 
1451 int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
1452 {
1453 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1454 	struct io_async_msghdr *kmsg = req->async_data;
1455 	struct socket *sock;
1456 	unsigned msg_flags;
1457 	int ret, min_ret = 0;
1458 
1459 	sock = sock_from_file(req->file);
1460 	if (unlikely(!sock))
1461 		return -ENOTSOCK;
1462 	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
1463 		return -EOPNOTSUPP;
1464 
1465 	if (!(req->flags & REQ_F_POLLED) &&
1466 	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
1467 		return -EAGAIN;
1468 
1469 	if (req->flags & REQ_F_IMPORT_BUFFER) {
1470 		req->flags &= ~REQ_F_IMPORT_BUFFER;
1471 		ret = io_send_zc_import(req, issue_flags);
1472 		if (unlikely(ret))
1473 			return ret;
1474 	}
1475 
1476 	msg_flags = zc->msg_flags;
1477 	if (issue_flags & IO_URING_F_NONBLOCK)
1478 		msg_flags |= MSG_DONTWAIT;
1479 	if (msg_flags & MSG_WAITALL)
1480 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1481 	msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
1482 
1483 	kmsg->msg.msg_flags = msg_flags;
1484 	kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
1485 	ret = sock_sendmsg(sock, &kmsg->msg);
1486 
1487 	if (unlikely(ret < min_ret)) {
1488 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1489 			return -EAGAIN;
1490 
1491 		if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) {
1492 			zc->len -= ret;
1493 			zc->buf += ret;
1494 			zc->done_io += ret;
1495 			return -EAGAIN;
1496 		}
1497 		if (ret == -ERESTARTSYS)
1498 			ret = -EINTR;
1499 		req_set_fail(req);
1500 	}
1501 
1502 	if (ret >= 0)
1503 		ret += zc->done_io;
1504 	else if (zc->done_io)
1505 		ret = zc->done_io;
1506 
1507 	/*
1508 	 * If we're in io-wq we can't rely on tw ordering guarantees, so defer
1509 	 * flushing the notif to io_send_zc_cleanup()
1510 	 */
1511 	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1512 		io_notif_flush(zc->notif);
1513 		zc->notif = NULL;
1514 		io_req_msg_cleanup(req, 0);
1515 	}
1516 	io_req_set_res(req, ret, IORING_CQE_F_MORE);
1517 	return IOU_COMPLETE;
1518 }
1519 
1520 int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
1521 {
1522 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1523 	struct io_async_msghdr *kmsg = req->async_data;
1524 	struct socket *sock;
1525 	unsigned flags;
1526 	int ret, min_ret = 0;
1527 
1528 	if (req->flags & REQ_F_IMPORT_BUFFER) {
1529 		unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
1530 		int ret;
1531 
1532 		sr->notif->buf_index = req->buf_index;
1533 		ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter,
1534 					sr->notif, &kmsg->vec, uvec_segs,
1535 					issue_flags);
1536 		if (unlikely(ret))
1537 			return ret;
1538 		req->flags &= ~REQ_F_IMPORT_BUFFER;
1539 	}
1540 
1541 	sock = sock_from_file(req->file);
1542 	if (unlikely(!sock))
1543 		return -ENOTSOCK;
1544 	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
1545 		return -EOPNOTSUPP;
1546 
1547 	if (!(req->flags & REQ_F_POLLED) &&
1548 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1549 		return -EAGAIN;
1550 
1551 	flags = sr->msg_flags;
1552 	if (issue_flags & IO_URING_F_NONBLOCK)
1553 		flags |= MSG_DONTWAIT;
1554 	if (flags & MSG_WAITALL)
1555 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1556 
1557 	kmsg->msg.msg_control_user = sr->msg_control;
1558 	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
1559 	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
1560 
1561 	if (unlikely(ret < min_ret)) {
1562 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1563 			return -EAGAIN;
1564 
1565 		if (ret > 0 && io_net_retry(sock, flags)) {
1566 			sr->done_io += ret;
1567 			return -EAGAIN;
1568 		}
1569 		if (ret == -ERESTARTSYS)
1570 			ret = -EINTR;
1571 		req_set_fail(req);
1572 	}
1573 
1574 	if (ret >= 0)
1575 		ret += sr->done_io;
1576 	else if (sr->done_io)
1577 		ret = sr->done_io;
1578 
1579 	/*
1580 	 * If we're in io-wq we can't rely on tw ordering guarantees, so defer
1581 	 * flushing the notif to io_send_zc_cleanup()
1582 	 */
1583 	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1584 		io_notif_flush(sr->notif);
1585 		sr->notif = NULL;
1586 		io_req_msg_cleanup(req, 0);
1587 	}
1588 	io_req_set_res(req, ret, IORING_CQE_F_MORE);
1589 	return IOU_COMPLETE;
1590 }
1591 
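/*
 * Failure hook shared by the send/recv opcodes: report partial progress if
 * any was made, and for zerocopy sends flag that a notification CQE still
 * follows.
 */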
1592 void io_sendrecv_fail(struct io_kiocb *req)
1593 {
1594 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1595 
1596 	if (sr->done_io)
1597 		req->cqe.res = sr->done_io;
1598 
1599 	if ((req->flags & REQ_F_NEED_CLEANUP) &&
1600 	    (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC))
1601 		req->cqe.flags |= IORING_CQE_F_MORE;
1602 }
1603 
1604 #define ACCEPT_FLAGS	(IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \
1605 			 IORING_ACCEPT_POLL_FIRST)
1606 
1607 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1608 {
1609 	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1610 
1611 	if (sqe->len || sqe->buf_index)
1612 		return -EINVAL;
1613 
1614 	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1615 	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
1616 	accept->flags = READ_ONCE(sqe->accept_flags);
1617 	accept->nofile = rlimit(RLIMIT_NOFILE);
1618 	accept->iou_flags = READ_ONCE(sqe->ioprio);
1619 	if (accept->iou_flags & ~ACCEPT_FLAGS)
1620 		return -EINVAL;
1621 
1622 	accept->file_slot = READ_ONCE(sqe->file_index);
1623 	if (accept->file_slot) {
1624 		if (accept->flags & SOCK_CLOEXEC)
1625 			return -EINVAL;
1626 		if (accept->iou_flags & IORING_ACCEPT_MULTISHOT &&
1627 		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
1628 			return -EINVAL;
1629 	}
1630 	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1631 		return -EINVAL;
1632 	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
1633 		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1634 	if (accept->iou_flags & IORING_ACCEPT_MULTISHOT)
1635 		req->flags |= REQ_F_APOLL_MULTISHOT;
1636 	if (accept->iou_flags & IORING_ACCEPT_DONTWAIT)
1637 		req->flags |= REQ_F_NOWAIT;
1638 	return 0;
1639 }
1640 
1641 int io_accept(struct io_kiocb *req, unsigned int issue_flags)
1642 {
1643 	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1644 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1645 	bool fixed = !!accept->file_slot;
1646 	struct proto_accept_arg arg = {
1647 		.flags = force_nonblock ? O_NONBLOCK : 0,
1648 	};
1649 	struct file *file;
1650 	unsigned cflags;
1651 	int ret, fd;
1652 
1653 	if (!(req->flags & REQ_F_POLLED) &&
1654 	    accept->iou_flags & IORING_ACCEPT_POLL_FIRST)
1655 		return -EAGAIN;
1656 
1657 retry:
1658 	if (!fixed) {
1659 		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
1660 		if (unlikely(fd < 0))
1661 			return fd;
1662 	}
1663 	arg.err = 0;
1664 	arg.is_empty = -1;
1665 	file = do_accept(req->file, &arg, accept->addr, accept->addr_len,
1666 			 accept->flags);
1667 	if (IS_ERR(file)) {
1668 		if (!fixed)
1669 			put_unused_fd(fd);
1670 		ret = PTR_ERR(file);
1671 		if (ret == -EAGAIN && force_nonblock &&
1672 		    !(accept->iou_flags & IORING_ACCEPT_DONTWAIT))
1673 			return IOU_RETRY;
1674 
1675 		if (ret == -ERESTARTSYS)
1676 			ret = -EINTR;
1677 	} else if (!fixed) {
1678 		fd_install(fd, file);
1679 		ret = fd;
1680 	} else {
1681 		ret = io_fixed_fd_install(req, issue_flags, file,
1682 						accept->file_slot);
1683 	}
1684 
1685 	cflags = 0;
1686 	if (!arg.is_empty)
1687 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
1688 
1689 	if (ret >= 0 && (req->flags & REQ_F_APOLL_MULTISHOT) &&
1690 	    io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
1691 		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1)
1692 			goto retry;
1693 		return IOU_RETRY;
1694 	}
1695 
1696 	io_req_set_res(req, ret, cflags);
1697 	if (ret < 0)
1698 		req_set_fail(req);
1699 	return IOU_COMPLETE;
1700 }
1701 
1702 int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1703 {
1704 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1705 
1706 	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
1707 		return -EINVAL;
1708 
1709 	sock->domain = READ_ONCE(sqe->fd);
1710 	sock->type = READ_ONCE(sqe->off);
1711 	sock->protocol = READ_ONCE(sqe->len);
1712 	sock->file_slot = READ_ONCE(sqe->file_index);
1713 	sock->nofile = rlimit(RLIMIT_NOFILE);
1714 
1715 	sock->flags = sock->type & ~SOCK_TYPE_MASK;
1716 	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
1717 		return -EINVAL;
1718 	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1719 		return -EINVAL;
1720 	return 0;
1721 }
1722 
1723 int io_socket(struct io_kiocb *req, unsigned int issue_flags)
1724 {
1725 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1726 	bool fixed = !!sock->file_slot;
1727 	struct file *file;
1728 	int ret, fd;
1729 
1730 	if (!fixed) {
1731 		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
1732 		if (unlikely(fd < 0))
1733 			return fd;
1734 	}
1735 	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
1736 	if (IS_ERR(file)) {
1737 		if (!fixed)
1738 			put_unused_fd(fd);
1739 		ret = PTR_ERR(file);
1740 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1741 			return -EAGAIN;
1742 		if (ret == -ERESTARTSYS)
1743 			ret = -EINTR;
1744 		req_set_fail(req);
1745 	} else if (!fixed) {
1746 		fd_install(fd, file);
1747 		ret = fd;
1748 	} else {
1749 		ret = io_fixed_fd_install(req, issue_flags, file,
1750 					    sock->file_slot);
1751 	}
1752 	io_req_set_res(req, ret, 0);
1753 	return IOU_COMPLETE;
1754 }
1755 
1756 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1757 {
1758 	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
1759 	struct io_async_msghdr *io;
1760 
1761 	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
1762 		return -EINVAL;
1763 
1764 	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1765 	conn->addr_len =  READ_ONCE(sqe->addr2);
1766 	conn->in_progress = conn->seen_econnaborted = false;
1767 
1768 	io = io_msg_alloc_async(req);
1769 	if (unlikely(!io))
1770 		return -ENOMEM;
1771 
1772 	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr);
1773 }
1774 
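/*
 * Connect may take several passes: -EINPROGRESS marks the request so that a
 * later pass can read the final status via sock_error(), either after the
 * socket signals EPOLLERR or when the repeated connect returns -EISCONN or
 * -EBADFD.
 */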
1775 int io_connect(struct io_kiocb *req, unsigned int issue_flags)
1776 {
1777 	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
1778 	struct io_async_msghdr *io = req->async_data;
1779 	unsigned file_flags;
1780 	int ret;
1781 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1782 
1783 	if (connect->in_progress) {
1784 		struct poll_table_struct pt = { ._key = EPOLLERR };
1785 
1786 		if (vfs_poll(req->file, &pt) & EPOLLERR)
1787 			goto get_sock_err;
1788 	}
1789 
1790 	file_flags = force_nonblock ? O_NONBLOCK : 0;
1791 
1792 	ret = __sys_connect_file(req->file, &io->addr, connect->addr_len,
1793 				 file_flags);
1794 	if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
1795 	    && force_nonblock) {
1796 		if (ret == -EINPROGRESS) {
1797 			connect->in_progress = true;
1798 		} else if (ret == -ECONNABORTED) {
1799 			if (connect->seen_econnaborted)
1800 				goto out;
1801 			connect->seen_econnaborted = true;
1802 		}
1803 		return -EAGAIN;
1804 	}
1805 	if (connect->in_progress) {
1806 		/*
1807 		 * At least bluetooth will return -EBADFD on a re-connect
1808 		 * attempt, and it's (supposedly) also valid to get -EISCONN
1809 		 * which means the previous result is good. For both of these,
1810 		 * grab the sock_error() and use that for the completion.
1811 		 */
1812 		if (ret == -EBADFD || ret == -EISCONN) {
1813 get_sock_err:
1814 			ret = sock_error(sock_from_file(req->file)->sk);
1815 		}
1816 	}
1817 	if (ret == -ERESTARTSYS)
1818 		ret = -EINTR;
1819 out:
1820 	if (ret < 0)
1821 		req_set_fail(req);
1822 	io_req_msg_cleanup(req, issue_flags);
1823 	io_req_set_res(req, ret, 0);
1824 	return IOU_COMPLETE;
1825 }
1826 
1827 int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1828 {
1829 	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
1830 	struct sockaddr __user *uaddr;
1831 	struct io_async_msghdr *io;
1832 
1833 	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
1834 		return -EINVAL;
1835 
1836 	uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1837 	bind->addr_len =  READ_ONCE(sqe->addr2);
1838 
1839 	io = io_msg_alloc_async(req);
1840 	if (unlikely(!io))
1841 		return -ENOMEM;
1842 	return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr);
1843 }
1844 
1845 int io_bind(struct io_kiocb *req, unsigned int issue_flags)
1846 {
1847 	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
1848 	struct io_async_msghdr *io = req->async_data;
1849 	struct socket *sock;
1850 	int ret;
1851 
1852 	sock = sock_from_file(req->file);
1853 	if (unlikely(!sock))
1854 		return -ENOTSOCK;
1855 
1856 	ret = __sys_bind_socket(sock, &io->addr, bind->addr_len);
1857 	if (ret < 0)
1858 		req_set_fail(req);
1859 	io_req_set_res(req, ret, 0);
1860 	return 0;
1861 }
1862 
1863 int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1864 {
1865 	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
1866 
1867 	if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2)
1868 		return -EINVAL;
1869 
1870 	listen->backlog = READ_ONCE(sqe->len);
1871 	return 0;
1872 }
1873 
1874 int io_listen(struct io_kiocb *req, unsigned int issue_flags)
1875 {
1876 	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
1877 	struct socket *sock;
1878 	int ret;
1879 
1880 	sock = sock_from_file(req->file);
1881 	if (unlikely(!sock))
1882 		return -ENOTSOCK;
1883 
1884 	ret = __sys_listen_socket(sock, listen->backlog);
1885 	if (ret < 0)
1886 		req_set_fail(req);
1887 	io_req_set_res(req, ret, 0);
1888 	return 0;
1889 }
1890 
1891 void io_netmsg_cache_free(const void *entry)
1892 {
1893 	struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
1894 
1895 	io_vec_free(&kmsg->vec);
1896 	kfree(kmsg);
1897 }
1898