xref: /linux/io_uring/net.c (revision c10130c234c81f4a7a143edbf413080235f8d8ce)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/kernel.h>
3 #include <linux/errno.h>
4 #include <linux/file.h>
5 #include <linux/slab.h>
6 #include <linux/net.h>
7 #include <linux/un.h>
8 #include <linux/compat.h>
9 #include <net/compat.h>
10 #include <linux/io_uring.h>
11 
12 #include <uapi/linux/io_uring.h>
13 
14 #include "filetable.h"
15 #include "io_uring.h"
16 #include "kbuf.h"
17 #include "alloc_cache.h"
18 #include "net.h"
19 #include "notif.h"
20 #include "rsrc.h"
21 #include "zcrx.h"
22 
/*
 * Per-request state for a shutdown request, reached through
 * io_kiocb_to_cmd(). Like every command struct in this file it starts
 * with the struct file pointer.
 */
struct io_shutdown {
	struct file *file;
	/* 'how' argument, read from sqe->len and given to __sys_shutdown_sock() */
	int how;
};
27 
28 struct io_accept {
29 	struct file			*file;
30 	struct sockaddr __user		*addr;
31 	int __user			*addr_len;
32 	int				flags;
33 	int				iou_flags;
34 	u32				file_slot;
35 	unsigned long			nofile;
36 };
37 
38 struct io_socket {
39 	struct file			*file;
40 	int				domain;
41 	int				type;
42 	int				protocol;
43 	int				flags;
44 	u32				file_slot;
45 	unsigned long			nofile;
46 };
47 
48 struct io_connect {
49 	struct file			*file;
50 	struct sockaddr __user		*addr;
51 	int				addr_len;
52 	bool				in_progress;
53 	bool				seen_econnaborted;
54 };
55 
/*
 * Per-request state for a bind request. Only the address length lives
 * here; NOTE(review): where the address itself is kept is not visible in
 * the reviewed portion of the file.
 */
struct io_bind {
	struct file *file;
	int addr_len;
};
60 
/*
 * Per-request state for a listen request.
 * NOTE(review): backlog presumably maps to the listen(2) backlog argument.
 */
struct io_listen {
	struct file *file;
	int backlog;
};
65 
66 struct io_sr_msg {
67 	struct file			*file;
68 	union {
69 		struct compat_msghdr __user	*umsg_compat;
70 		struct user_msghdr __user	*umsg;
71 		void __user			*buf;
72 	};
73 	int				len;
74 	unsigned			done_io;
75 	unsigned			msg_flags;
76 	unsigned			nr_multishot_loops;
77 	u16				flags;
78 	/* initialised and used only by !msg send variants */
79 	u16				buf_group;
80 	/* per-invocation mshot limit */
81 	unsigned			mshot_len;
82 	/* overall mshot byte limit */
83 	unsigned			mshot_total_len;
84 	void __user			*msg_control;
85 	/* used only for send zerocopy */
86 	struct io_kiocb 		*notif;
87 };
88 
/*
 * Internal state bits kept in io_sr_msg->flags. The UAPI flags occupy the
 * lower 8 bits (that is all sqe->ioprio carries for them), so the upper
 * 8 bits are free for internal use.
 */
enum sr_retry_flags {
	IORING_RECV_MSHOT_DONE	= 1U << 11,
	IORING_RECV_MSHOT_LIM	= 1U << 12,
	IORING_RECV_MSHOT_CAP	= 1U << 13,
	IORING_RECV_PARTIAL_MAP	= 1U << 14,
	IORING_RECV_RETRY	= 1U << 15,

	/* bits dropped again when a multishot request is re-armed */
	IORING_RECV_RETRY_CLEAR	= IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP,
	/* any of these set means a bundle receive must not be retried */
	IORING_RECV_NO_RETRY	= IORING_RECV_RETRY_CLEAR |
				  IORING_RECV_MSHOT_CAP |
				  IORING_RECV_MSHOT_DONE,
};
104 
/*
 * Upper bound on how many receives a multishot request performs in a row
 * while more data is pending. Once the bound is exceeded the request goes
 * to the back of the queue and is retried from there, which keeps one
 * flooding client from starving the others.
 */
#define MULTISHOT_MAX_RETRY		32
111 
112 struct io_recvzc {
113 	struct file			*file;
114 	u16				flags;
115 	u32				len;
116 	struct io_zcrx_ifq		*ifq;
117 };
118 
119 static int io_sg_from_iter_iovec(struct sk_buff *skb,
120 				 struct iov_iter *from, size_t length);
121 static int io_sg_from_iter(struct sk_buff *skb,
122 			   struct iov_iter *from, size_t length);
123 
io_shutdown_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)124 int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
125 {
126 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
127 
128 	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
129 		     sqe->buf_index || sqe->splice_fd_in))
130 		return -EINVAL;
131 
132 	shutdown->how = READ_ONCE(sqe->len);
133 	req->flags |= REQ_F_FORCE_ASYNC;
134 	return 0;
135 }
136 
io_shutdown(struct io_kiocb * req,unsigned int issue_flags)137 int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
138 {
139 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
140 	struct socket *sock;
141 	int ret;
142 
143 	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
144 
145 	sock = sock_from_file(req->file);
146 	if (unlikely(!sock))
147 		return -ENOTSOCK;
148 
149 	ret = __sys_shutdown_sock(sock, shutdown->how);
150 	io_req_set_res(req, ret, 0);
151 	return IOU_COMPLETE;
152 }
153 
io_net_retry(struct socket * sock,int flags)154 static bool io_net_retry(struct socket *sock, int flags)
155 {
156 	if (!(flags & MSG_WAITALL))
157 		return false;
158 	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
159 }
160 
io_netmsg_iovec_free(struct io_async_msghdr * kmsg)161 static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
162 {
163 	if (kmsg->vec.iovec)
164 		io_vec_free(&kmsg->vec);
165 }
166 
io_netmsg_recycle(struct io_kiocb * req,unsigned int issue_flags)167 static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
168 {
169 	struct io_async_msghdr *hdr = req->async_data;
170 
171 	/* can't recycle, ensure we free the iovec if we have one */
172 	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
173 		io_netmsg_iovec_free(hdr);
174 		return;
175 	}
176 
177 	/* Let normal cleanup path reap it if we fail adding to the cache */
178 	io_alloc_cache_vec_kasan(&hdr->vec);
179 	if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP)
180 		io_vec_free(&hdr->vec);
181 
182 	if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr))
183 		io_req_async_data_clear(req, REQ_F_NEED_CLEANUP);
184 }
185 
io_msg_alloc_async(struct io_kiocb * req)186 static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
187 {
188 	struct io_ring_ctx *ctx = req->ctx;
189 	struct io_async_msghdr *hdr;
190 
191 	hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req);
192 	if (!hdr)
193 		return NULL;
194 
195 	/* If the async data was cached, we might have an iov cached inside. */
196 	if (hdr->vec.iovec)
197 		req->flags |= REQ_F_NEED_CLEANUP;
198 	return hdr;
199 }
200 
io_mshot_prep_retry(struct io_kiocb * req,struct io_async_msghdr * kmsg)201 static inline void io_mshot_prep_retry(struct io_kiocb *req,
202 				       struct io_async_msghdr *kmsg)
203 {
204 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
205 
206 	req->flags &= ~REQ_F_BL_EMPTY;
207 	sr->done_io = 0;
208 	sr->flags &= ~IORING_RECV_RETRY_CLEAR;
209 	sr->len = sr->mshot_len;
210 }
211 
/*
 * Import a user iovec array into @iomsg->msg.msg_iter.
 *
 * The import is offered the already attached iovec array if there is one,
 * otherwise the single inline fast_iov. If __import_iovec() leaves a
 * non-NULL array pointer behind, that array is recorded in @iomsg->vec and
 * the request is flagged for cleanup (presumably because the helper then
 * owns/allocated it - confirm against __import_iovec()).
 *
 * Returns 0 on success or a negative error from __import_iovec().
 */
static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg,
			     const struct iovec __user *uiov, unsigned uvec_seg,
			     int ddir)
{
	struct iovec *iov = &iomsg->fast_iov;
	int nr_segs = 1;
	int ret;

	if (iomsg->vec.iovec) {
		iov = iomsg->vec.iovec;
		nr_segs = iomsg->vec.nr;
	}

	ret = __import_iovec(ddir, uiov, uvec_seg, nr_segs, &iov,
			     &iomsg->msg.msg_iter, io_is_compat(req->ctx));
	if (unlikely(ret < 0))
		return ret;
	if (!iov)
		return 0;

	req->flags |= REQ_F_NEED_CLEANUP;
	io_vec_reset_iovec(&iomsg->vec, iov, iomsg->msg.msg_iter.nr_segs);
	return 0;
}
238 
/*
 * Compat flavour of the msghdr copy: fetch the compat_msghdr from user
 * space into @msg and translate it into @iomsg->msg.
 *
 * With REQ_F_BUFFER_SELECT the user iovec is not imported; at most one
 * entry is allowed and only its length is recorded in sr->len (0 when no
 * entry was supplied). @ddir is currently unused.
 *
 * Returns 0 on success, -EFAULT on a failed user copy, -EINVAL for more
 * than one iovec with buffer select, or an error from
 * __get_compat_msghdr().
 */
static int io_compat_msg_copy_hdr(struct io_kiocb *req,
				  struct io_async_msghdr *iomsg,
				  struct compat_msghdr *msg, int ddir,
				  struct sockaddr __user **save_addr)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct compat_iovec __user *uiov;
	struct compat_iovec first;
	int ret;

	if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
		return -EFAULT;

	ret = __get_compat_msghdr(&iomsg->msg, msg, save_addr);
	if (ret)
		return ret;

	if (!(req->flags & REQ_F_BUFFER_SELECT))
		return 0;

	if (msg->msg_iovlen > 1)
		return -EINVAL;
	if (msg->msg_iovlen == 0) {
		sr->len = 0;
		return 0;
	}

	uiov = compat_ptr(msg->msg_iov);
	if (copy_from_user(&first, uiov, sizeof(first)))
		return -EFAULT;
	sr->len = first.iov_len;
	return 0;
}
271 
/*
 * Copy the individual fields of a user_msghdr from user space inside a
 * single user_access_begin()/user_access_end() section. msg_flags is not
 * copied.
 *
 * Returns 0 on success, -EFAULT if the range is not accessible or any
 * field read faults.
 */
static int io_copy_msghdr_from_user(struct user_msghdr *msg,
				    struct user_msghdr __user *umsg)
{
	int ret = -EFAULT;

	if (!user_access_begin(umsg, sizeof(*umsg)))
		return ret;
	unsafe_get_user(msg->msg_name, &umsg->msg_name, out);
	unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, out);
	unsafe_get_user(msg->msg_iov, &umsg->msg_iov, out);
	unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, out);
	unsafe_get_user(msg->msg_control, &umsg->msg_control, out);
	unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, out);
	ret = 0;
out:
	/* reached both on success and from a faulting unsafe_get_user() */
	user_access_end();
	return ret;
}
289 
/*
 * Copy the user supplied msghdr for a sendmsg/recvmsg request.
 *
 * @msg receives a native user_msghdr view of the header (synthesised from
 * the compat header on compat rings), and @iomsg->msg is filled in by
 * __copy_msghdr()/__get_compat_msghdr(). The iovec itself is NOT imported
 * here. With REQ_F_BUFFER_SELECT at most one iovec entry is accepted and
 * only its length is stored in sr->len. @ddir is only passed through to
 * the compat helper.
 *
 * Returns 0 on success or a negative error code.
 */
static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
			   struct user_msghdr *msg, int ddir,
			   struct sockaddr __user **save_addr)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct iovec first;
	int ret;

	iomsg->msg.msg_name = &iomsg->addr;
	iomsg->msg.msg_iter.nr_segs = 0;

	if (io_is_compat(req->ctx)) {
		struct compat_msghdr cmsg;

		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ddir, save_addr);
		if (ret)
			return ret;

		/* hand the caller a native view of the compat header */
		memset(msg, 0, sizeof(*msg));
		msg->msg_namelen = cmsg.msg_namelen;
		msg->msg_controllen = cmsg.msg_controllen;
		msg->msg_iov = compat_ptr(cmsg.msg_iov);
		msg->msg_iovlen = cmsg.msg_iovlen;
		return 0;
	}

	ret = io_copy_msghdr_from_user(msg, sr->umsg);
	if (unlikely(ret))
		return ret;

	/* msg_flags is not read from user space, start from a clean value */
	msg->msg_flags = 0;

	ret = __copy_msghdr(&iomsg->msg, msg, save_addr);
	if (ret)
		return ret;

	if (!(req->flags & REQ_F_BUFFER_SELECT))
		return 0;

	if (msg->msg_iovlen > 1)
		return -EINVAL;
	if (msg->msg_iovlen == 0) {
		sr->len = 0;
		return 0;
	}

	if (copy_from_user(&first, msg->msg_iov, sizeof(first)))
		return -EFAULT;
	sr->len = first.iov_len;
	return 0;
}
342 
io_sendmsg_recvmsg_cleanup(struct io_kiocb * req)343 void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
344 {
345 	struct io_async_msghdr *io = req->async_data;
346 
347 	io_netmsg_iovec_free(io);
348 }
349 
io_send_setup(struct io_kiocb * req,const struct io_uring_sqe * sqe)350 static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
351 {
352 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
353 	struct io_async_msghdr *kmsg = req->async_data;
354 	void __user *addr;
355 	u16 addr_len;
356 	int ret;
357 
358 	sr->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
359 
360 	if (READ_ONCE(sqe->__pad3[0]))
361 		return -EINVAL;
362 
363 	kmsg->msg.msg_name = NULL;
364 	kmsg->msg.msg_namelen = 0;
365 	kmsg->msg.msg_control = NULL;
366 	kmsg->msg.msg_controllen = 0;
367 	kmsg->msg.msg_ubuf = NULL;
368 
369 	addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
370 	addr_len = READ_ONCE(sqe->addr_len);
371 	if (addr) {
372 		ret = move_addr_to_kernel(addr, addr_len, &kmsg->addr);
373 		if (unlikely(ret < 0))
374 			return ret;
375 		kmsg->msg.msg_name = &kmsg->addr;
376 		kmsg->msg.msg_namelen = addr_len;
377 	}
378 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
379 		if (!(sr->flags & IORING_SEND_VECTORIZED)) {
380 			req->flags |= REQ_F_IMPORT_BUFFER;
381 			return 0;
382 		}
383 
384 		kmsg->msg.msg_iter.nr_segs = sr->len;
385 		return io_prep_reg_iovec(req, &kmsg->vec, sr->buf, sr->len);
386 	}
387 	if (req->flags & REQ_F_BUFFER_SELECT)
388 		return 0;
389 
390 	if (sr->flags & IORING_SEND_VECTORIZED)
391 		return io_net_import_vec(req, kmsg, sr->buf, sr->len, ITER_SOURCE);
392 
393 	return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
394 }
395 
io_sendmsg_setup(struct io_kiocb * req,const struct io_uring_sqe * sqe)396 static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
397 {
398 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
399 	struct io_async_msghdr *kmsg = req->async_data;
400 	struct user_msghdr msg;
401 	int ret;
402 
403 	sr->flags |= IORING_SEND_VECTORIZED;
404 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
405 	ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL);
406 	if (unlikely(ret))
407 		return ret;
408 	/* save msg_control as sys_sendmsg() overwrites it */
409 	sr->msg_control = kmsg->msg.msg_control_user;
410 
411 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
412 		kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen;
413 		return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov,
414 					 msg.msg_iovlen);
415 	}
416 	if (req->flags & REQ_F_BUFFER_SELECT)
417 		return 0;
418 	return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE);
419 }
420 
/* sqe->ioprio flags accepted by io_sendmsg_prep(); anything else is -EINVAL */
#define SENDMSG_FLAGS	(IORING_RECVSEND_POLL_FIRST | \
			 IORING_RECVSEND_BUNDLE | \
			 IORING_SEND_VECTORIZED)
422 
io_sendmsg_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)423 int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
424 {
425 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
426 
427 	sr->done_io = 0;
428 	sr->len = READ_ONCE(sqe->len);
429 	if (unlikely(sr->len < 0))
430 		return -EINVAL;
431 	sr->flags = READ_ONCE(sqe->ioprio);
432 	if (sr->flags & ~SENDMSG_FLAGS)
433 		return -EINVAL;
434 	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
435 	if (sr->msg_flags & MSG_DONTWAIT)
436 		req->flags |= REQ_F_NOWAIT;
437 	if (req->flags & REQ_F_BUFFER_SELECT)
438 		sr->buf_group = req->buf_index;
439 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
440 		if (req->opcode == IORING_OP_SENDMSG)
441 			return -EINVAL;
442 		sr->msg_flags |= MSG_WAITALL;
443 		req->flags |= REQ_F_MULTISHOT;
444 	}
445 
446 	if (io_is_compat(req->ctx))
447 		sr->msg_flags |= MSG_CMSG_COMPAT;
448 
449 	if (unlikely(!io_msg_alloc_async(req)))
450 		return -ENOMEM;
451 	if (req->opcode != IORING_OP_SENDMSG)
452 		return io_send_setup(req, sqe);
453 	if (unlikely(sqe->addr2 || sqe->file_index))
454 		return -EINVAL;
455 	return io_sendmsg_setup(req, sqe);
456 }
457 
/*
 * Completion-time cleanup for send/recv requests; currently this only
 * recycles (or frees) the async msghdr.
 */
static void io_req_msg_cleanup(struct io_kiocb *req, unsigned int issue_flags)
{
	io_netmsg_recycle(req, issue_flags);
}
463 
464 /*
465  * For bundle completions, we need to figure out how many segments we consumed.
466  * A bundle could be using a single ITER_UBUF if that's all we mapped, or it
467  * could be using an ITER_IOVEC. If the latter, then if we consumed all of
468  * the segments, then it's a trivial questiont o answer. If we have residual
469  * data in the iter, then loop the segments to figure out how much we
470  * transferred.
471  */
io_bundle_nbufs(struct io_async_msghdr * kmsg,int ret)472 static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
473 {
474 	struct iovec *iov;
475 	int nbufs;
476 
477 	/* no data is always zero segments, and a ubuf is always 1 segment */
478 	if (ret <= 0)
479 		return 0;
480 	if (iter_is_ubuf(&kmsg->msg.msg_iter))
481 		return 1;
482 
483 	iov = kmsg->vec.iovec;
484 	if (!iov)
485 		iov = &kmsg->fast_iov;
486 
487 	/* if all data was transferred, it's basic pointer math */
488 	if (!iov_iter_count(&kmsg->msg.msg_iter))
489 		return iter_iov(&kmsg->msg.msg_iter) - iov;
490 
491 	/* short transfer, count segments */
492 	nbufs = 0;
493 	do {
494 		int this_len = min_t(int, iov[nbufs].iov_len, ret);
495 
496 		nbufs++;
497 		ret -= this_len;
498 	} while (ret);
499 
500 	return nbufs;
501 }
502 
io_net_kbuf_recyle(struct io_kiocb * req,struct io_buffer_list * bl,struct io_async_msghdr * kmsg,int len)503 static int io_net_kbuf_recyle(struct io_kiocb *req, struct io_buffer_list *bl,
504 			      struct io_async_msghdr *kmsg, int len)
505 {
506 	req->flags |= REQ_F_BL_NO_RECYCLE;
507 	if (req->flags & REQ_F_BUFFERS_COMMIT)
508 		io_kbuf_commit(req, bl, len, io_bundle_nbufs(kmsg, len));
509 	return IOU_RETRY;
510 }
511 
io_send_finish(struct io_kiocb * req,struct io_async_msghdr * kmsg,struct io_br_sel * sel)512 static inline bool io_send_finish(struct io_kiocb *req,
513 				  struct io_async_msghdr *kmsg,
514 				  struct io_br_sel *sel)
515 {
516 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
517 	bool bundle_finished = sel->val <= 0;
518 	unsigned int cflags;
519 
520 	if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
521 		cflags = io_put_kbuf(req, sel->val, sel->buf_list);
522 		goto finish;
523 	}
524 
525 	cflags = io_put_kbufs(req, sel->val, sel->buf_list, io_bundle_nbufs(kmsg, sel->val));
526 
527 	/*
528 	 * Don't start new bundles if the buffer list is empty, or if the
529 	 * current operation needed to go through polling to complete.
530 	 */
531 	if (bundle_finished || req->flags & (REQ_F_BL_EMPTY | REQ_F_POLLED))
532 		goto finish;
533 
534 	/*
535 	 * Fill CQE for this receive and see if we should keep trying to
536 	 * receive from this socket.
537 	 */
538 	if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) {
539 		io_mshot_prep_retry(req, kmsg);
540 		return false;
541 	}
542 
543 	/* Otherwise stop bundle and use the current result. */
544 finish:
545 	io_req_set_res(req, sel->val, cflags);
546 	sel->val = IOU_COMPLETE;
547 	return true;
548 }
549 
io_sendmsg(struct io_kiocb * req,unsigned int issue_flags)550 int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
551 {
552 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
553 	struct io_async_msghdr *kmsg = req->async_data;
554 	struct socket *sock;
555 	unsigned flags;
556 	int min_ret = 0;
557 	int ret;
558 
559 	sock = sock_from_file(req->file);
560 	if (unlikely(!sock))
561 		return -ENOTSOCK;
562 
563 	if (!(req->flags & REQ_F_POLLED) &&
564 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
565 		return -EAGAIN;
566 
567 	flags = sr->msg_flags;
568 	if (issue_flags & IO_URING_F_NONBLOCK)
569 		flags |= MSG_DONTWAIT;
570 	if (flags & MSG_WAITALL)
571 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
572 
573 	kmsg->msg.msg_control_user = sr->msg_control;
574 
575 	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
576 
577 	if (ret < min_ret) {
578 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
579 			return -EAGAIN;
580 		if (ret > 0 && io_net_retry(sock, flags)) {
581 			kmsg->msg.msg_controllen = 0;
582 			kmsg->msg.msg_control = NULL;
583 			sr->done_io += ret;
584 			return -EAGAIN;
585 		}
586 		if (ret == -ERESTARTSYS)
587 			ret = -EINTR;
588 		req_set_fail(req);
589 	}
590 	io_req_msg_cleanup(req, issue_flags);
591 	if (ret >= 0)
592 		ret += sr->done_io;
593 	else if (sr->done_io)
594 		ret = sr->done_io;
595 	io_req_set_res(req, ret, 0);
596 	return IOU_COMPLETE;
597 }
598 
io_send_select_buffer(struct io_kiocb * req,unsigned int issue_flags,struct io_br_sel * sel,struct io_async_msghdr * kmsg)599 static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
600 				 struct io_br_sel *sel, struct io_async_msghdr *kmsg)
601 {
602 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
603 	struct buf_sel_arg arg = {
604 		.iovs = &kmsg->fast_iov,
605 		.max_len = min_not_zero(sr->len, INT_MAX),
606 		.nr_iovs = 1,
607 		.buf_group = sr->buf_group,
608 	};
609 	int ret;
610 
611 	if (kmsg->vec.iovec) {
612 		arg.nr_iovs = kmsg->vec.nr;
613 		arg.iovs = kmsg->vec.iovec;
614 		arg.mode = KBUF_MODE_FREE;
615 	}
616 
617 	if (!(sr->flags & IORING_RECVSEND_BUNDLE))
618 		arg.nr_iovs = 1;
619 	else
620 		arg.mode |= KBUF_MODE_EXPAND;
621 
622 	ret = io_buffers_select(req, &arg, sel, issue_flags);
623 	if (unlikely(ret < 0))
624 		return ret;
625 
626 	if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
627 		kmsg->vec.nr = ret;
628 		kmsg->vec.iovec = arg.iovs;
629 		req->flags |= REQ_F_NEED_CLEANUP;
630 	}
631 	sr->len = arg.out_len;
632 
633 	if (ret == 1) {
634 		sr->buf = arg.iovs[0].iov_base;
635 		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
636 					&kmsg->msg.msg_iter);
637 		if (unlikely(ret))
638 			return ret;
639 	} else {
640 		iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE,
641 				arg.iovs, ret, arg.out_len);
642 	}
643 
644 	return 0;
645 }
646 
io_send(struct io_kiocb * req,unsigned int issue_flags)647 int io_send(struct io_kiocb *req, unsigned int issue_flags)
648 {
649 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
650 	struct io_async_msghdr *kmsg = req->async_data;
651 	struct io_br_sel sel = { };
652 	struct socket *sock;
653 	unsigned flags;
654 	int min_ret = 0;
655 	int ret;
656 
657 	sock = sock_from_file(req->file);
658 	if (unlikely(!sock))
659 		return -ENOTSOCK;
660 
661 	if (!(req->flags & REQ_F_POLLED) &&
662 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
663 		return -EAGAIN;
664 
665 	flags = sr->msg_flags;
666 	if (issue_flags & IO_URING_F_NONBLOCK)
667 		flags |= MSG_DONTWAIT;
668 
669 retry_bundle:
670 	sel.buf_list = NULL;
671 	if (io_do_buffer_select(req)) {
672 		ret = io_send_select_buffer(req, issue_flags, &sel, kmsg);
673 		if (ret)
674 			return ret;
675 	}
676 
677 	/*
678 	 * If MSG_WAITALL is set, or this is a bundle send, then we need
679 	 * the full amount. If just bundle is set, if we do a short send
680 	 * then we complete the bundle sequence rather than continue on.
681 	 */
682 	if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE)
683 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
684 
685 	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
686 	kmsg->msg.msg_flags = flags;
687 	ret = sock_sendmsg(sock, &kmsg->msg);
688 	if (ret < min_ret) {
689 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
690 			return -EAGAIN;
691 
692 		if (ret > 0 && io_net_retry(sock, flags)) {
693 			sr->len -= ret;
694 			sr->buf += ret;
695 			sr->done_io += ret;
696 			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
697 		}
698 		if (ret == -ERESTARTSYS)
699 			ret = -EINTR;
700 		req_set_fail(req);
701 	}
702 	if (ret >= 0)
703 		ret += sr->done_io;
704 	else if (sr->done_io)
705 		ret = sr->done_io;
706 
707 	sel.val = ret;
708 	if (!io_send_finish(req, kmsg, &sel))
709 		goto retry_bundle;
710 
711 	io_req_msg_cleanup(req, issue_flags);
712 	return sel.val;
713 }
714 
/*
 * For multishot recvmsg with provided buffers, validate and remember the
 * name and control lengths that will be reserved in front of each payload.
 * A no-op for every other kind of request.
 *
 * Returns 0 on success, -EOVERFLOW if @namelen is negative or the combined
 * header size (io_uring_recvmsg_out + name + control) does not fit an int.
 */
static int io_recvmsg_mshot_prep(struct io_kiocb *req,
				 struct io_async_msghdr *iomsg,
				 int namelen, size_t controllen)
{
	int hdr;

	if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) !=
			  (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT))
		return 0;

	if (unlikely(namelen < 0))
		return -EOVERFLOW;
	if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
				namelen, &hdr))
		return -EOVERFLOW;
	if (check_add_overflow(hdr, controllen, &hdr))
		return -EOVERFLOW;

	iomsg->namelen = namelen;
	iomsg->controllen = controllen;
	return 0;
}
738 
io_recvmsg_copy_hdr(struct io_kiocb * req,struct io_async_msghdr * iomsg)739 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
740 			       struct io_async_msghdr *iomsg)
741 {
742 	struct user_msghdr msg;
743 	int ret;
744 
745 	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
746 	if (unlikely(ret))
747 		return ret;
748 
749 	if (!(req->flags & REQ_F_BUFFER_SELECT)) {
750 		ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
751 					ITER_DEST);
752 		if (unlikely(ret))
753 			return ret;
754 	}
755 	return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
756 					msg.msg_controllen);
757 }
758 
io_recvmsg_prep_setup(struct io_kiocb * req)759 static int io_recvmsg_prep_setup(struct io_kiocb *req)
760 {
761 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
762 	struct io_async_msghdr *kmsg;
763 
764 	kmsg = io_msg_alloc_async(req);
765 	if (unlikely(!kmsg))
766 		return -ENOMEM;
767 
768 	if (req->opcode == IORING_OP_RECV) {
769 		kmsg->msg.msg_name = NULL;
770 		kmsg->msg.msg_namelen = 0;
771 		kmsg->msg.msg_inq = 0;
772 		kmsg->msg.msg_control = NULL;
773 		kmsg->msg.msg_get_inq = 1;
774 		kmsg->msg.msg_controllen = 0;
775 		kmsg->msg.msg_iocb = NULL;
776 		kmsg->msg.msg_ubuf = NULL;
777 
778 		if (req->flags & REQ_F_BUFFER_SELECT)
779 			return 0;
780 		return import_ubuf(ITER_DEST, sr->buf, sr->len,
781 				   &kmsg->msg.msg_iter);
782 	}
783 
784 	return io_recvmsg_copy_hdr(req, kmsg);
785 }
786 
/* sqe->ioprio flags accepted by io_recvmsg_prep(); anything else is -EINVAL */
#define RECVMSG_FLAGS	(IORING_RECVSEND_POLL_FIRST | \
			 IORING_RECV_MULTISHOT | \
			 IORING_RECVSEND_BUNDLE)
789 
io_recvmsg_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)790 int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
791 {
792 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
793 
794 	sr->done_io = 0;
795 
796 	if (unlikely(sqe->addr2))
797 		return -EINVAL;
798 
799 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
800 	sr->len = READ_ONCE(sqe->len);
801 	if (unlikely(sr->len < 0))
802 		return -EINVAL;
803 	sr->flags = READ_ONCE(sqe->ioprio);
804 	if (sr->flags & ~RECVMSG_FLAGS)
805 		return -EINVAL;
806 	sr->msg_flags = READ_ONCE(sqe->msg_flags);
807 	if (sr->msg_flags & MSG_DONTWAIT)
808 		req->flags |= REQ_F_NOWAIT;
809 	if (sr->msg_flags & MSG_ERRQUEUE)
810 		req->flags |= REQ_F_CLEAR_POLLIN;
811 	if (req->flags & REQ_F_BUFFER_SELECT)
812 		sr->buf_group = req->buf_index;
813 	sr->mshot_total_len = sr->mshot_len = 0;
814 	if (sr->flags & IORING_RECV_MULTISHOT) {
815 		if (!(req->flags & REQ_F_BUFFER_SELECT))
816 			return -EINVAL;
817 		if (sr->msg_flags & MSG_WAITALL)
818 			return -EINVAL;
819 		if (req->opcode == IORING_OP_RECV) {
820 			sr->mshot_len = sr->len;
821 			sr->mshot_total_len = READ_ONCE(sqe->optlen);
822 			if (sr->mshot_total_len)
823 				sr->flags |= IORING_RECV_MSHOT_LIM;
824 		} else if (sqe->optlen) {
825 			return -EINVAL;
826 		}
827 		req->flags |= REQ_F_APOLL_MULTISHOT;
828 	} else if (sqe->optlen) {
829 		return -EINVAL;
830 	}
831 
832 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
833 		if (req->opcode == IORING_OP_RECVMSG)
834 			return -EINVAL;
835 	}
836 
837 	if (io_is_compat(req->ctx))
838 		sr->msg_flags |= MSG_CMSG_COMPAT;
839 
840 	sr->nr_multishot_loops = 0;
841 	return io_recvmsg_prep_setup(req);
842 }
843 
/*
 * CQE flag bits that are cleared from the saved flags and inherited from
 * the newest completion when a bundle receive is retried.
 */
#define CQE_F_MASK	(IORING_CQE_F_SOCK_NONEMPTY | \
			 IORING_CQE_F_MORE | \
			 IORING_CQE_F_BUF_MORE)
847 
848 /*
849  * Finishes io_recv and io_recvmsg.
850  *
851  * Returns true if it is actually finished, or false if it should run
852  * again (for multishot).
853  */
io_recv_finish(struct io_kiocb * req,struct io_async_msghdr * kmsg,struct io_br_sel * sel,bool mshot_finished,unsigned issue_flags)854 static inline bool io_recv_finish(struct io_kiocb *req,
855 				  struct io_async_msghdr *kmsg,
856 				  struct io_br_sel *sel, bool mshot_finished,
857 				  unsigned issue_flags)
858 {
859 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
860 	unsigned int cflags = 0;
861 
862 	if (kmsg->msg.msg_inq > 0)
863 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
864 
865 	if (sel->val > 0 && sr->flags & IORING_RECV_MSHOT_LIM) {
866 		/*
867 		 * If sr->len hits zero, the limit has been reached. Mark
868 		 * mshot as finished, and flag MSHOT_DONE as well to prevent
869 		 * a potential bundle from being retried.
870 		 */
871 		sr->mshot_total_len -= min_t(int, sel->val, sr->mshot_total_len);
872 		if (!sr->mshot_total_len) {
873 			sr->flags |= IORING_RECV_MSHOT_DONE;
874 			mshot_finished = true;
875 		}
876 	}
877 
878 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
879 		size_t this_ret = sel->val - sr->done_io;
880 
881 		cflags |= io_put_kbufs(req, this_ret, sel->buf_list, io_bundle_nbufs(kmsg, this_ret));
882 		if (sr->flags & IORING_RECV_RETRY)
883 			cflags = req->cqe.flags | (cflags & CQE_F_MASK);
884 		if (sr->mshot_len && sel->val >= sr->mshot_len)
885 			sr->flags |= IORING_RECV_MSHOT_CAP;
886 		/* bundle with no more immediate buffers, we're done */
887 		if (req->flags & REQ_F_BL_EMPTY)
888 			goto finish;
889 		/*
890 		 * If more is available AND it was a full transfer, retry and
891 		 * append to this one
892 		 */
893 		if (!(sr->flags & IORING_RECV_NO_RETRY) &&
894 		    kmsg->msg.msg_inq > 1 && this_ret > 0 &&
895 		    !iov_iter_count(&kmsg->msg.msg_iter)) {
896 			req->cqe.flags = cflags & ~CQE_F_MASK;
897 			sr->len = kmsg->msg.msg_inq;
898 			sr->done_io += this_ret;
899 			sr->flags |= IORING_RECV_RETRY;
900 			return false;
901 		}
902 	} else {
903 		cflags |= io_put_kbuf(req, sel->val, sel->buf_list);
904 	}
905 
906 	/*
907 	 * Fill CQE for this receive and see if we should keep trying to
908 	 * receive from this socket.
909 	 */
910 	if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
911 	    io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) {
912 		sel->val = IOU_RETRY;
913 		io_mshot_prep_retry(req, kmsg);
914 		/* Known not-empty or unknown state, retry */
915 		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
916 			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY &&
917 			    !(sr->flags & IORING_RECV_MSHOT_CAP)) {
918 				return false;
919 			}
920 			/* mshot retries exceeded, force a requeue */
921 			sr->nr_multishot_loops = 0;
922 			sr->flags &= ~IORING_RECV_MSHOT_CAP;
923 			if (issue_flags & IO_URING_F_MULTISHOT)
924 				sel->val = IOU_REQUEUE;
925 		}
926 		return true;
927 	}
928 
929 	/* Finish the request / stop multishot. */
930 finish:
931 	io_req_set_res(req, sel->val, cflags);
932 	sel->val = IOU_COMPLETE;
933 	io_req_msg_cleanup(req, issue_flags);
934 	return true;
935 }
936 
/*
 * Carve the multishot recvmsg header area out of a selected buffer.
 *
 * The buffer layout is: struct io_uring_recvmsg_out, then kmsg->namelen
 * bytes for the name, then kmsg->controllen bytes for control data, then
 * the payload. On success sr->buf keeps the start of the buffer (for the
 * later header copy), *buf/*len are advanced to describe the payload area
 * and kmsg->payloadlen records its size.
 *
 * Returns 0 on success, -EFAULT if the buffer is too small for the header.
 */
static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
				     struct io_sr_msg *sr, void __user **buf,
				     size_t *len)
{
	unsigned long base = (unsigned long) *buf;
	unsigned long hdr_len;

	hdr_len = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
		  kmsg->controllen;
	if (*len < hdr_len)
		return -EFAULT;

	if (kmsg->controllen) {
		/* control data occupies the tail of the header area */
		unsigned long ctl = base + hdr_len - kmsg->controllen;

		kmsg->msg.msg_control_user = (void __user *) ctl;
		kmsg->msg.msg_controllen = kmsg->controllen;
	}

	sr->buf = *buf; /* stash for later copy */
	*buf = (void __user *) (base + hdr_len);
	*len -= hdr_len;
	kmsg->payloadlen = *len;
	return 0;
}
961 
962 struct io_recvmsg_multishot_hdr {
963 	struct io_uring_recvmsg_out msg;
964 	struct sockaddr_storage addr;
965 };
966 
/*
 * Perform one receive for a multishot recvmsg request and write the
 * io_uring_recvmsg_out header (plus the possibly truncated source address)
 * to the start of the user buffer stashed in @io->buf.
 *
 * *finished is set when the receive returned <= 0 or the header copy
 * faulted.
 *
 * Returns a negative error, or the number of buffer bytes used: header +
 * reserved name area + reserved control area + payload (clamped to the
 * payload area size).
 */
static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
				struct io_async_msghdr *kmsg,
				unsigned int flags, bool *finished)
{
	struct io_recvmsg_multishot_hdr hdr;
	int copy_len;
	int err;

	if (kmsg->namelen)
		kmsg->msg.msg_name = &hdr.addr;
	kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
	kmsg->msg.msg_namelen = 0;

	if (sock->file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;

	err = sock_recvmsg(sock, &kmsg->msg, flags);
	*finished = err <= 0;
	if (err < 0)
		return err;

	/* payloadlen reports the untruncated receive size */
	hdr.msg = (struct io_uring_recvmsg_out) {
		.controllen = kmsg->controllen - kmsg->msg.msg_controllen,
		.flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT,
		.payloadlen = err,
	};

	if (err > kmsg->payloadlen)
		err = kmsg->payloadlen;

	/* copy at most the reserved name area */
	copy_len = sizeof(struct io_uring_recvmsg_out);
	copy_len += kmsg->msg.msg_namelen > kmsg->namelen ?
			kmsg->namelen : kmsg->msg.msg_namelen;

	/*
	 *      "fromlen shall refer to the value before truncation.."
	 *                      1003.1g
	 */
	hdr.msg.namelen = kmsg->msg.msg_namelen;

	/* ensure that there is no gap between hdr and sockaddr_storage */
	BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
		     sizeof(struct io_uring_recvmsg_out));
	if (copy_to_user(io->buf, &hdr, copy_len)) {
		*finished = true;
		return -EFAULT;
	}

	return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
			kmsg->controllen + err;
}
1020 
io_recvmsg(struct io_kiocb * req,unsigned int issue_flags)1021 int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
1022 {
1023 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1024 	struct io_async_msghdr *kmsg = req->async_data;
1025 	struct io_br_sel sel = { };
1026 	struct socket *sock;
1027 	unsigned flags;
1028 	int ret, min_ret = 0;
1029 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1030 	bool mshot_finished = true;
1031 
1032 	sock = sock_from_file(req->file);
1033 	if (unlikely(!sock))
1034 		return -ENOTSOCK;
1035 
1036 	if (!(req->flags & REQ_F_POLLED) &&
1037 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1038 		return -EAGAIN;
1039 
1040 	flags = sr->msg_flags;
1041 	if (force_nonblock)
1042 		flags |= MSG_DONTWAIT;
1043 
1044 retry_multishot:
1045 	sel.buf_list = NULL;
1046 	if (io_do_buffer_select(req)) {
1047 		size_t len = sr->len;
1048 
1049 		sel = io_buffer_select(req, &len, sr->buf_group, issue_flags);
1050 		if (!sel.addr)
1051 			return -ENOBUFS;
1052 
1053 		if (req->flags & REQ_F_APOLL_MULTISHOT) {
1054 			ret = io_recvmsg_prep_multishot(kmsg, sr, &sel.addr, &len);
1055 			if (ret) {
1056 				io_kbuf_recycle(req, sel.buf_list, issue_flags);
1057 				return ret;
1058 			}
1059 		}
1060 
1061 		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, sel.addr, len);
1062 	}
1063 
1064 	kmsg->msg.msg_get_inq = 1;
1065 	kmsg->msg.msg_inq = -1;
1066 	if (req->flags & REQ_F_APOLL_MULTISHOT) {
1067 		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
1068 					   &mshot_finished);
1069 	} else {
1070 		/* disable partial retry for recvmsg with cmsg attached */
1071 		if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
1072 			min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1073 
1074 		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
1075 					 kmsg->uaddr, flags);
1076 	}
1077 
1078 	if (ret < min_ret) {
1079 		if (ret == -EAGAIN && force_nonblock) {
1080 			io_kbuf_recycle(req, sel.buf_list, issue_flags);
1081 			return IOU_RETRY;
1082 		}
1083 		if (ret > 0 && io_net_retry(sock, flags)) {
1084 			sr->done_io += ret;
1085 			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
1086 		}
1087 		if (ret == -ERESTARTSYS)
1088 			ret = -EINTR;
1089 		req_set_fail(req);
1090 	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
1091 		req_set_fail(req);
1092 	}
1093 
1094 	if (ret > 0)
1095 		ret += sr->done_io;
1096 	else if (sr->done_io)
1097 		ret = sr->done_io;
1098 	else
1099 		io_kbuf_recycle(req, sel.buf_list, issue_flags);
1100 
1101 	sel.val = ret;
1102 	if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags))
1103 		goto retry_multishot;
1104 
1105 	return sel.val;
1106 }
1107 
io_recv_buf_select(struct io_kiocb * req,struct io_async_msghdr * kmsg,struct io_br_sel * sel,unsigned int issue_flags)1108 static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
1109 			      struct io_br_sel *sel, unsigned int issue_flags)
1110 {
1111 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1112 	int ret;
1113 
1114 	/*
1115 	 * If the ring isn't locked, then don't use the peek interface
1116 	 * to grab multiple buffers as we will lock/unlock between
1117 	 * this selection and posting the buffers.
1118 	 */
1119 	if (!(issue_flags & IO_URING_F_UNLOCKED) &&
1120 	    sr->flags & IORING_RECVSEND_BUNDLE) {
1121 		struct buf_sel_arg arg = {
1122 			.iovs = &kmsg->fast_iov,
1123 			.nr_iovs = 1,
1124 			.mode = KBUF_MODE_EXPAND,
1125 			.buf_group = sr->buf_group,
1126 		};
1127 
1128 		if (kmsg->vec.iovec) {
1129 			arg.nr_iovs = kmsg->vec.nr;
1130 			arg.iovs = kmsg->vec.iovec;
1131 			arg.mode |= KBUF_MODE_FREE;
1132 		}
1133 
1134 		if (sel->val)
1135 			arg.max_len = sel->val;
1136 		else if (kmsg->msg.msg_inq > 1)
1137 			arg.max_len = min_not_zero(sel->val, (ssize_t) kmsg->msg.msg_inq);
1138 
1139 		/* if mshot limited, ensure we don't go over */
1140 		if (sr->flags & IORING_RECV_MSHOT_LIM)
1141 			arg.max_len = min_not_zero(arg.max_len, sr->mshot_total_len);
1142 		ret = io_buffers_peek(req, &arg, sel);
1143 		if (unlikely(ret < 0))
1144 			return ret;
1145 
1146 		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
1147 			kmsg->vec.nr = ret;
1148 			kmsg->vec.iovec = arg.iovs;
1149 			req->flags |= REQ_F_NEED_CLEANUP;
1150 		}
1151 		if (arg.partial_map)
1152 			sr->flags |= IORING_RECV_PARTIAL_MAP;
1153 
1154 		/* special case 1 vec, can be a fast path */
1155 		if (ret == 1) {
1156 			sr->buf = arg.iovs[0].iov_base;
1157 			sr->len = arg.iovs[0].iov_len;
1158 			goto map_ubuf;
1159 		}
1160 		iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
1161 				arg.out_len);
1162 	} else {
1163 		size_t len = sel->val;
1164 
1165 		*sel = io_buffer_select(req, &len, sr->buf_group, issue_flags);
1166 		if (!sel->addr)
1167 			return -ENOBUFS;
1168 		sr->buf = sel->addr;
1169 		sr->len = len;
1170 map_ubuf:
1171 		ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
1172 				  &kmsg->msg.msg_iter);
1173 		if (unlikely(ret))
1174 			return ret;
1175 	}
1176 
1177 	return 0;
1178 }
1179 
io_recv(struct io_kiocb * req,unsigned int issue_flags)1180 int io_recv(struct io_kiocb *req, unsigned int issue_flags)
1181 {
1182 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1183 	struct io_async_msghdr *kmsg = req->async_data;
1184 	struct io_br_sel sel;
1185 	struct socket *sock;
1186 	unsigned flags;
1187 	int ret, min_ret = 0;
1188 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1189 	bool mshot_finished;
1190 
1191 	if (!(req->flags & REQ_F_POLLED) &&
1192 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1193 		return -EAGAIN;
1194 
1195 	sock = sock_from_file(req->file);
1196 	if (unlikely(!sock))
1197 		return -ENOTSOCK;
1198 
1199 	flags = sr->msg_flags;
1200 	if (force_nonblock)
1201 		flags |= MSG_DONTWAIT;
1202 
1203 retry_multishot:
1204 	sel.buf_list = NULL;
1205 	if (io_do_buffer_select(req)) {
1206 		sel.val = sr->len;
1207 		ret = io_recv_buf_select(req, kmsg, &sel, issue_flags);
1208 		if (unlikely(ret < 0)) {
1209 			kmsg->msg.msg_inq = -1;
1210 			goto out_free;
1211 		}
1212 		sr->buf = NULL;
1213 	}
1214 
1215 	kmsg->msg.msg_flags = 0;
1216 	kmsg->msg.msg_inq = -1;
1217 
1218 	if (flags & MSG_WAITALL)
1219 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1220 
1221 	ret = sock_recvmsg(sock, &kmsg->msg, flags);
1222 	if (ret < min_ret) {
1223 		if (ret == -EAGAIN && force_nonblock) {
1224 			io_kbuf_recycle(req, sel.buf_list, issue_flags);
1225 			return IOU_RETRY;
1226 		}
1227 		if (ret > 0 && io_net_retry(sock, flags)) {
1228 			sr->len -= ret;
1229 			sr->buf += ret;
1230 			sr->done_io += ret;
1231 			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
1232 		}
1233 		if (ret == -ERESTARTSYS)
1234 			ret = -EINTR;
1235 		req_set_fail(req);
1236 	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
1237 out_free:
1238 		req_set_fail(req);
1239 	}
1240 
1241 	mshot_finished = ret <= 0;
1242 	if (ret > 0)
1243 		ret += sr->done_io;
1244 	else if (sr->done_io)
1245 		ret = sr->done_io;
1246 	else
1247 		io_kbuf_recycle(req, sel.buf_list, issue_flags);
1248 
1249 	sel.val = ret;
1250 	if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags))
1251 		goto retry_multishot;
1252 
1253 	return sel.val;
1254 }
1255 
io_recvzc_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)1256 int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1257 {
1258 	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
1259 	unsigned ifq_idx;
1260 
1261 	if (unlikely(sqe->addr2 || sqe->addr || sqe->addr3))
1262 		return -EINVAL;
1263 
1264 	ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx);
1265 	zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx);
1266 	if (!zc->ifq)
1267 		return -EINVAL;
1268 
1269 	zc->len = READ_ONCE(sqe->len);
1270 	zc->flags = READ_ONCE(sqe->ioprio);
1271 	if (READ_ONCE(sqe->msg_flags))
1272 		return -EINVAL;
1273 	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT))
1274 		return -EINVAL;
1275 	/* multishot required */
1276 	if (!(zc->flags & IORING_RECV_MULTISHOT))
1277 		return -EINVAL;
1278 	/* All data completions are posted as aux CQEs. */
1279 	req->flags |= REQ_F_APOLL_MULTISHOT;
1280 
1281 	return 0;
1282 }
1283 
io_recvzc(struct io_kiocb * req,unsigned int issue_flags)1284 int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
1285 {
1286 	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
1287 	struct socket *sock;
1288 	unsigned int len;
1289 	int ret;
1290 
1291 	if (!(req->flags & REQ_F_POLLED) &&
1292 	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
1293 		return -EAGAIN;
1294 
1295 	sock = sock_from_file(req->file);
1296 	if (unlikely(!sock))
1297 		return -ENOTSOCK;
1298 
1299 	len = zc->len;
1300 	ret = io_zcrx_recv(req, zc->ifq, sock, 0, issue_flags, &zc->len);
1301 	if (len && zc->len == 0) {
1302 		io_req_set_res(req, 0, 0);
1303 
1304 		return IOU_COMPLETE;
1305 	}
1306 	if (unlikely(ret <= 0) && ret != -EAGAIN) {
1307 		if (ret == -ERESTARTSYS)
1308 			ret = -EINTR;
1309 		if (ret == IOU_REQUEUE)
1310 			return IOU_REQUEUE;
1311 
1312 		req_set_fail(req);
1313 		io_req_set_res(req, ret, 0);
1314 		return IOU_COMPLETE;
1315 	}
1316 	return IOU_RETRY;
1317 }
1318 
io_send_zc_cleanup(struct io_kiocb * req)1319 void io_send_zc_cleanup(struct io_kiocb *req)
1320 {
1321 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1322 	struct io_async_msghdr *io = req->async_data;
1323 
1324 	if (req_has_async_data(req))
1325 		io_netmsg_iovec_free(io);
1326 	if (zc->notif) {
1327 		io_notif_flush(zc->notif);
1328 		zc->notif = NULL;
1329 	}
1330 }
1331 
/* zero-copy send flags that need no extra handling in io_send_zc_prep() */
#define IO_ZC_FLAGS_COMMON \
	(IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
/* every flag io_send_zc_prep() accepts in sqe->ioprio */
#define IO_ZC_FLAGS_VALID \
	(IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE | \
	 IORING_SEND_VECTORIZED)
1335 
io_send_zc_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)1336 int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1337 {
1338 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1339 	struct io_ring_ctx *ctx = req->ctx;
1340 	struct io_async_msghdr *iomsg;
1341 	struct io_kiocb *notif;
1342 	u64 user_data;
1343 	int ret;
1344 
1345 	zc->done_io = 0;
1346 
1347 	if (unlikely(READ_ONCE(sqe->__pad2[0])))
1348 		return -EINVAL;
1349 	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
1350 	if (req->flags & REQ_F_CQE_SKIP)
1351 		return -EINVAL;
1352 
1353 	notif = zc->notif = io_alloc_notif(ctx);
1354 	if (!notif)
1355 		return -ENOMEM;
1356 	user_data = READ_ONCE(sqe->addr3);
1357 	if (!user_data)
1358 		user_data = req->cqe.user_data;
1359 
1360 	notif->cqe.user_data = user_data;
1361 	notif->cqe.res = 0;
1362 	notif->cqe.flags = IORING_CQE_F_NOTIF;
1363 	req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY;
1364 
1365 	zc->flags = READ_ONCE(sqe->ioprio);
1366 	if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
1367 		if (zc->flags & ~IO_ZC_FLAGS_VALID)
1368 			return -EINVAL;
1369 		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
1370 			struct io_notif_data *nd = io_notif_to_data(notif);
1371 
1372 			nd->zc_report = true;
1373 			nd->zc_used = false;
1374 			nd->zc_copied = false;
1375 		}
1376 	}
1377 
1378 	zc->len = READ_ONCE(sqe->len);
1379 	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
1380 	req->buf_index = READ_ONCE(sqe->buf_index);
1381 	if (zc->msg_flags & MSG_DONTWAIT)
1382 		req->flags |= REQ_F_NOWAIT;
1383 
1384 	if (io_is_compat(ctx))
1385 		zc->msg_flags |= MSG_CMSG_COMPAT;
1386 
1387 	iomsg = io_msg_alloc_async(req);
1388 	if (unlikely(!iomsg))
1389 		return -ENOMEM;
1390 
1391 	if (req->opcode == IORING_OP_SEND_ZC) {
1392 		ret = io_send_setup(req, sqe);
1393 	} else {
1394 		if (unlikely(sqe->addr2 || sqe->file_index))
1395 			return -EINVAL;
1396 		ret = io_sendmsg_setup(req, sqe);
1397 	}
1398 	if (unlikely(ret))
1399 		return ret;
1400 
1401 	if (!(zc->flags & IORING_RECVSEND_FIXED_BUF)) {
1402 		iomsg->msg.sg_from_iter = io_sg_from_iter_iovec;
1403 		return io_notif_account_mem(zc->notif, iomsg->msg.msg_iter.count);
1404 	}
1405 	iomsg->msg.sg_from_iter = io_sg_from_iter;
1406 	return 0;
1407 }
1408 
/*
 * sg_from_iter callback installed by io_send_zc_prep() for sends that do not
 * use a registered buffer: downgrade the skb from managed frags, then fill
 * it from the iterator with zerocopy_fill_skb_from_iter().
 */
static int io_sg_from_iter_iovec(struct sk_buff *skb,
				 struct iov_iter *from, size_t length)
{
	int filled;

	skb_zcopy_downgrade_managed(skb);
	filled = zerocopy_fill_skb_from_iter(skb, from, length);
	return filled;
}
1415 
/*
 * sg_from_iter callback installed by io_send_zc_prep() for sends from a
 * registered buffer: attach up to @length bytes of the bvec iterator @from to
 * @skb as page frags.
 *
 * An skb without frags is marked SKBFL_MANAGED_FRAG_REFS; an skb that already
 * has frags but is not managed falls back to zerocopy_fill_skb_from_iter().
 *
 * Returns 0 when all requested bytes were attached, -EMSGSIZE when the skb
 * ran out of frag slots first (the frags filled so far stay attached and
 * @from is advanced past them), or the result of the fallback.
 */
static int io_sg_from_iter(struct sk_buff *skb,
			   struct iov_iter *from, size_t length)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int frag = shinfo->nr_frags;
	int ret = 0;
	struct bvec_iter bi;
	ssize_t copied = 0;
	unsigned long truesize = 0;

	if (!frag)
		shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
	else if (unlikely(!skb_zcopy_managed(skb)))
		return zerocopy_fill_skb_from_iter(skb, from, length);

	/* walk the bvec array from the iterator's current position */
	bi.bi_size = min(from->count, length);
	bi.bi_bvec_done = from->iov_offset;
	bi.bi_idx = 0;

	while (bi.bi_size && frag < MAX_SKB_FRAGS) {
		struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);

		copied += v.bv_len;
		/* each frag is charged in whole pages */
		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
		__skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
					   v.bv_offset, v.bv_len);
		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
	}
	/* data left over means the frag slots ran out */
	if (bi.bi_size)
		ret = -EMSGSIZE;

	/* write the progress back into the caller's iterator */
	shinfo->nr_frags = frag;
	from->bvec += bi.bi_idx;
	from->nr_segs -= bi.bi_idx;
	from->count -= copied;
	from->iov_offset = bi.bi_bvec_done;

	skb->data_len += copied;
	skb->len += copied;
	skb->truesize += truesize;
	return ret;
}
1458 
io_send_zc_import(struct io_kiocb * req,struct io_async_msghdr * kmsg,unsigned int issue_flags)1459 static int io_send_zc_import(struct io_kiocb *req,
1460 			     struct io_async_msghdr *kmsg,
1461 			     unsigned int issue_flags)
1462 {
1463 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1464 	struct io_kiocb *notif = sr->notif;
1465 	int ret;
1466 
1467 	WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF));
1468 
1469 	notif->buf_index = req->buf_index;
1470 
1471 	if (!(sr->flags & IORING_SEND_VECTORIZED)) {
1472 		ret = io_import_reg_buf(notif, &kmsg->msg.msg_iter,
1473 					(u64)(uintptr_t)sr->buf, sr->len,
1474 					ITER_SOURCE, issue_flags);
1475 	} else {
1476 		unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
1477 
1478 		ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter,
1479 					notif, &kmsg->vec, uvec_segs,
1480 					issue_flags);
1481 	}
1482 
1483 	if (unlikely(ret))
1484 		return ret;
1485 	req->flags &= ~REQ_F_IMPORT_BUFFER;
1486 	return 0;
1487 }
1488 
io_sendmsg_zc(struct io_kiocb * req,unsigned int issue_flags)1489 int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
1490 {
1491 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1492 	struct io_async_msghdr *kmsg = req->async_data;
1493 	struct socket *sock;
1494 	unsigned msg_flags;
1495 	int ret, min_ret = 0;
1496 
1497 	sock = sock_from_file(req->file);
1498 	if (unlikely(!sock))
1499 		return -ENOTSOCK;
1500 	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
1501 		return -EOPNOTSUPP;
1502 	if (!(req->flags & REQ_F_POLLED) &&
1503 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1504 		return -EAGAIN;
1505 
1506 	if (req->flags & REQ_F_IMPORT_BUFFER) {
1507 		ret = io_send_zc_import(req, kmsg, issue_flags);
1508 		if (unlikely(ret))
1509 			return ret;
1510 	}
1511 
1512 	msg_flags = sr->msg_flags;
1513 	if (issue_flags & IO_URING_F_NONBLOCK)
1514 		msg_flags |= MSG_DONTWAIT;
1515 	if (msg_flags & MSG_WAITALL)
1516 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1517 
1518 	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
1519 
1520 	if (req->opcode == IORING_OP_SEND_ZC) {
1521 		msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
1522 		kmsg->msg.msg_flags = msg_flags;
1523 		ret = sock_sendmsg(sock, &kmsg->msg);
1524 	} else {
1525 		kmsg->msg.msg_control_user = sr->msg_control;
1526 		ret = __sys_sendmsg_sock(sock, &kmsg->msg, msg_flags);
1527 	}
1528 
1529 	if (unlikely(ret < min_ret)) {
1530 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1531 			return -EAGAIN;
1532 
1533 		if (ret > 0 && io_net_retry(sock, sr->msg_flags)) {
1534 			sr->done_io += ret;
1535 			return -EAGAIN;
1536 		}
1537 		if (ret == -ERESTARTSYS)
1538 			ret = -EINTR;
1539 		req_set_fail(req);
1540 	}
1541 
1542 	if (ret >= 0)
1543 		ret += sr->done_io;
1544 	else if (sr->done_io)
1545 		ret = sr->done_io;
1546 
1547 	/*
1548 	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
1549 	 * flushing notif to io_send_zc_cleanup()
1550 	 */
1551 	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1552 		io_notif_flush(sr->notif);
1553 		sr->notif = NULL;
1554 		io_req_msg_cleanup(req, 0);
1555 	}
1556 	io_req_set_res(req, ret, IORING_CQE_F_MORE);
1557 	return IOU_COMPLETE;
1558 }
1559 
io_sendrecv_fail(struct io_kiocb * req)1560 void io_sendrecv_fail(struct io_kiocb *req)
1561 {
1562 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1563 
1564 	if (sr->done_io)
1565 		req->cqe.res = sr->done_io;
1566 
1567 	if ((req->flags & REQ_F_NEED_CLEANUP) &&
1568 	    (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC))
1569 		req->cqe.flags |= IORING_CQE_F_MORE;
1570 }
1571 
/* io_uring specific accept flags that io_accept_prep() accepts in sqe->ioprio */
#define ACCEPT_FLAGS \
	(IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \
	 IORING_ACCEPT_POLL_FIRST)
1574 
io_accept_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)1575 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1576 {
1577 	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1578 
1579 	if (sqe->len || sqe->buf_index)
1580 		return -EINVAL;
1581 
1582 	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1583 	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
1584 	accept->flags = READ_ONCE(sqe->accept_flags);
1585 	accept->nofile = rlimit(RLIMIT_NOFILE);
1586 	accept->iou_flags = READ_ONCE(sqe->ioprio);
1587 	if (accept->iou_flags & ~ACCEPT_FLAGS)
1588 		return -EINVAL;
1589 
1590 	accept->file_slot = READ_ONCE(sqe->file_index);
1591 	if (accept->file_slot) {
1592 		if (accept->flags & SOCK_CLOEXEC)
1593 			return -EINVAL;
1594 		if (accept->iou_flags & IORING_ACCEPT_MULTISHOT &&
1595 		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
1596 			return -EINVAL;
1597 	}
1598 	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1599 		return -EINVAL;
1600 	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
1601 		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1602 	if (accept->iou_flags & IORING_ACCEPT_MULTISHOT)
1603 		req->flags |= REQ_F_APOLL_MULTISHOT;
1604 	if (accept->iou_flags & IORING_ACCEPT_DONTWAIT)
1605 		req->flags |= REQ_F_NOWAIT;
1606 	return 0;
1607 }
1608 
io_accept(struct io_kiocb * req,unsigned int issue_flags)1609 int io_accept(struct io_kiocb *req, unsigned int issue_flags)
1610 {
1611 	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1612 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1613 	bool fixed = !!accept->file_slot;
1614 	struct proto_accept_arg arg = {
1615 		.flags = force_nonblock ? O_NONBLOCK : 0,
1616 	};
1617 	struct file *file;
1618 	unsigned cflags;
1619 	int ret, fd;
1620 
1621 	if (!(req->flags & REQ_F_POLLED) &&
1622 	    accept->iou_flags & IORING_ACCEPT_POLL_FIRST)
1623 		return -EAGAIN;
1624 
1625 retry:
1626 	if (!fixed) {
1627 		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
1628 		if (unlikely(fd < 0))
1629 			return fd;
1630 	}
1631 	arg.err = 0;
1632 	arg.is_empty = -1;
1633 	file = do_accept(req->file, &arg, accept->addr, accept->addr_len,
1634 			 accept->flags);
1635 	if (IS_ERR(file)) {
1636 		if (!fixed)
1637 			put_unused_fd(fd);
1638 		ret = PTR_ERR(file);
1639 		if (ret == -EAGAIN && force_nonblock &&
1640 		    !(accept->iou_flags & IORING_ACCEPT_DONTWAIT))
1641 			return IOU_RETRY;
1642 
1643 		if (ret == -ERESTARTSYS)
1644 			ret = -EINTR;
1645 	} else if (!fixed) {
1646 		fd_install(fd, file);
1647 		ret = fd;
1648 	} else {
1649 		ret = io_fixed_fd_install(req, issue_flags, file,
1650 						accept->file_slot);
1651 	}
1652 
1653 	cflags = 0;
1654 	if (!arg.is_empty)
1655 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
1656 
1657 	if (ret >= 0 && (req->flags & REQ_F_APOLL_MULTISHOT) &&
1658 	    io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
1659 		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1)
1660 			goto retry;
1661 		return IOU_RETRY;
1662 	}
1663 
1664 	io_req_set_res(req, ret, cflags);
1665 	if (ret < 0)
1666 		req_set_fail(req);
1667 	return IOU_COMPLETE;
1668 }
1669 
io_socket_bpf_populate(struct io_uring_bpf_ctx * bctx,struct io_kiocb * req)1670 void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req)
1671 {
1672 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1673 
1674 	bctx->socket.family = sock->domain;
1675 	bctx->socket.type = sock->type;
1676 	bctx->socket.protocol = sock->protocol;
1677 }
1678 
io_socket_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)1679 int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1680 {
1681 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1682 
1683 	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
1684 		return -EINVAL;
1685 
1686 	sock->domain = READ_ONCE(sqe->fd);
1687 	sock->type = READ_ONCE(sqe->off);
1688 	sock->protocol = READ_ONCE(sqe->len);
1689 	sock->file_slot = READ_ONCE(sqe->file_index);
1690 	sock->nofile = rlimit(RLIMIT_NOFILE);
1691 
1692 	sock->flags = sock->type & ~SOCK_TYPE_MASK;
1693 	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
1694 		return -EINVAL;
1695 	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1696 		return -EINVAL;
1697 	return 0;
1698 }
1699 
io_socket(struct io_kiocb * req,unsigned int issue_flags)1700 int io_socket(struct io_kiocb *req, unsigned int issue_flags)
1701 {
1702 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1703 	bool fixed = !!sock->file_slot;
1704 	struct file *file;
1705 	int ret, fd;
1706 
1707 	if (!fixed) {
1708 		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
1709 		if (unlikely(fd < 0))
1710 			return fd;
1711 	}
1712 	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
1713 	if (IS_ERR(file)) {
1714 		if (!fixed)
1715 			put_unused_fd(fd);
1716 		ret = PTR_ERR(file);
1717 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1718 			return -EAGAIN;
1719 		if (ret == -ERESTARTSYS)
1720 			ret = -EINTR;
1721 		req_set_fail(req);
1722 	} else if (!fixed) {
1723 		fd_install(fd, file);
1724 		ret = fd;
1725 	} else {
1726 		ret = io_fixed_fd_install(req, issue_flags, file,
1727 					    sock->file_slot);
1728 	}
1729 	io_req_set_res(req, ret, 0);
1730 	return IOU_COMPLETE;
1731 }
1732 
io_connect_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)1733 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1734 {
1735 	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
1736 	struct io_async_msghdr *io;
1737 
1738 	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
1739 		return -EINVAL;
1740 
1741 	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1742 	conn->addr_len =  READ_ONCE(sqe->addr2);
1743 	conn->in_progress = conn->seen_econnaborted = false;
1744 
1745 	io = io_msg_alloc_async(req);
1746 	if (unlikely(!io))
1747 		return -ENOMEM;
1748 
1749 	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr);
1750 }
1751 
io_connect(struct io_kiocb * req,unsigned int issue_flags)1752 int io_connect(struct io_kiocb *req, unsigned int issue_flags)
1753 {
1754 	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
1755 	struct io_async_msghdr *io = req->async_data;
1756 	unsigned file_flags;
1757 	int ret;
1758 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1759 
1760 	if (connect->in_progress) {
1761 		struct poll_table_struct pt = { ._key = EPOLLERR };
1762 
1763 		if (vfs_poll(req->file, &pt) & EPOLLERR)
1764 			goto get_sock_err;
1765 	}
1766 
1767 	file_flags = force_nonblock ? O_NONBLOCK : 0;
1768 
1769 	ret = __sys_connect_file(req->file, &io->addr, connect->addr_len,
1770 				 file_flags);
1771 	if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
1772 	    && force_nonblock) {
1773 		if (ret == -EINPROGRESS) {
1774 			connect->in_progress = true;
1775 		} else if (ret == -ECONNABORTED) {
1776 			if (connect->seen_econnaborted)
1777 				goto out;
1778 			connect->seen_econnaborted = true;
1779 		}
1780 		return -EAGAIN;
1781 	}
1782 	if (connect->in_progress) {
1783 		/*
1784 		 * At least bluetooth will return -EBADFD on a re-connect
1785 		 * attempt, and it's (supposedly) also valid to get -EISCONN
1786 		 * which means the previous result is good. For both of these,
1787 		 * grab the sock_error() and use that for the completion.
1788 		 */
1789 		if (ret == -EBADFD || ret == -EISCONN) {
1790 get_sock_err:
1791 			ret = sock_error(sock_from_file(req->file)->sk);
1792 		}
1793 	}
1794 	if (ret == -ERESTARTSYS)
1795 		ret = -EINTR;
1796 out:
1797 	if (ret < 0)
1798 		req_set_fail(req);
1799 	io_req_msg_cleanup(req, issue_flags);
1800 	io_req_set_res(req, ret, 0);
1801 	return IOU_COMPLETE;
1802 }
1803 
1804 /*
1805  * Check if bind request would potentially end up with filename_create(),
1806  * which in turn end up in mnt_want_write() which will grab the fs
1807  * percpu start write sem. This can trigger a lockdep warning.
1808  */
io_bind_file_create(const struct io_async_msghdr * io,int addr_len)1809 static int io_bind_file_create(const struct io_async_msghdr *io, int addr_len)
1810 {
1811 	const struct sockaddr_un *sun;
1812 
1813 	if (io->addr.ss_family != AF_UNIX)
1814 		return 0;
1815 	if (addr_len <= offsetof(struct sockaddr_un, sun_path))
1816 		return 0;
1817 	sun = (const struct sockaddr_un *) &io->addr;
1818 	return sun->sun_path[0] != '\0';
1819 }
1820 
io_bind_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)1821 int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1822 {
1823 	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
1824 	struct sockaddr __user *uaddr;
1825 	struct io_async_msghdr *io;
1826 	int ret;
1827 
1828 	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
1829 		return -EINVAL;
1830 
1831 	uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1832 	bind->addr_len =  READ_ONCE(sqe->addr2);
1833 
1834 	io = io_msg_alloc_async(req);
1835 	if (unlikely(!io))
1836 		return -ENOMEM;
1837 	ret = move_addr_to_kernel(uaddr, bind->addr_len, &io->addr);
1838 	if (unlikely(ret))
1839 		return ret;
1840 	if (io_bind_file_create(io, bind->addr_len))
1841 		req->flags |= REQ_F_FORCE_ASYNC;
1842 	return 0;
1843 }
1844 
io_bind(struct io_kiocb * req,unsigned int issue_flags)1845 int io_bind(struct io_kiocb *req, unsigned int issue_flags)
1846 {
1847 	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
1848 	struct io_async_msghdr *io = req->async_data;
1849 	struct socket *sock;
1850 	int ret;
1851 
1852 	sock = sock_from_file(req->file);
1853 	if (unlikely(!sock))
1854 		return -ENOTSOCK;
1855 
1856 	ret = __sys_bind_socket(sock, &io->addr, bind->addr_len);
1857 	if (ret < 0)
1858 		req_set_fail(req);
1859 	io_req_set_res(req, ret, 0);
1860 	return 0;
1861 }
1862 
io_listen_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)1863 int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1864 {
1865 	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
1866 
1867 	if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2)
1868 		return -EINVAL;
1869 
1870 	listen->backlog = READ_ONCE(sqe->len);
1871 	return 0;
1872 }
1873 
io_listen(struct io_kiocb * req,unsigned int issue_flags)1874 int io_listen(struct io_kiocb *req, unsigned int issue_flags)
1875 {
1876 	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
1877 	struct socket *sock;
1878 	int ret;
1879 
1880 	sock = sock_from_file(req->file);
1881 	if (unlikely(!sock))
1882 		return -ENOTSOCK;
1883 
1884 	ret = __sys_listen_socket(sock, listen->backlog);
1885 	if (ret < 0)
1886 		req_set_fail(req);
1887 	io_req_set_res(req, ret, 0);
1888 	return 0;
1889 }
1890 
io_netmsg_cache_free(const void * entry)1891 void io_netmsg_cache_free(const void *entry)
1892 {
1893 	struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
1894 
1895 	io_vec_free(&kmsg->vec);
1896 	kfree(kmsg);
1897 }
1898