xref: /linux/io_uring/net.c (revision 2c142b63c8ee982cdfdba49a616027c266294838)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/kernel.h>
3 #include <linux/errno.h>
4 #include <linux/file.h>
5 #include <linux/slab.h>
6 #include <linux/net.h>
7 #include <linux/un.h>
8 #include <linux/compat.h>
9 #include <net/compat.h>
10 #include <linux/io_uring.h>
11 
12 #include <uapi/linux/io_uring.h>
13 
14 #include "filetable.h"
15 #include "io_uring.h"
16 #include "kbuf.h"
17 #include "alloc_cache.h"
18 #include "net.h"
19 #include "notif.h"
20 #include "rsrc.h"
21 #include "zcrx.h"
22 
/* Per-request command data used by io_shutdown_prep() and io_shutdown(). */
23 struct io_shutdown {
24 	struct file			*file;
	/* value read from sqe->len, passed to __sys_shutdown_sock() */
25 	int				how;
26 };
27 
/*
 * Per-request command data for accept requests.
 * NOTE(review): the functions using this struct are not in this part of the
 * file; the field notes below are inferred from names only - confirm.
 */
28 struct io_accept {
29 	struct file			*file;
	/* presumably the user buffers for the peer address and its length */
30 	struct sockaddr __user		*addr;
31 	int __user			*addr_len;
32 	int				flags;
33 	int				iou_flags;
34 	u32				file_slot;
35 	unsigned long			nofile;
36 };
37 
/*
 * Per-request command data for socket-creation requests.
 * NOTE(review): users are not visible in this part of the file; domain/type/
 * protocol presumably mirror the socket(2) arguments - confirm.
 */
38 struct io_socket {
39 	struct file			*file;
40 	int				domain;
41 	int				type;
42 	int				protocol;
43 	int				flags;
44 	u32				file_slot;
45 	unsigned long			nofile;
46 };
47 
/*
 * Per-request command data for connect requests.
 * NOTE(review): users are not visible in this part of the file; the two bools
 * look like retry-state tracking - confirm against the connect handlers.
 */
48 struct io_connect {
49 	struct file			*file;
50 	struct sockaddr __user		*addr;
51 	int				addr_len;
52 	bool				in_progress;
53 	bool				seen_econnaborted;
54 };
55 
/*
 * Per-request command data for bind requests.
 * NOTE(review): users are not visible in this part of the file.
 */
57 struct io_bind {
57 	struct file			*file;
58 	int				addr_len;
59 };
60 
/*
 * Per-request command data for listen requests.
 * NOTE(review): users are not visible in this part of the file.
 */
61 struct io_listen {
62 	struct file			*file;
63 	int				backlog;
64 };
65 
/*
 * Per-request command data shared by the send/sendmsg and recv/recvmsg
 * handlers in this file.
 */
66 struct io_sr_msg {
67 	struct file			*file;
	/*
	 * User pointer taken from sqe->addr: a (compat) msghdr for the *msg
	 * opcodes, or the plain data buffer otherwise.
	 */
68 	union {
69 		struct compat_msghdr __user	*umsg_compat;
70 		struct user_msghdr __user	*umsg;
71 		void __user			*buf;
72 	};
	/* from sqe->len; rewritten when a provided buffer is selected */
73 	int				len;
	/* bytes moved by earlier partial attempts, folded into the result */
74 	unsigned			done_io;
	/* MSG_* flags from sqe->msg_flags plus flags added at prep time */
75 	unsigned			msg_flags;
	/* back-to-back multishot retries, bounded by MULTISHOT_MAX_RETRY */
76 	unsigned			nr_multishot_loops;
	/* sqe->ioprio flags in the low bits, enum sr_retry_flags above them */
77 	u16				flags;
78 	/* initialised and used only by !msg send variants */
79 	u16				buf_group;
80 	/* per-invocation mshot limit */
81 	unsigned			mshot_len;
82 	/* overall mshot byte limit */
83 	unsigned			mshot_total_len;
	/* msg_control_user saved by io_sendmsg_setup(), restored at issue */
84 	void __user			*msg_control;
85 	/* used only for send zerocopy */
86 	struct io_kiocb 		*notif;
87 };
88 
89 /*
90  * The UAPI flags are the lower 8 bits, as that's all sqe->ioprio will hold
91  * anyway. Use the upper 8 bits for internal uses.
92  */
93 enum sr_retry_flags {
	/* set by io_recv_finish() when a bundle receive is being retried */
94 	IORING_RECV_RETRY	= (1U << 15),
	/* NOTE(review): set/tested outside this part of the file */
95 	IORING_RECV_PARTIAL_MAP	= (1U << 14),
	/* a bundle completion reached sr->mshot_len; stop immediate retries */
96 	IORING_RECV_MSHOT_CAP	= (1U << 13),
	/* an overall byte limit (sr->mshot_total_len) was requested */
97 	IORING_RECV_MSHOT_LIM	= (1U << 12),
	/* the overall byte limit has been used up */
98 	IORING_RECV_MSHOT_DONE	= (1U << 11),
99 
	/* bits io_mshot_prep_retry() clears before the next multishot pass */
100 	IORING_RECV_RETRY_CLEAR	= IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP,
	/* any of these bits stops io_recv_finish() from retrying a bundle */
101 	IORING_RECV_NO_RETRY	= IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP |
102 				  IORING_RECV_MSHOT_CAP | IORING_RECV_MSHOT_DONE,
103 };
104 
105 /*
106  * Number of times we'll try and do receives if there's more data. If we
107  * exceed this limit, then add us to the back of the queue and retry from
108  * there. This helps fairness between flooding clients.
109  *
 * io_recv_finish() compares sr->nr_multishot_loops against this value.
 */
110 #define MULTISHOT_MAX_RETRY	32
111 
/*
 * Per-request command data for zero-copy receive requests.
 * NOTE(review): users are not visible in this part of the file.
 */
112 struct io_recvzc {
113 	struct file			*file;
114 	u16				flags;
115 	u32				len;
116 	struct io_zcrx_ifq		*ifq;
117 };
118 
/*
 * Forward declarations; the definitions are not in this part of the file.
 */
119 static int io_sg_from_iter_iovec(struct sk_buff *skb,
120 				 struct iov_iter *from, size_t length);
121 static int io_sg_from_iter(struct sk_buff *skb,
122 			   struct iov_iter *from, size_t length);
123 
io_shutdown_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)124 int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
125 {
126 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
127 
128 	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
129 		     sqe->buf_index || sqe->splice_fd_in))
130 		return -EINVAL;
131 
132 	shutdown->how = READ_ONCE(sqe->len);
133 	req->flags |= REQ_F_FORCE_ASYNC;
134 	return 0;
135 }
136 
io_shutdown(struct io_kiocb * req,unsigned int issue_flags)137 int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
138 {
139 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
140 	struct socket *sock;
141 	int ret;
142 
143 	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
144 
145 	sock = sock_from_file(req->file);
146 	if (unlikely(!sock))
147 		return -ENOTSOCK;
148 
149 	ret = __sys_shutdown_sock(sock, shutdown->how);
150 	io_req_set_res(req, ret, 0);
151 	return IOU_COMPLETE;
152 }
153 
io_net_retry(struct socket * sock,int flags)154 static bool io_net_retry(struct socket *sock, int flags)
155 {
156 	if (!(flags & MSG_WAITALL))
157 		return false;
158 	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
159 }
160 
io_netmsg_iovec_free(struct io_async_msghdr * kmsg)161 static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
162 {
163 	if (kmsg->vec.iovec)
164 		io_vec_free(&kmsg->vec);
165 }
166 
io_netmsg_recycle(struct io_kiocb * req,unsigned int issue_flags)167 static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
168 {
169 	struct io_async_msghdr *hdr = req->async_data;
170 
171 	/* can't recycle, ensure we free the iovec if we have one */
172 	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
173 		io_netmsg_iovec_free(hdr);
174 		return;
175 	}
176 
177 	/* Let normal cleanup path reap it if we fail adding to the cache */
178 	io_alloc_cache_vec_kasan(&hdr->vec);
179 	if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP)
180 		io_vec_free(&hdr->vec);
181 
182 	if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr))
183 		io_req_async_data_clear(req, REQ_F_NEED_CLEANUP);
184 }
185 
io_msg_alloc_async(struct io_kiocb * req)186 static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
187 {
188 	struct io_ring_ctx *ctx = req->ctx;
189 	struct io_async_msghdr *hdr;
190 
191 	hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req);
192 	if (!hdr)
193 		return NULL;
194 
195 	/* If the async data was cached, we might have an iov cached inside. */
196 	if (hdr->vec.iovec)
197 		req->flags |= REQ_F_NEED_CLEANUP;
198 	return hdr;
199 }
200 
io_mshot_prep_retry(struct io_kiocb * req,struct io_async_msghdr * kmsg)201 static inline void io_mshot_prep_retry(struct io_kiocb *req,
202 				       struct io_async_msghdr *kmsg)
203 {
204 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
205 
206 	req->flags &= ~REQ_F_BL_EMPTY;
207 	sr->done_io = 0;
208 	sr->flags &= ~IORING_RECV_RETRY_CLEAR;
209 	sr->len = sr->mshot_len;
210 }
211 
/*
 * Import a user iovec array into @iomsg->msg.msg_iter.
 *
 * The import is offered the iovec already held in @iomsg->vec, or the single
 * inline fast_iov when there is none. If the import leaves a non-NULL iovec
 * pointer behind, that array is installed in @iomsg->vec and the request is
 * flagged REQ_F_NEED_CLEANUP.
 *
 * Returns 0 on success or the negative error from __import_iovec().
 */
static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg,
			     const struct iovec __user *uiov, unsigned uvec_seg,
			     int ddir)
{
	struct iovec *iov = &iomsg->fast_iov;
	int nr_segs = 1;
	int ret;

	if (iomsg->vec.iovec) {
		iov = iomsg->vec.iovec;
		nr_segs = iomsg->vec.nr;
	}

	ret = __import_iovec(ddir, uiov, uvec_seg, nr_segs, &iov,
			     &iomsg->msg.msg_iter, io_is_compat(req->ctx));
	if (unlikely(ret < 0))
		return ret;
	if (!iov)
		return 0;

	req->flags |= REQ_F_NEED_CLEANUP;
	io_vec_reset_iovec(&iomsg->vec, iov, iomsg->msg.msg_iter.nr_segs);
	return 0;
}
238 
/*
 * Copy a compat msghdr from userspace into @msg and translate it into
 * @iomsg->msg.
 *
 * For buffer-select requests the header may describe at most one iovec; its
 * length (or 0 when there is none) becomes sr->len. @ddir is not used.
 *
 * Returns 0, -EFAULT on a failed user copy, -EINVAL for more than one iovec
 * with buffer select, or the error from __get_compat_msghdr().
 */
static int io_compat_msg_copy_hdr(struct io_kiocb *req,
				  struct io_async_msghdr *iomsg,
				  struct compat_msghdr *msg, int ddir,
				  struct sockaddr __user **save_addr)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct compat_iovec first;
	int ret;

	if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
		return -EFAULT;

	ret = __get_compat_msghdr(&iomsg->msg, msg, save_addr);
	if (ret)
		return ret;

	if (!(req->flags & REQ_F_BUFFER_SELECT))
		return 0;

	switch (msg->msg_iovlen) {
	case 0:
		sr->len = 0;
		return 0;
	case 1:
		break;
	default:
		return -EINVAL;
	}

	if (copy_from_user(&first, compat_ptr(msg->msg_iov), sizeof(first)))
		return -EFAULT;
	sr->len = first.iov_len;
	return 0;
}
271 
/*
 * Fetch the six user_msghdr fields read below from @umsg into @msg, inside a
 * single user_access_begin()/user_access_end() section. msg->msg_flags is
 * not read here.
 *
 * Returns 0 on success, or -EFAULT if the access window cannot be opened or
 * any individual read faults.
 */
io_copy_msghdr_from_user(struct user_msghdr * msg,struct user_msghdr __user * umsg)272 static int io_copy_msghdr_from_user(struct user_msghdr *msg,
273 				    struct user_msghdr __user *umsg)
274 {
275 	if (!user_access_begin(umsg, sizeof(*umsg)))
276 		return -EFAULT;
	/* each unsafe_get_user() jumps to ua_end if the read faults */
277 	unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end);
278 	unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end);
279 	unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end);
280 	unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end);
281 	unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end);
282 	unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end);
283 	user_access_end();
284 	return 0;
285 ua_end:
	/* fault path: the access window must still be closed */
286 	user_access_end();
287 	return -EFAULT;
288 }
289 
/*
 * Copy the user's msghdr for @req into @msg and set up @iomsg->msg from it.
 *
 * On a compat ring the work is delegated to io_compat_msg_copy_hdr() and
 * @msg is filled with the translated name length, control length and iovec
 * pointer/count (all other fields zeroed). Otherwise the native header is
 * read, msg_flags is forced to 0 and __copy_msghdr() does the translation.
 * For buffer-select requests at most one iovec is allowed and its length
 * (or 0 when there is none) becomes sr->len.
 *
 * Returns 0 on success, -EFAULT / -EINVAL as above, or the error from the
 * helper that failed.
 */
static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
			   struct user_msghdr *msg, int ddir,
			   struct sockaddr __user **save_addr)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct iovec first;
	int ret;

	iomsg->msg.msg_iter.nr_segs = 0;
	iomsg->msg.msg_name = &iomsg->addr;

	if (io_is_compat(req->ctx)) {
		struct compat_msghdr cmsg;

		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ddir, save_addr);
		if (ret)
			return ret;

		memset(msg, 0, sizeof(*msg));
		msg->msg_iov = compat_ptr(cmsg.msg_iov);
		msg->msg_iovlen = cmsg.msg_iovlen;
		msg->msg_namelen = cmsg.msg_namelen;
		msg->msg_controllen = cmsg.msg_controllen;
		return 0;
	}

	ret = io_copy_msghdr_from_user(msg, sr->umsg);
	if (unlikely(ret))
		return ret;

	msg->msg_flags = 0;

	ret = __copy_msghdr(&iomsg->msg, msg, save_addr);
	if (ret)
		return ret;

	if (!(req->flags & REQ_F_BUFFER_SELECT))
		return 0;

	switch (msg->msg_iovlen) {
	case 0:
		sr->len = 0;
		return 0;
	case 1:
		break;
	default:
		return -EINVAL;
	}

	if (copy_from_user(&first, msg->msg_iov, sizeof(first)))
		return -EFAULT;
	sr->len = first.iov_len;
	return 0;
}
342 
io_sendmsg_recvmsg_cleanup(struct io_kiocb * req)343 void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
344 {
345 	struct io_async_msghdr *io = req->async_data;
346 
347 	io_netmsg_iovec_free(io);
348 }
349 
io_send_setup(struct io_kiocb * req,const struct io_uring_sqe * sqe)350 static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
351 {
352 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
353 	struct io_async_msghdr *kmsg = req->async_data;
354 	void __user *addr;
355 	u16 addr_len;
356 	int ret;
357 
358 	sr->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
359 
360 	if (READ_ONCE(sqe->__pad3[0]))
361 		return -EINVAL;
362 
363 	kmsg->msg.msg_name = NULL;
364 	kmsg->msg.msg_namelen = 0;
365 	kmsg->msg.msg_control = NULL;
366 	kmsg->msg.msg_controllen = 0;
367 	kmsg->msg.msg_ubuf = NULL;
368 
369 	addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
370 	addr_len = READ_ONCE(sqe->addr_len);
371 	if (addr) {
372 		ret = move_addr_to_kernel(addr, addr_len, &kmsg->addr);
373 		if (unlikely(ret < 0))
374 			return ret;
375 		kmsg->msg.msg_name = &kmsg->addr;
376 		kmsg->msg.msg_namelen = addr_len;
377 	}
378 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
379 		if (!(sr->flags & IORING_SEND_VECTORIZED)) {
380 			req->flags |= REQ_F_IMPORT_BUFFER;
381 			return 0;
382 		}
383 
384 		kmsg->msg.msg_iter.nr_segs = sr->len;
385 		return io_prep_reg_iovec(req, &kmsg->vec, sr->buf, sr->len);
386 	}
387 	if (req->flags & REQ_F_BUFFER_SELECT)
388 		return 0;
389 
390 	if (sr->flags & IORING_SEND_VECTORIZED)
391 		return io_net_import_vec(req, kmsg, sr->buf, sr->len, ITER_SOURCE);
392 
393 	return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
394 }
395 
io_sendmsg_setup(struct io_kiocb * req,const struct io_uring_sqe * sqe)396 static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
397 {
398 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
399 	struct io_async_msghdr *kmsg = req->async_data;
400 	struct user_msghdr msg;
401 	int ret;
402 
403 	sr->flags |= IORING_SEND_VECTORIZED;
404 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
405 	ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL);
406 	if (unlikely(ret))
407 		return ret;
408 	/* save msg_control as sys_sendmsg() overwrites it */
409 	sr->msg_control = kmsg->msg.msg_control_user;
410 
411 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
412 		kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen;
413 		return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov,
414 					 msg.msg_iovlen);
415 	}
416 	if (req->flags & REQ_F_BUFFER_SELECT)
417 		return 0;
418 	return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE);
419 }
420 
/* sqe->ioprio flags accepted by io_sendmsg_prep(); anything else is -EINVAL */
421 #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE | IORING_SEND_VECTORIZED)
422 
/*
 * Prepare a send-side request. Handles IORING_OP_SENDMSG as well as the
 * other opcodes routed here (those go through io_send_setup()).
 *
 * Returns 0 on success, -EINVAL for a bad SQE, -ENOMEM if no async msghdr
 * could be allocated, or the error from the setup helper.
 */
io_sendmsg_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)423 int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
424 {
425 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
426 
427 	sr->done_io = 0;
428 	sr->len = READ_ONCE(sqe->len);
	/* sr->len is an int, so a length above INT_MAX shows up as negative */
429 	if (unlikely(sr->len < 0))
430 		return -EINVAL;
431 	sr->flags = READ_ONCE(sqe->ioprio);
432 	if (sr->flags & ~SENDMSG_FLAGS)
433 		return -EINVAL;
	/* MSG_NOSIGNAL is always added to the caller's flags */
434 	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
435 	if (sr->msg_flags & MSG_DONTWAIT)
436 		req->flags |= REQ_F_NOWAIT;
437 	if (req->flags & REQ_F_BUFFER_SELECT)
438 		sr->buf_group = req->buf_index;
	/* bundles are refused for SENDMSG; they imply WAITALL and multishot */
439 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
440 		if (req->opcode == IORING_OP_SENDMSG)
441 			return -EINVAL;
442 		sr->msg_flags |= MSG_WAITALL;
443 		req->flags |= REQ_F_MULTISHOT;
444 	}
445 
446 	if (io_is_compat(req->ctx))
447 		sr->msg_flags |= MSG_CMSG_COMPAT;
448 
	/* both setup helpers below read the msghdr from req->async_data */
449 	if (unlikely(!io_msg_alloc_async(req)))
450 		return -ENOMEM;
451 	if (req->opcode != IORING_OP_SENDMSG)
452 		return io_send_setup(req, sqe);
	/* addr2 / file_index must be zero for SENDMSG */
453 	if (unlikely(sqe->addr2 || sqe->file_index))
454 		return -EINVAL;
455 	return io_sendmsg_setup(req, sqe);
456 }
457 
/* Completion-time cleanup: recycle (or free) the request's async msghdr. */
static void io_req_msg_cleanup(struct io_kiocb *req, unsigned int issue_flags)
{
	io_netmsg_recycle(req, issue_flags);
}
463 
464 /*
465  * For bundle completions, we need to figure out how many segments we consumed.
466  * A bundle could be using a single ITER_UBUF if that's all we mapped, or it
467  * could be using an ITER_IOVEC. If the latter, then if we consumed all of
468  * the segments, then it's a trivial question to answer. If we have residual
469  * data in the iter, then loop the segments to figure out how much we
470  * transferred.
471  */
io_bundle_nbufs(struct io_async_msghdr * kmsg,int ret)472 static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
473 {
474 	struct iovec *iov;
475 	int nbufs;
476 
477 	/* no data is always zero segments, and a ubuf is always 1 segment */
478 	if (ret <= 0)
479 		return 0;
480 	if (iter_is_ubuf(&kmsg->msg.msg_iter))
481 		return 1;
482 
483 	iov = kmsg->vec.iovec;
484 	if (!iov)
485 		iov = &kmsg->fast_iov;
486 
487 	/* if all data was transferred, it's basic pointer math */
488 	if (!iov_iter_count(&kmsg->msg.msg_iter))
489 		return iter_iov(&kmsg->msg.msg_iter) - iov;
490 
491 	/* short transfer, count segments */
492 	nbufs = 0;
493 	do {
494 		int this_len = min_t(int, iov[nbufs].iov_len, ret);
495 
496 		nbufs++;
497 		ret -= this_len;
498 	} while (ret);
499 
500 	return nbufs;
501 }
502 
io_net_kbuf_recyle(struct io_kiocb * req,struct io_buffer_list * bl,struct io_async_msghdr * kmsg,int len)503 static int io_net_kbuf_recyle(struct io_kiocb *req, struct io_buffer_list *bl,
504 			      struct io_async_msghdr *kmsg, int len)
505 {
506 	req->flags |= REQ_F_BL_NO_RECYCLE;
507 	if (req->flags & REQ_F_BUFFERS_COMMIT)
508 		io_kbuf_commit(req, bl, len, io_bundle_nbufs(kmsg, len));
509 	return IOU_RETRY;
510 }
511 
io_send_finish(struct io_kiocb * req,struct io_async_msghdr * kmsg,struct io_br_sel * sel)512 static inline bool io_send_finish(struct io_kiocb *req,
513 				  struct io_async_msghdr *kmsg,
514 				  struct io_br_sel *sel)
515 {
516 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
517 	bool bundle_finished = sel->val <= 0;
518 	unsigned int cflags;
519 
520 	if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
521 		cflags = io_put_kbuf(req, sel->val, sel->buf_list);
522 		goto finish;
523 	}
524 
525 	cflags = io_put_kbufs(req, sel->val, sel->buf_list, io_bundle_nbufs(kmsg, sel->val));
526 
527 	/*
528 	 * Don't start new bundles if the buffer list is empty, or if the
529 	 * current operation needed to go through polling to complete.
530 	 */
531 	if (bundle_finished || req->flags & (REQ_F_BL_EMPTY | REQ_F_POLLED))
532 		goto finish;
533 
534 	/*
535 	 * Fill CQE for this receive and see if we should keep trying to
536 	 * receive from this socket.
537 	 */
538 	if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) {
539 		io_mshot_prep_retry(req, kmsg);
540 		return false;
541 	}
542 
543 	/* Otherwise stop bundle and use the current result. */
544 finish:
545 	io_req_set_res(req, sel->val, cflags);
546 	sel->val = IOU_COMPLETE;
547 	return true;
548 }
549 
io_sendmsg(struct io_kiocb * req,unsigned int issue_flags)550 int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
551 {
552 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
553 	struct io_async_msghdr *kmsg = req->async_data;
554 	struct socket *sock;
555 	unsigned flags;
556 	int min_ret = 0;
557 	int ret;
558 
559 	sock = sock_from_file(req->file);
560 	if (unlikely(!sock))
561 		return -ENOTSOCK;
562 
563 	if (!(req->flags & REQ_F_POLLED) &&
564 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
565 		return -EAGAIN;
566 
567 	flags = sr->msg_flags;
568 	if (issue_flags & IO_URING_F_NONBLOCK)
569 		flags |= MSG_DONTWAIT;
570 	if (flags & MSG_WAITALL)
571 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
572 
573 	kmsg->msg.msg_control_user = sr->msg_control;
574 
575 	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
576 
577 	if (ret < min_ret) {
578 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
579 			return -EAGAIN;
580 		if (ret > 0 && io_net_retry(sock, flags)) {
581 			kmsg->msg.msg_controllen = 0;
582 			kmsg->msg.msg_control = NULL;
583 			sr->done_io += ret;
584 			return -EAGAIN;
585 		}
586 		if (ret == -ERESTARTSYS)
587 			ret = -EINTR;
588 		req_set_fail(req);
589 	}
590 	io_req_msg_cleanup(req, issue_flags);
591 	if (ret >= 0)
592 		ret += sr->done_io;
593 	else if (sr->done_io)
594 		ret = sr->done_io;
595 	io_req_set_res(req, ret, 0);
596 	return IOU_COMPLETE;
597 }
598 
/*
 * Pick provided buffer(s) for a send and point @kmsg's iterator at them.
 *
 * Returns 0 on success or a negative error from io_buffers_select() /
 * import_ubuf(). On success sr->len holds the selected length (arg.out_len).
 */
io_send_select_buffer(struct io_kiocb * req,unsigned int issue_flags,struct io_br_sel * sel,struct io_async_msghdr * kmsg)599 static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
600 				 struct io_br_sel *sel, struct io_async_msghdr *kmsg)
601 {
602 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	/* default: one inline iovec; a zero sr->len means "up to INT_MAX" */
603 	struct buf_sel_arg arg = {
604 		.iovs = &kmsg->fast_iov,
605 		.max_len = min_not_zero(sr->len, INT_MAX),
606 		.nr_iovs = 1,
607 		.buf_group = sr->buf_group,
608 	};
609 	int ret;
610 
	/*
	 * Prefer an iovec array already held by kmsg.
	 * NOTE(review): KBUF_MODE_FREE presumably lets the selector free this
	 * array if it replaces it - confirm in kbuf.
	 */
611 	if (kmsg->vec.iovec) {
612 		arg.nr_iovs = kmsg->vec.nr;
613 		arg.iovs = kmsg->vec.iovec;
614 		arg.mode = KBUF_MODE_FREE;
615 	}
616 
	/* only bundles may use more than one buffer / grow the array */
617 	if (!(sr->flags & IORING_RECVSEND_BUNDLE))
618 		arg.nr_iovs = 1;
619 	else
620 		arg.mode |= KBUF_MODE_EXPAND;
621 
	/* a non-negative ret is used below as the number of iovecs filled */
622 	ret = io_buffers_select(req, &arg, sel, issue_flags);
623 	if (unlikely(ret < 0))
624 		return ret;
625 
	/* the selector handed back a different array: adopt it for cleanup */
626 	if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
627 		kmsg->vec.nr = ret;
628 		kmsg->vec.iovec = arg.iovs;
629 		req->flags |= REQ_F_NEED_CLEANUP;
630 	}
631 	sr->len = arg.out_len;
632 
	/* a single buffer becomes a plain ubuf iterator, more an iovec one */
633 	if (ret == 1) {
634 		sr->buf = arg.iovs[0].iov_base;
635 		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
636 					&kmsg->msg.msg_iter);
637 		if (unlikely(ret))
638 			return ret;
639 	} else {
640 		iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE,
641 				arg.iovs, ret, arg.out_len);
642 	}
643 
644 	return 0;
645 }
646 
/*
 * Issue a send request (optionally using provided buffers and bundles).
 *
 * Returns -ENOTSOCK for a non-socket file, -EAGAIN when the request must be
 * retried later, the value from io_net_kbuf_recyle() after a partial
 * MSG_WAITALL send, a buffer-selection error, or the value left in sel.val
 * by io_send_finish() once the request is finished.
 */
io_send(struct io_kiocb * req,unsigned int issue_flags)647 int io_send(struct io_kiocb *req, unsigned int issue_flags)
648 {
649 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
650 	struct io_async_msghdr *kmsg = req->async_data;
651 	struct io_br_sel sel = { };
652 	struct socket *sock;
653 	unsigned flags;
654 	int min_ret = 0;
655 	int ret;
656 
657 	sock = sock_from_file(req->file);
658 	if (unlikely(!sock))
659 		return -ENOTSOCK;
660 
	/* POLL_FIRST: do not attempt the send until the request was polled */
661 	if (!(req->flags & REQ_F_POLLED) &&
662 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
663 		return -EAGAIN;
664 
665 	flags = sr->msg_flags;
666 	if (issue_flags & IO_URING_F_NONBLOCK)
667 		flags |= MSG_DONTWAIT;
668 
	/* each bundle pass starts here with a fresh buffer selection */
669 retry_bundle:
670 	sel.buf_list = NULL;
671 	if (io_do_buffer_select(req)) {
672 		ret = io_send_select_buffer(req, issue_flags, &sel, kmsg);
673 		if (ret)
674 			return ret;
675 	}
676 
677 	/*
678 	 * If MSG_WAITALL is set, or this is a bundle send, then we need
679 	 * the full amount. If just bundle is set, if we do a short send
680 	 * then we complete the bundle sequence rather than continue on.
681 	 */
682 	if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE)
683 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
684 
685 	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
686 	kmsg->msg.msg_flags = flags;
687 	ret = sock_sendmsg(sock, &kmsg->msg);
688 	if (ret < min_ret) {
689 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
690 			return -EAGAIN;
691 
		/* partial send: advance past the sent bytes and retry the rest */
692 		if (ret > 0 && io_net_retry(sock, flags)) {
693 			sr->len -= ret;
694 			sr->buf += ret;
695 			sr->done_io += ret;
696 			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
697 		}
698 		if (ret == -ERESTARTSYS)
699 			ret = -EINTR;
700 		req_set_fail(req);
701 	}
	/* fold in earlier partial progress; it wins over a late error */
702 	if (ret >= 0)
703 		ret += sr->done_io;
704 	else if (sr->done_io)
705 		ret = sr->done_io;
706 
707 	sel.val = ret;
	/* false means io_send_finish() re-armed the request for a new bundle */
708 	if (!io_send_finish(req, kmsg, &sel))
709 		goto retry_bundle;
710 
711 	io_req_msg_cleanup(req, issue_flags);
712 	return sel.val;
713 }
714 
/*
 * Record the name and control lengths for a multishot recvmsg that uses
 * provided buffers. Other requests are left untouched.
 *
 * Returns 0, or -EOVERFLOW if @namelen is negative or the combined header
 * size (struct io_uring_recvmsg_out + name + control) does not fit an int.
 */
static int io_recvmsg_mshot_prep(struct io_kiocb *req,
				 struct io_async_msghdr *iomsg,
				 int namelen, size_t controllen)
{
	int hdr;

	if (!(req->flags & REQ_F_APOLL_MULTISHOT) ||
	    !(req->flags & REQ_F_BUFFER_SELECT))
		return 0;

	if (unlikely(namelen < 0))
		return -EOVERFLOW;
	if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
				namelen, &hdr))
		return -EOVERFLOW;
	if (check_add_overflow(hdr, controllen, &hdr))
		return -EOVERFLOW;

	iomsg->controllen = controllen;
	iomsg->namelen = namelen;
	return 0;
}
738 
io_recvmsg_copy_hdr(struct io_kiocb * req,struct io_async_msghdr * iomsg)739 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
740 			       struct io_async_msghdr *iomsg)
741 {
742 	struct user_msghdr msg;
743 	int ret;
744 
745 	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
746 	if (unlikely(ret))
747 		return ret;
748 
749 	if (!(req->flags & REQ_F_BUFFER_SELECT)) {
750 		ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
751 					ITER_DEST);
752 		if (unlikely(ret))
753 			return ret;
754 	}
755 	return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
756 					msg.msg_controllen);
757 }
758 
io_recvmsg_prep_setup(struct io_kiocb * req)759 static int io_recvmsg_prep_setup(struct io_kiocb *req)
760 {
761 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
762 	struct io_async_msghdr *kmsg;
763 
764 	kmsg = io_msg_alloc_async(req);
765 	if (unlikely(!kmsg))
766 		return -ENOMEM;
767 
768 	if (req->opcode == IORING_OP_RECV) {
769 		kmsg->msg.msg_name = NULL;
770 		kmsg->msg.msg_namelen = 0;
771 		kmsg->msg.msg_inq = 0;
772 		kmsg->msg.msg_control = NULL;
773 		kmsg->msg.msg_get_inq = 1;
774 		kmsg->msg.msg_controllen = 0;
775 		kmsg->msg.msg_iocb = NULL;
776 		kmsg->msg.msg_ubuf = NULL;
777 
778 		if (req->flags & REQ_F_BUFFER_SELECT)
779 			return 0;
780 		return import_ubuf(ITER_DEST, sr->buf, sr->len,
781 				   &kmsg->msg.msg_iter);
782 	}
783 
784 	return io_recvmsg_copy_hdr(req, kmsg);
785 }
786 
/* sqe->ioprio flags accepted by io_recvmsg_prep(); anything else is -EINVAL */
787 #define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \
788 			IORING_RECVSEND_BUNDLE)
789 
/*
 * Prepare a receive-side request (IORING_OP_RECV and IORING_OP_RECVMSG are
 * both told apart below).
 *
 * Returns 0 on success, -EINVAL for a bad SQE, or the error from
 * io_recvmsg_prep_setup().
 */
io_recvmsg_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)790 int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
791 {
792 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
793 
794 	sr->done_io = 0;
795 
796 	if (unlikely(sqe->addr2))
797 		return -EINVAL;
798 
799 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
800 	sr->len = READ_ONCE(sqe->len);
	/* sr->len is an int, so a length above INT_MAX shows up as negative */
801 	if (unlikely(sr->len < 0))
802 		return -EINVAL;
803 	sr->flags = READ_ONCE(sqe->ioprio);
804 	if (sr->flags & ~RECVMSG_FLAGS)
805 		return -EINVAL;
806 	sr->msg_flags = READ_ONCE(sqe->msg_flags);
807 	if (sr->msg_flags & MSG_DONTWAIT)
808 		req->flags |= REQ_F_NOWAIT;
809 	if (sr->msg_flags & MSG_ERRQUEUE)
810 		req->flags |= REQ_F_CLEAR_POLLIN;
811 	if (req->flags & REQ_F_BUFFER_SELECT)
812 		sr->buf_group = req->buf_index;
813 	sr->mshot_total_len = sr->mshot_len = 0;
	/*
	 * Multishot needs provided buffers and cannot be combined with
	 * MSG_WAITALL. Only IORING_OP_RECV may pass a total byte limit in
	 * sqe->optlen; optlen must be zero in every other case.
	 */
814 	if (sr->flags & IORING_RECV_MULTISHOT) {
815 		if (!(req->flags & REQ_F_BUFFER_SELECT))
816 			return -EINVAL;
817 		if (sr->msg_flags & MSG_WAITALL)
818 			return -EINVAL;
819 		if (req->opcode == IORING_OP_RECV) {
820 			sr->mshot_len = sr->len;
821 			sr->mshot_total_len = READ_ONCE(sqe->optlen);
822 			if (sr->mshot_total_len)
823 				sr->flags |= IORING_RECV_MSHOT_LIM;
824 		} else if (sqe->optlen) {
825 			return -EINVAL;
826 		}
827 		req->flags |= REQ_F_APOLL_MULTISHOT;
828 	} else if (sqe->optlen) {
829 		return -EINVAL;
830 	}
831 
	/* bundles are not accepted for RECVMSG */
832 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
833 		if (req->opcode == IORING_OP_RECVMSG)
834 			return -EINVAL;
835 	}
836 
837 	if (io_is_compat(req->ctx))
838 		sr->msg_flags |= MSG_CMSG_COMPAT;
839 
840 	sr->nr_multishot_loops = 0;
841 	return io_recvmsg_prep_setup(req);
842 }
843 
844 /* bits to clear in old and inherit in new cflags on bundle retry */
845 #define CQE_F_MASK	(IORING_CQE_F_SOCK_NONEMPTY|IORING_CQE_F_MORE)
846 
847 /*
848  * Finishes io_recv and io_recvmsg.
849  *
850  * Returns true if it is actually finished, or false if it should run
851  * again (for multishot).
852  */
io_recv_finish(struct io_kiocb * req,struct io_async_msghdr * kmsg,struct io_br_sel * sel,bool mshot_finished,unsigned issue_flags)853 static inline bool io_recv_finish(struct io_kiocb *req,
854 				  struct io_async_msghdr *kmsg,
855 				  struct io_br_sel *sel, bool mshot_finished,
856 				  unsigned issue_flags)
857 {
858 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
859 	unsigned int cflags = 0;
860 
861 	if (kmsg->msg.msg_inq > 0)
862 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
863 
864 	if (sel->val > 0 && sr->flags & IORING_RECV_MSHOT_LIM) {
865 		/*
866 		 * If sr->len hits zero, the limit has been reached. Mark
867 		 * mshot as finished, and flag MSHOT_DONE as well to prevent
868 		 * a potential bundle from being retried.
869 		 */
870 		sr->mshot_total_len -= min_t(int, sel->val, sr->mshot_total_len);
871 		if (!sr->mshot_total_len) {
872 			sr->flags |= IORING_RECV_MSHOT_DONE;
873 			mshot_finished = true;
874 		}
875 	}
876 
877 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
878 		size_t this_ret = sel->val - sr->done_io;
879 
880 		cflags |= io_put_kbufs(req, this_ret, sel->buf_list, io_bundle_nbufs(kmsg, this_ret));
881 		if (sr->flags & IORING_RECV_RETRY)
882 			cflags = req->cqe.flags | (cflags & CQE_F_MASK);
883 		if (sr->mshot_len && sel->val >= sr->mshot_len)
884 			sr->flags |= IORING_RECV_MSHOT_CAP;
885 		/* bundle with no more immediate buffers, we're done */
886 		if (req->flags & REQ_F_BL_EMPTY)
887 			goto finish;
888 		/*
889 		 * If more is available AND it was a full transfer, retry and
890 		 * append to this one
891 		 */
892 		if (!(sr->flags & IORING_RECV_NO_RETRY) &&
893 		    kmsg->msg.msg_inq > 1 && this_ret > 0 &&
894 		    !iov_iter_count(&kmsg->msg.msg_iter)) {
895 			req->cqe.flags = cflags & ~CQE_F_MASK;
896 			sr->len = kmsg->msg.msg_inq;
897 			sr->done_io += this_ret;
898 			sr->flags |= IORING_RECV_RETRY;
899 			return false;
900 		}
901 	} else {
902 		cflags |= io_put_kbuf(req, sel->val, sel->buf_list);
903 	}
904 
905 	/*
906 	 * Fill CQE for this receive and see if we should keep trying to
907 	 * receive from this socket.
908 	 */
909 	if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
910 	    io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) {
911 		sel->val = IOU_RETRY;
912 		io_mshot_prep_retry(req, kmsg);
913 		/* Known not-empty or unknown state, retry */
914 		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
915 			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY &&
916 			    !(sr->flags & IORING_RECV_MSHOT_CAP)) {
917 				return false;
918 			}
919 			/* mshot retries exceeded, force a requeue */
920 			sr->nr_multishot_loops = 0;
921 			sr->flags &= ~IORING_RECV_MSHOT_CAP;
922 			if (issue_flags & IO_URING_F_MULTISHOT)
923 				sel->val = IOU_REQUEUE;
924 		}
925 		return true;
926 	}
927 
928 	/* Finish the request / stop multishot. */
929 finish:
930 	io_req_set_res(req, sel->val, cflags);
931 	sel->val = IOU_COMPLETE;
932 	io_req_msg_cleanup(req, issue_flags);
933 	return true;
934 }
935 
/*
 * Carve the multishot recvmsg header area off the front of a selected
 * buffer.
 *
 * The header area is struct io_uring_recvmsg_out followed by kmsg->namelen
 * and kmsg->controllen bytes. The control data pointer is aimed at the last
 * kmsg->controllen bytes of that area, the buffer start is stashed in
 * sr->buf, and *buf / *len are moved past the header so that they describe
 * the payload area.
 *
 * Returns 0, or -EFAULT if the buffer is too small for the header area.
 */
static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
				     struct io_sr_msg *sr, void __user **buf,
				     size_t *len)
{
	const unsigned long base = (unsigned long) *buf;
	const unsigned long hdr_len = sizeof(struct io_uring_recvmsg_out) +
				      kmsg->namelen + kmsg->controllen;

	if (*len < hdr_len)
		return -EFAULT;

	if (kmsg->controllen) {
		kmsg->msg.msg_control_user =
			(void __user *) (base + hdr_len - kmsg->controllen);
		kmsg->msg.msg_controllen = kmsg->controllen;
	}

	/* remember the buffer start, the header is copied there later */
	sr->buf = *buf;
	*buf = (void __user *) (base + hdr_len);
	*len -= hdr_len;
	kmsg->payloadlen = *len;
	return 0;
}
960 
/*
 * On-stack staging area used by io_recvmsg_multishot(): the result header
 * immediately followed by the received source address, so both can be
 * copied to userspace with a single copy_to_user().
 */
961 struct io_recvmsg_multishot_hdr {
962 	struct io_uring_recvmsg_out msg;
963 	struct sockaddr_storage addr;
964 };
965 
/*
 * Do one receive for a multishot recvmsg and write the result header (plus
 * source address) to the start of the user buffer stashed in io->buf.
 *
 * *finished is set when the receive returned <= 0 or the header copy
 * faulted.
 *
 * Returns a negative error, or the number of bytes of the buffer accounted
 * for: header + kmsg->namelen + kmsg->controllen + payload (clamped to
 * kmsg->payloadlen).
 */
io_recvmsg_multishot(struct socket * sock,struct io_sr_msg * io,struct io_async_msghdr * kmsg,unsigned int flags,bool * finished)966 static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
967 				struct io_async_msghdr *kmsg,
968 				unsigned int flags, bool *finished)
969 {
970 	int err;
971 	int copy_len;
972 	struct io_recvmsg_multishot_hdr hdr;
973 
	/* receive the source address into the on-stack header, if wanted */
974 	if (kmsg->namelen)
975 		kmsg->msg.msg_name = &hdr.addr;
976 	kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
977 	kmsg->msg.msg_namelen = 0;
978 
979 	if (sock->file->f_flags & O_NONBLOCK)
980 		flags |= MSG_DONTWAIT;
981 
982 	err = sock_recvmsg(sock, &kmsg->msg, flags);
983 	*finished = err <= 0;
984 	if (err < 0)
985 		return err;
986 
	/* controllen reported = requested size minus msg_controllen after recv */
987 	hdr.msg = (struct io_uring_recvmsg_out) {
988 		.controllen = kmsg->controllen - kmsg->msg.msg_controllen,
989 		.flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
990 	};
991 
	/* report the unclamped length, but only account for what fits */
992 	hdr.msg.payloadlen = err;
993 	if (err > kmsg->payloadlen)
994 		err = kmsg->payloadlen;
995 
	/* copy the header plus at most kmsg->namelen bytes of address */
996 	copy_len = sizeof(struct io_uring_recvmsg_out);
997 	if (kmsg->msg.msg_namelen > kmsg->namelen)
998 		copy_len += kmsg->namelen;
999 	else
1000 		copy_len += kmsg->msg.msg_namelen;
1001 
1002 	/*
1003 	 *      "fromlen shall refer to the value before truncation.."
1004 	 *                      1003.1g
1005 	 */
1006 	hdr.msg.namelen = kmsg->msg.msg_namelen;
1007 
1008 	/* ensure that there is no gap between hdr and sockaddr_storage */
1009 	BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
1010 		     sizeof(struct io_uring_recvmsg_out));
1011 	if (copy_to_user(io->buf, &hdr, copy_len)) {
1012 		*finished = true;
1013 		return -EFAULT;
1014 	}
1015 
1016 	return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
1017 			kmsg->controllen + err;
1018 }
1019 
/*
 * Issue handler for recvmsg requests, single-shot and multishot.
 *
 * Early returns:
 *  -ENOTSOCK  req->file is not a socket
 *  -EAGAIN    IORING_RECVSEND_POLL_FIRST is set and the request has not been
 *             polled yet
 *  -ENOBUFS   buffer selection produced no address
 *  error from io_recvmsg_prep_multishot() (selected buffer is recycled)
 *  IOU_RETRY  the receive returned -EAGAIN during a nonblocking issue
 *  the return value of io_net_kbuf_recyle() on a partial receive that
 *  io_net_retry() allows to be continued
 *
 * Otherwise the result is passed to io_recv_finish(); while that returns
 * false the whole receive is repeated, and once it returns true sel.val is
 * returned.
 */
int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct io_br_sel sel = { };
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	/* only io_recvmsg_multishot() ever clears this */
	bool mshot_finished = true;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;

retry_multishot:
	sel.buf_list = NULL;
	if (io_do_buffer_select(req)) {
		size_t len = sr->len;

		sel = io_buffer_select(req, &len, sr->buf_group, issue_flags);
		if (!sel.addr)
			return -ENOBUFS;

		if (req->flags & REQ_F_APOLL_MULTISHOT) {
			/* reserve the front of the buffer for the header area */
			ret = io_recvmsg_prep_multishot(kmsg, sr, &sel.addr, &len);
			if (ret) {
				io_kbuf_recycle(req, sel.buf_list, issue_flags);
				return ret;
			}
		}

		/* receive into what is left of the selected buffer */
		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, sel.addr, len);
	}

	kmsg->msg.msg_get_inq = 1;
	kmsg->msg.msg_inq = -1;
	if (req->flags & REQ_F_APOLL_MULTISHOT) {
		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
					   &mshot_finished);
	} else {
		/* disable partial retry for recvmsg with cmsg attached */
		if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
			min_ret = iov_iter_count(&kmsg->msg.msg_iter);

		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
					 kmsg->uaddr, flags);
	}

	/* an error, or fewer bytes than MSG_WAITALL asked for */
	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock) {
			io_kbuf_recycle(req, sel.buf_list, issue_flags);
			return IOU_RETRY;
		}
		/* partial progress: remember it and let the request continue */
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->done_io += ret;
			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
		/* MSG_WAITALL was requested but the message got truncated */
		req_set_fail(req);
	}

	/*
	 * Fold in bytes received by earlier attempts. The buffer is only
	 * recycled when nothing was received at all.
	 */
	if (ret > 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	else
		io_kbuf_recycle(req, sel.buf_list, issue_flags);

	sel.val = ret;
	if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags))
		goto retry_multishot;

	return sel.val;
}
1106 
/*
 * Pick provided buffer(s) for a receive and point kmsg->msg.msg_iter at them.
 *
 * On entry sel->val holds the length limit requested by the caller (io_recv()
 * passes sr->len); zero is treated as "no explicit limit" on the bundle path.
 *
 * Two paths:
 *  - bundle (IORING_RECVSEND_BUNDLE set and IO_URING_F_UNLOCKED clear):
 *    io_buffers_peek() may hand back several buffers as an iovec array;
 *  - otherwise a single buffer is chosen with io_buffer_select().
 * A bundle that resolves to exactly one buffer is mapped the same way as the
 * single-buffer case.
 *
 * Returns 0 on success, -ENOBUFS if the single-buffer selection yields no
 * address, or the negative error from io_buffers_peek() / import_ubuf().
 */
static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
			      struct io_br_sel *sel, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	int ret;

	/*
	 * If the ring isn't locked, then don't use the peek interface
	 * to grab multiple buffers as we will lock/unlock between
	 * this selection and posting the buffers.
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED) &&
	    sr->flags & IORING_RECVSEND_BUNDLE) {
		/* default: offer only the single inline iovec */
		struct buf_sel_arg arg = {
			.iovs = &kmsg->fast_iov,
			.nr_iovs = 1,
			.mode = KBUF_MODE_EXPAND,
			.buf_group = sr->buf_group,
		};

		/* an iovec array is already attached to kmsg: offer that instead */
		if (kmsg->vec.iovec) {
			arg.nr_iovs = kmsg->vec.nr;
			arg.iovs = kmsg->vec.iovec;
			arg.mode |= KBUF_MODE_FREE;
		}

		/*
		 * Cap the selection at the caller's length if one was given.
		 * In the else-if branch sel->val is zero, so the cap is taken
		 * from msg_inq (presumably what min_not_zero() resolves to
		 * with a zero first argument - confirm against its definition).
		 */
		if (sel->val)
			arg.max_len = sel->val;
		else if (kmsg->msg.msg_inq > 1)
			arg.max_len = min_not_zero(sel->val, (ssize_t) kmsg->msg.msg_inq);

		/* if mshot limited, ensure we don't go over */
		if (sr->flags & IORING_RECV_MSHOT_LIM)
			arg.max_len = min_not_zero(arg.max_len, sr->mshot_total_len);
		ret = io_buffers_peek(req, &arg, sel);
		if (unlikely(ret < 0))
			return ret;

		/*
		 * arg.iovs now points at neither array we offered: record the
		 * new array in kmsg->vec and flag the request for cleanup.
		 */
		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
			kmsg->vec.nr = ret;
			kmsg->vec.iovec = arg.iovs;
			req->flags |= REQ_F_NEED_CLEANUP;
		}
		if (arg.partial_map)
			sr->flags |= IORING_RECV_PARTIAL_MAP;

		/* special case 1 vec, can be a fast path */
		if (ret == 1) {
			sr->buf = arg.iovs[0].iov_base;
			sr->len = arg.iovs[0].iov_len;
			goto map_ubuf;
		}
		/* ret buffers were selected, totalling arg.out_len bytes */
		iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
				arg.out_len);
	} else {
		size_t len = sel->val;

		*sel = io_buffer_select(req, &len, sr->buf_group, issue_flags);
		if (!sel->addr)
			return -ENOBUFS;
		sr->buf = sel->addr;
		sr->len = len;
		/* also entered from the bundle path when it yields one buffer */
map_ubuf:
		ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
				  &kmsg->msg.msg_iter);
		if (unlikely(ret))
			return ret;
	}

	return 0;
}
1178 
/*
 * Issue handler for plain recv requests, single-shot and multishot.
 *
 * Early returns:
 *  -EAGAIN    IORING_RECVSEND_POLL_FIRST is set and the request has not been
 *             polled yet
 *  -ENOTSOCK  req->file is not a socket
 *  IOU_RETRY  the receive returned -EAGAIN during a nonblocking issue
 *  the return value of io_net_kbuf_recyle() on a partial receive that
 *  io_net_retry() allows to be continued
 *
 * Otherwise the result is passed to io_recv_finish(); while that returns
 * false the receive is repeated, and once it returns true sel.val is
 * returned.
 */
int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct io_br_sel sel;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	bool mshot_finished;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	flags = sr->msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;

retry_multishot:
	sel.buf_list = NULL;
	if (io_do_buffer_select(req)) {
		/* sel.val carries the requested length into the selection */
		sel.val = sr->len;
		ret = io_recv_buf_select(req, kmsg, &sel, issue_flags);
		if (unlikely(ret < 0)) {
			kmsg->msg.msg_inq = -1;
			/* skip the receive; ret holds the selection error */
			goto out_free;
		}
		sr->buf = NULL;
	}

	kmsg->msg.msg_flags = 0;
	kmsg->msg.msg_inq = -1;

	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	ret = sock_recvmsg(sock, &kmsg->msg, flags);
	/* an error, or fewer bytes than MSG_WAITALL asked for */
	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock) {
			io_kbuf_recycle(req, sel.buf_list, issue_flags);
			return IOU_RETRY;
		}
		/* partial progress: advance past it and let the request continue */
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->len -= ret;
			sr->buf += ret;
			sr->done_io += ret;
			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
		/* also the landing point for a failed buffer selection above */
out_free:
		req_set_fail(req);
	}

	/* a receive that returned nothing (or failed) ends a multishot */
	mshot_finished = ret <= 0;
	/*
	 * Fold in bytes received by earlier attempts. The buffer is only
	 * recycled when nothing was received at all.
	 */
	if (ret > 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	else
		io_kbuf_recycle(req, sel.buf_list, issue_flags);

	sel.val = ret;
	if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags))
		goto retry_multishot;

	return sel.val;
}
1254 
io_recvzc_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)1255 int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1256 {
1257 	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
1258 	unsigned ifq_idx;
1259 
1260 	if (unlikely(sqe->addr2 || sqe->addr || sqe->addr3))
1261 		return -EINVAL;
1262 
1263 	ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx);
1264 	zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx);
1265 	if (!zc->ifq)
1266 		return -EINVAL;
1267 
1268 	zc->len = READ_ONCE(sqe->len);
1269 	zc->flags = READ_ONCE(sqe->ioprio);
1270 	if (READ_ONCE(sqe->msg_flags))
1271 		return -EINVAL;
1272 	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT))
1273 		return -EINVAL;
1274 	/* multishot required */
1275 	if (!(zc->flags & IORING_RECV_MULTISHOT))
1276 		return -EINVAL;
1277 	/* All data completions are posted as aux CQEs. */
1278 	req->flags |= REQ_F_APOLL_MULTISHOT;
1279 
1280 	return 0;
1281 }
1282 
io_recvzc(struct io_kiocb * req,unsigned int issue_flags)1283 int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
1284 {
1285 	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
1286 	struct socket *sock;
1287 	unsigned int len;
1288 	int ret;
1289 
1290 	if (!(req->flags & REQ_F_POLLED) &&
1291 	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
1292 		return -EAGAIN;
1293 
1294 	sock = sock_from_file(req->file);
1295 	if (unlikely(!sock))
1296 		return -ENOTSOCK;
1297 
1298 	len = zc->len;
1299 	ret = io_zcrx_recv(req, zc->ifq, sock, 0, issue_flags, &zc->len);
1300 	if (len && zc->len == 0) {
1301 		io_req_set_res(req, 0, 0);
1302 
1303 		return IOU_COMPLETE;
1304 	}
1305 	if (unlikely(ret <= 0) && ret != -EAGAIN) {
1306 		if (ret == -ERESTARTSYS)
1307 			ret = -EINTR;
1308 		if (ret == IOU_REQUEUE)
1309 			return IOU_REQUEUE;
1310 
1311 		req_set_fail(req);
1312 		io_req_set_res(req, ret, 0);
1313 		return IOU_COMPLETE;
1314 	}
1315 	return IOU_RETRY;
1316 }
1317 
io_send_zc_cleanup(struct io_kiocb * req)1318 void io_send_zc_cleanup(struct io_kiocb *req)
1319 {
1320 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1321 	struct io_async_msghdr *io = req->async_data;
1322 
1323 	if (req_has_async_data(req))
1324 		io_netmsg_iovec_free(io);
1325 	if (zc->notif) {
1326 		io_notif_flush(zc->notif);
1327 		zc->notif = NULL;
1328 	}
1329 }
1330 
/*
 * sqe->ioprio flags for the zero-copy send opcodes. io_send_zc_prep() takes
 * a fast path when only IO_ZC_FLAGS_COMMON bits are set and rejects anything
 * outside IO_ZC_FLAGS_VALID with -EINVAL.
 */
#define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
#define IO_ZC_FLAGS_VALID  (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE | \
				IORING_SEND_VECTORIZED)
1334 
/*
 * Prepare a zero-copy send request (IORING_OP_SEND_ZC or the sendmsg
 * variant) from its SQE.
 *
 * Allocates the notification request, validates the ioprio flags, allocates
 * the async message header and runs the opcode-specific setup.
 *
 * Returns 0 on success (or the result of io_notif_account_mem() when the
 * data is not a registered buffer), -EINVAL for unsupported SQE contents,
 * -ENOMEM if either allocation fails, or the setup helper's error.
 *
 * NOTE(review): REQ_F_NEED_CLEANUP is set as soon as the notif exists and the
 * later error returns do not release it here - presumably
 * io_send_zc_cleanup() does that; confirm against the opcode table.
 */
int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_async_msghdr *iomsg;
	struct io_kiocb *notif;
	u64 user_data;
	int ret;

	zc->done_io = 0;

	if (unlikely(READ_ONCE(sqe->__pad2[0])))
		return -EINVAL;
	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
	if (req->flags & REQ_F_CQE_SKIP)
		return -EINVAL;

	notif = zc->notif = io_alloc_notif(ctx);
	if (!notif)
		return -ENOMEM;
	/* sqe->addr3 overrides the notification's user_data; 0 means "same as the request" */
	user_data = READ_ONCE(sqe->addr3);
	if (!user_data)
		user_data = req->cqe.user_data;

	notif->cqe.user_data = user_data;
	notif->cqe.res = 0;
	notif->cqe.flags = IORING_CQE_F_NOTIF;
	req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY;

	zc->flags = READ_ONCE(sqe->ioprio);
	/* anything beyond the common flags needs a closer look */
	if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
		if (zc->flags & ~IO_ZC_FLAGS_VALID)
			return -EINVAL;
		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
			struct io_notif_data *nd = io_notif_to_data(notif);

			nd->zc_report = true;
			nd->zc_used = false;
			nd->zc_copied = false;
		}
	}

	zc->len = READ_ONCE(sqe->len);
	/* MSG_NOSIGNAL and MSG_ZEROCOPY are always forced on */
	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
	req->buf_index = READ_ONCE(sqe->buf_index);
	if (zc->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;

	if (io_is_compat(ctx))
		zc->msg_flags |= MSG_CMSG_COMPAT;

	iomsg = io_msg_alloc_async(req);
	if (unlikely(!iomsg))
		return -ENOMEM;

	if (req->opcode == IORING_OP_SEND_ZC) {
		ret = io_send_setup(req, sqe);
	} else {
		/* these SQE fields must be zero for the sendmsg variant */
		if (unlikely(sqe->addr2 || sqe->file_index))
			return -EINVAL;
		ret = io_sendmsg_setup(req, sqe);
	}
	if (unlikely(ret))
		return ret;

	/*
	 * Without a registered buffer, use the iovec fill callback and
	 * account the byte count of the iterator against the notif.
	 */
	if (!(zc->flags & IORING_RECVSEND_FIXED_BUF)) {
		iomsg->msg.sg_from_iter = io_sg_from_iter_iovec;
		return io_notif_account_mem(zc->notif, iomsg->msg.msg_iter.count);
	}
	iomsg->msg.sg_from_iter = io_sg_from_iter;
	return 0;
}
1407 
/*
 * sg_from_iter callback installed by io_send_zc_prep() for sends that do not
 * use a registered buffer: call skb_zcopy_downgrade_managed() on the skb and
 * then let zerocopy_fill_skb_from_iter() attach up to @length bytes from
 * @from. Returns whatever zerocopy_fill_skb_from_iter() returns.
 */
static int io_sg_from_iter_iovec(struct sk_buff *skb,
				 struct iov_iter *from, size_t length)
{
	int ret;

	skb_zcopy_downgrade_managed(skb);
	ret = zerocopy_fill_skb_from_iter(skb, from, length);

	return ret;
}
1414 
/*
 * sg_from_iter callback installed by io_send_zc_prep() for registered-buffer
 * sends. Attaches pages from the bvec iterator @from directly as skb frags,
 * up to @length bytes or until MAX_SKB_FRAGS is reached, then advances @from
 * and grows skb->len, skb->data_len and skb->truesize accordingly.
 *
 * An skb with no frags yet is marked SKBFL_MANAGED_FRAG_REFS; an skb that
 * already has frags but is not managed is handed to
 * zerocopy_fill_skb_from_iter() instead.
 *
 * Returns 0 if everything requested was attached, -EMSGSIZE if the frag
 * limit was hit first (the part that fit stays attached), or the fallback's
 * return value.
 */
static int io_sg_from_iter(struct sk_buff *skb,
			   struct iov_iter *from, size_t length)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	unsigned long truesize = 0;
	int nr = shinfo->nr_frags;
	struct bvec_iter iter;
	ssize_t bytes = 0;

	if (nr) {
		if (unlikely(!skb_zcopy_managed(skb)))
			return zerocopy_fill_skb_from_iter(skb, from, length);
	} else {
		shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
	}

	/* private cursor over from->bvec, committed back once we're done */
	iter.bi_idx = 0;
	iter.bi_bvec_done = from->iov_offset;
	iter.bi_size = min(from->count, length);

	while (iter.bi_size && nr < MAX_SKB_FRAGS) {
		struct bio_vec bv = mp_bvec_iter_bvec(from->bvec, iter);

		__skb_fill_page_desc_noacc(shinfo, nr, bv.bv_page,
					   bv.bv_offset, bv.bv_len);
		nr++;
		bytes += bv.bv_len;
		truesize += PAGE_ALIGN(bv.bv_len + bv.bv_offset);
		bvec_iter_advance_single(from->bvec, &iter, bv.bv_len);
	}

	/* commit the frag count and iterator progress */
	shinfo->nr_frags = nr;
	from->bvec += iter.bi_idx;
	from->nr_segs -= iter.bi_idx;
	from->count -= bytes;
	from->iov_offset = iter.bi_bvec_done;

	skb->data_len += bytes;
	skb->len += bytes;
	skb->truesize += truesize;

	/* bytes left over mean we ran out of frag slots */
	return iter.bi_size ? -EMSGSIZE : 0;
}
1457 
io_send_zc_import(struct io_kiocb * req,struct io_async_msghdr * kmsg,unsigned int issue_flags)1458 static int io_send_zc_import(struct io_kiocb *req,
1459 			     struct io_async_msghdr *kmsg,
1460 			     unsigned int issue_flags)
1461 {
1462 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1463 	struct io_kiocb *notif = sr->notif;
1464 	int ret;
1465 
1466 	WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF));
1467 
1468 	notif->buf_index = req->buf_index;
1469 
1470 	if (!(sr->flags & IORING_SEND_VECTORIZED)) {
1471 		ret = io_import_reg_buf(notif, &kmsg->msg.msg_iter,
1472 					(u64)(uintptr_t)sr->buf, sr->len,
1473 					ITER_SOURCE, issue_flags);
1474 	} else {
1475 		unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
1476 
1477 		ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter,
1478 					notif, &kmsg->vec, uvec_segs,
1479 					issue_flags);
1480 	}
1481 
1482 	if (unlikely(ret))
1483 		return ret;
1484 	req->flags &= ~REQ_F_IMPORT_BUFFER;
1485 	return 0;
1486 }
1487 
/*
 * Issue handler for zero-copy sends. Despite the name it serves both
 * IORING_OP_SEND_ZC (via sock_sendmsg()) and the sendmsg variant (via
 * __sys_sendmsg_sock()); see the opcode check below.
 *
 * Early returns:
 *  -ENOTSOCK    req->file is not a socket
 *  -EOPNOTSUPP  the socket does not have SOCK_SUPPORT_ZC set
 *  -EAGAIN      IORING_RECVSEND_POLL_FIRST is set and the request has not
 *               been polled yet; the send returned -EAGAIN during a
 *               nonblocking issue; or a partial send is to be continued
 *               (bytes so far are kept in sr->done_io)
 *  error from io_send_zc_import()
 *
 * Otherwise the result is set on the request with IORING_CQE_F_MORE and
 * IOU_COMPLETE is returned.
 */
int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned msg_flags;
	int ret, min_ret = 0;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;
	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
		return -EOPNOTSUPP;
	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	/* the registered buffer has not been imported yet */
	if (req->flags & REQ_F_IMPORT_BUFFER) {
		ret = io_send_zc_import(req, kmsg, issue_flags);
		if (unlikely(ret))
			return ret;
	}

	msg_flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		msg_flags |= MSG_DONTWAIT;
	if (msg_flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	/* attach the notification's uarg to the message */
	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;

	if (req->opcode == IORING_OP_SEND_ZC) {
		msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
		kmsg->msg.msg_flags = msg_flags;
		ret = sock_sendmsg(sock, &kmsg->msg);
	} else {
		kmsg->msg.msg_control_user = sr->msg_control;
		ret = __sys_sendmsg_sock(sock, &kmsg->msg, msg_flags);
	}

	/* an error, or fewer bytes than MSG_WAITALL asked for */
	if (unlikely(ret < min_ret)) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;

		/* partial progress: remember it and let the request continue */
		if (ret > 0 && io_net_retry(sock, sr->msg_flags)) {
			sr->done_io += ret;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}

	/* report bytes sent by earlier attempts as well */
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;

	/*
	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
	 * flushing notif to io_send_zc_cleanup()
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		io_notif_flush(sr->notif);
		sr->notif = NULL;
		io_req_msg_cleanup(req, 0);
	}
	/* NOTE(review): F_MORE presumably announces the notification CQE that follows - confirm */
	io_req_set_res(req, ret, IORING_CQE_F_MORE);
	return IOU_COMPLETE;
}
1558 
io_sendrecv_fail(struct io_kiocb * req)1559 void io_sendrecv_fail(struct io_kiocb *req)
1560 {
1561 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1562 
1563 	if (sr->done_io)
1564 		req->cqe.res = sr->done_io;
1565 
1566 	if ((req->flags & REQ_F_NEED_CLEANUP) &&
1567 	    (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC))
1568 		req->cqe.flags |= IORING_CQE_F_MORE;
1569 }
1570 
/*
 * sqe->ioprio bits understood by accept; io_accept_prep() returns -EINVAL
 * if any other bit is set.
 */
#define ACCEPT_FLAGS	(IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \
			 IORING_ACCEPT_POLL_FIRST)
1573 
io_accept_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)1574 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1575 {
1576 	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1577 
1578 	if (sqe->len || sqe->buf_index)
1579 		return -EINVAL;
1580 
1581 	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1582 	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
1583 	accept->flags = READ_ONCE(sqe->accept_flags);
1584 	accept->nofile = rlimit(RLIMIT_NOFILE);
1585 	accept->iou_flags = READ_ONCE(sqe->ioprio);
1586 	if (accept->iou_flags & ~ACCEPT_FLAGS)
1587 		return -EINVAL;
1588 
1589 	accept->file_slot = READ_ONCE(sqe->file_index);
1590 	if (accept->file_slot) {
1591 		if (accept->flags & SOCK_CLOEXEC)
1592 			return -EINVAL;
1593 		if (accept->iou_flags & IORING_ACCEPT_MULTISHOT &&
1594 		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
1595 			return -EINVAL;
1596 	}
1597 	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1598 		return -EINVAL;
1599 	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
1600 		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1601 	if (accept->iou_flags & IORING_ACCEPT_MULTISHOT)
1602 		req->flags |= REQ_F_APOLL_MULTISHOT;
1603 	if (accept->iou_flags & IORING_ACCEPT_DONTWAIT)
1604 		req->flags |= REQ_F_NOWAIT;
1605 	return 0;
1606 }
1607 
/*
 * Issue handler for accept, single-shot and multishot.
 *
 * The accepted file is installed either as a normal fd (reserved up front
 * and released again if do_accept() fails) or into a fixed file slot.
 *
 * Returns:
 *  -EAGAIN       IORING_ACCEPT_POLL_FIRST is set and the request has not
 *                been polled yet
 *  a negative fd reservation error from __get_unused_fd_flags()
 *  IOU_RETRY     do_accept() returned -EAGAIN during a nonblocking issue
 *                without IORING_ACCEPT_DONTWAIT, or a multishot CQE was
 *                posted and the socket reported itself empty
 *  IOU_COMPLETE  the result (fd, slot install result or error) was set on
 *                the request
 *
 * For multishot, after a successful accept whose CQE could be posted, the
 * accept is repeated right away if the socket reported more pending
 * connections or did not report its state at all.
 */
int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	bool fixed = !!accept->file_slot;
	struct proto_accept_arg arg = {
		.flags = force_nonblock ? O_NONBLOCK : 0,
	};
	struct file *file;
	unsigned cflags;
	int ret, fd;

	if (!(req->flags & REQ_F_POLLED) &&
	    accept->iou_flags & IORING_ACCEPT_POLL_FIRST)
		return -EAGAIN;

retry:
	/* a regular fd is reserved before accepting; fixed slots need none */
	if (!fixed) {
		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
		if (unlikely(fd < 0))
			return fd;
	}
	arg.err = 0;
	/* -1 is the "not reported" sentinel checked after the accept */
	arg.is_empty = -1;
	file = do_accept(req->file, &arg, accept->addr, accept->addr_len,
			 accept->flags);
	if (IS_ERR(file)) {
		/* give the reserved fd back */
		if (!fixed)
			put_unused_fd(fd);
		ret = PTR_ERR(file);
		if (ret == -EAGAIN && force_nonblock &&
		    !(accept->iou_flags & IORING_ACCEPT_DONTWAIT))
			return IOU_RETRY;

		if (ret == -ERESTARTSYS)
			ret = -EINTR;
	} else if (!fixed) {
		fd_install(fd, file);
		ret = fd;
	} else {
		ret = io_fixed_fd_install(req, issue_flags, file,
						accept->file_slot);
	}

	cflags = 0;
	/* is_empty == 0: flag the CQE so userspace knows the socket is non-empty */
	if (!arg.is_empty)
		cflags |= IORING_CQE_F_SOCK_NONEMPTY;

	/* multishot success: post an aux CQE and keep the request alive */
	if (ret >= 0 && (req->flags & REQ_F_APOLL_MULTISHOT) &&
	    io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1)
			goto retry;
		return IOU_RETRY;
	}

	/* single-shot, failure, or the aux CQE could not be posted */
	io_req_set_res(req, ret, cflags);
	if (ret < 0)
		req_set_fail(req);
	return IOU_COMPLETE;
}
1668 
io_socket_bpf_populate(struct io_uring_bpf_ctx * bctx,struct io_kiocb * req)1669 void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req)
1670 {
1671 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1672 
1673 	bctx->socket.family = sock->domain;
1674 	bctx->socket.type = sock->type;
1675 	bctx->socket.protocol = sock->protocol;
1676 }
1677 
io_socket_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)1678 int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1679 {
1680 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1681 
1682 	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
1683 		return -EINVAL;
1684 
1685 	sock->domain = READ_ONCE(sqe->fd);
1686 	sock->type = READ_ONCE(sqe->off);
1687 	sock->protocol = READ_ONCE(sqe->len);
1688 	sock->file_slot = READ_ONCE(sqe->file_index);
1689 	sock->nofile = rlimit(RLIMIT_NOFILE);
1690 
1691 	sock->flags = sock->type & ~SOCK_TYPE_MASK;
1692 	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
1693 		return -EINVAL;
1694 	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1695 		return -EINVAL;
1696 	return 0;
1697 }
1698 
io_socket(struct io_kiocb * req,unsigned int issue_flags)1699 int io_socket(struct io_kiocb *req, unsigned int issue_flags)
1700 {
1701 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1702 	bool fixed = !!sock->file_slot;
1703 	struct file *file;
1704 	int ret, fd;
1705 
1706 	if (!fixed) {
1707 		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
1708 		if (unlikely(fd < 0))
1709 			return fd;
1710 	}
1711 	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
1712 	if (IS_ERR(file)) {
1713 		if (!fixed)
1714 			put_unused_fd(fd);
1715 		ret = PTR_ERR(file);
1716 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1717 			return -EAGAIN;
1718 		if (ret == -ERESTARTSYS)
1719 			ret = -EINTR;
1720 		req_set_fail(req);
1721 	} else if (!fixed) {
1722 		fd_install(fd, file);
1723 		ret = fd;
1724 	} else {
1725 		ret = io_fixed_fd_install(req, issue_flags, file,
1726 					    sock->file_slot);
1727 	}
1728 	io_req_set_res(req, ret, 0);
1729 	return IOU_COMPLETE;
1730 }
1731 
io_connect_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)1732 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1733 {
1734 	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
1735 	struct io_async_msghdr *io;
1736 
1737 	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
1738 		return -EINVAL;
1739 
1740 	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1741 	conn->addr_len =  READ_ONCE(sqe->addr2);
1742 	conn->in_progress = conn->seen_econnaborted = false;
1743 
1744 	io = io_msg_alloc_async(req);
1745 	if (unlikely(!io))
1746 		return -ENOMEM;
1747 
1748 	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr);
1749 }
1750 
/*
 * Issue handler for connect, using the address copied in by
 * io_connect_prep().
 *
 * During a nonblocking issue, -EAGAIN and -EINPROGRESS from the connect, and
 * the first -ECONNABORTED, make this return -EAGAIN so the request is tried
 * again; -EINPROGRESS is remembered in connect->in_progress and
 * -ECONNABORTED in connect->seen_econnaborted. A second -ECONNABORTED
 * completes the request with that error.
 *
 * On a re-issue after -EINPROGRESS, a pending EPOLLERR, or -EBADFD/-EISCONN
 * from the repeated connect, makes the socket's own error (sock_error())
 * the result.
 *
 * Every other path completes the request: -ERESTARTSYS is reported as
 * -EINTR, a negative result marks the request failed, and IOU_COMPLETE is
 * returned.
 */
int io_connect(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
	struct io_async_msghdr *io = req->async_data;
	unsigned file_flags;
	int ret;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

	/* a previous attempt returned -EINPROGRESS: check for an error first */
	if (connect->in_progress) {
		struct poll_table_struct pt = { ._key = EPOLLERR };

		if (vfs_poll(req->file, &pt) & EPOLLERR)
			goto get_sock_err;
	}

	file_flags = force_nonblock ? O_NONBLOCK : 0;

	ret = __sys_connect_file(req->file, &io->addr, connect->addr_len,
				 file_flags);
	if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
	    && force_nonblock) {
		if (ret == -EINPROGRESS) {
			connect->in_progress = true;
		} else if (ret == -ECONNABORTED) {
			/* only retry -ECONNABORTED once */
			if (connect->seen_econnaborted)
				goto out;
			connect->seen_econnaborted = true;
		}
		return -EAGAIN;
	}
	if (connect->in_progress) {
		/*
		 * At least bluetooth will return -EBADFD on a re-connect
		 * attempt, and it's (supposedly) also valid to get -EISCONN
		 * which means the previous result is good. For both of these,
		 * grab the sock_error() and use that for the completion.
		 */
		if (ret == -EBADFD || ret == -EISCONN) {
get_sock_err:
			ret = sock_error(sock_from_file(req->file)->sk);
		}
	}
	if (ret == -ERESTARTSYS)
		ret = -EINTR;
out:
	if (ret < 0)
		req_set_fail(req);
	io_req_msg_cleanup(req, issue_flags);
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}
1802 
1803 /*
1804  * Check if bind request would potentially end up with filename_create(),
1805  * which in turn end up in mnt_want_write() which will grab the fs
1806  * percpu start write sem. This can trigger a lockdep warning.
1807  */
io_bind_file_create(const struct io_async_msghdr * io,int addr_len)1808 static int io_bind_file_create(const struct io_async_msghdr *io, int addr_len)
1809 {
1810 	const struct sockaddr_un *sun;
1811 
1812 	if (io->addr.ss_family != AF_UNIX)
1813 		return 0;
1814 	if (addr_len <= offsetof(struct sockaddr_un, sun_path))
1815 		return 0;
1816 	sun = (const struct sockaddr_un *) &io->addr;
1817 	return sun->sun_path[0] != '\0';
1818 }
1819 
io_bind_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)1820 int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1821 {
1822 	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
1823 	struct sockaddr __user *uaddr;
1824 	struct io_async_msghdr *io;
1825 	int ret;
1826 
1827 	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
1828 		return -EINVAL;
1829 
1830 	uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1831 	bind->addr_len =  READ_ONCE(sqe->addr2);
1832 
1833 	io = io_msg_alloc_async(req);
1834 	if (unlikely(!io))
1835 		return -ENOMEM;
1836 	ret = move_addr_to_kernel(uaddr, bind->addr_len, &io->addr);
1837 	if (unlikely(ret))
1838 		return ret;
1839 	if (io_bind_file_create(io, bind->addr_len))
1840 		req->flags |= REQ_F_FORCE_ASYNC;
1841 	return 0;
1842 }
1843 
io_bind(struct io_kiocb * req,unsigned int issue_flags)1844 int io_bind(struct io_kiocb *req, unsigned int issue_flags)
1845 {
1846 	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
1847 	struct io_async_msghdr *io = req->async_data;
1848 	struct socket *sock;
1849 	int ret;
1850 
1851 	sock = sock_from_file(req->file);
1852 	if (unlikely(!sock))
1853 		return -ENOTSOCK;
1854 
1855 	ret = __sys_bind_socket(sock, &io->addr, bind->addr_len);
1856 	if (ret < 0)
1857 		req_set_fail(req);
1858 	io_req_set_res(req, ret, 0);
1859 	return 0;
1860 }
1861 
io_listen_prep(struct io_kiocb * req,const struct io_uring_sqe * sqe)1862 int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1863 {
1864 	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
1865 
1866 	if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2)
1867 		return -EINVAL;
1868 
1869 	listen->backlog = READ_ONCE(sqe->len);
1870 	return 0;
1871 }
1872 
io_listen(struct io_kiocb * req,unsigned int issue_flags)1873 int io_listen(struct io_kiocb *req, unsigned int issue_flags)
1874 {
1875 	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
1876 	struct socket *sock;
1877 	int ret;
1878 
1879 	sock = sock_from_file(req->file);
1880 	if (unlikely(!sock))
1881 		return -ENOTSOCK;
1882 
1883 	ret = __sys_listen_socket(sock, listen->backlog);
1884 	if (ret < 0)
1885 		req_set_fail(req);
1886 	io_req_set_res(req, ret, 0);
1887 	return 0;
1888 }
1889 
io_netmsg_cache_free(const void * entry)1890 void io_netmsg_cache_free(const void *entry)
1891 {
1892 	struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
1893 
1894 	io_vec_free(&kmsg->vec);
1895 	kfree(kmsg);
1896 }
1897