// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/net.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "kbuf.h"
#include "alloc_cache.h"
#include "net.h"
#include "notif.h"
#include "rsrc.h"
#include "zcrx.h"

#if defined(CONFIG_NET)
struct io_shutdown {
	struct file *file;
	int how;
};

struct io_accept {
	struct file *file;
	struct sockaddr __user *addr;
	int __user *addr_len;
	int flags;
	int iou_flags;
	u32 file_slot;
	unsigned long nofile;
};

struct io_socket {
	struct file *file;
	int domain;
	int type;
	int protocol;
	int flags;
	u32 file_slot;
	unsigned long nofile;
};

struct io_connect {
	struct file *file;
	struct sockaddr __user *addr;
	int addr_len;
	bool in_progress;
	bool seen_econnaborted;
};

struct io_bind {
	struct file *file;
	int addr_len;
};

struct io_listen {
	struct file *file;
	int backlog;
};

struct io_sr_msg {
	struct file *file;
	union {
		struct compat_msghdr __user *umsg_compat;
		struct user_msghdr __user *umsg;
		void __user *buf;
	};
	int len;
	unsigned done_io;
	unsigned msg_flags;
	unsigned nr_multishot_loops;
	u16 flags;
	/* initialised and used only by !msg send variants */
	u16 buf_group;
	bool retry;
	void __user *msg_control;
	/* used only for send zerocopy */
	struct io_kiocb *notif;
};

/*
 * Number of times we'll try and do receives if there's more data. If we
 * exceed this limit, then add us to the back of the queue and retry from
 * there. This helps fairness between flooding clients.
 */
#define MULTISHOT_MAX_RETRY	32

struct io_recvzc {
	struct file *file;
	unsigned msg_flags;
	u16 flags;
	u32 len;
	struct io_zcrx_ifq *ifq;
};

static int io_sg_from_iter_iovec(struct sk_buff *skb,
				 struct iov_iter *from, size_t length);
static int io_sg_from_iter(struct sk_buff *skb,
			   struct iov_iter *from, size_t length);

int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);

	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
		     sqe->buf_index || sqe->splice_fd_in))
		return -EINVAL;

	shutdown->how = READ_ONCE(sqe->len);
	req->flags |= REQ_F_FORCE_ASYNC;
	return 0;
}

int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
	struct socket *sock;
	int ret;

	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = __sys_shutdown_sock(sock, shutdown->how);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

static bool io_net_retry(struct socket *sock, int flags)
{
	if (!(flags & MSG_WAITALL))
		return false;
	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
}

static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
{
	if (kmsg->vec.iovec)
		io_vec_free(&kmsg->vec);
}

static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_async_msghdr *hdr = req->async_data;

	/* can't recycle, ensure we free the iovec if we have one */
	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
		io_netmsg_iovec_free(hdr);
		return;
	}

	/* Let normal cleanup path reap it if we fail adding to the cache */
	io_alloc_cache_vec_kasan(&hdr->vec);
	if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP)
		io_vec_free(&hdr->vec);

	if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
		req->async_data = NULL;
		req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP);
	}
}

static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_async_msghdr *hdr;

	hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req);
	if (!hdr)
		return NULL;

	/* If the async data was cached, we might have an iov cached inside. */
	if (hdr->vec.iovec)
		req->flags |= REQ_F_NEED_CLEANUP;
	return hdr;
}

static inline void io_mshot_prep_retry(struct io_kiocb *req,
				       struct io_async_msghdr *kmsg)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	req->flags &= ~REQ_F_BL_EMPTY;
	sr->done_io = 0;
	sr->retry = false;
	sr->len = 0; /* get from the provided buffer */
	req->buf_index = sr->buf_group;
}

static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg,
			     const struct iovec __user *uiov, unsigned uvec_seg,
			     int ddir)
{
	struct iovec *iov;
	int ret, nr_segs;

	if (iomsg->vec.iovec) {
		nr_segs = iomsg->vec.nr;
		iov = iomsg->vec.iovec;
	} else {
		nr_segs = 1;
		iov = &iomsg->fast_iov;
	}

	ret = __import_iovec(ddir, uiov, uvec_seg, nr_segs, &iov,
			     &iomsg->msg.msg_iter, io_is_compat(req->ctx));
	if (unlikely(ret < 0))
		return ret;

	if (iov) {
		req->flags |= REQ_F_NEED_CLEANUP;
		io_vec_reset_iovec(&iomsg->vec, iov, iomsg->msg.msg_iter.nr_segs);
	}
	return 0;
}

static int io_compat_msg_copy_hdr(struct io_kiocb *req,
				  struct io_async_msghdr *iomsg,
				  struct compat_msghdr *msg, int ddir,
				  struct sockaddr __user **save_addr)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct compat_iovec __user *uiov;
	int ret;

	if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
		return -EFAULT;

	ret = __get_compat_msghdr(&iomsg->msg, msg, save_addr);
	if (ret)
		return ret;

	uiov = compat_ptr(msg->msg_iov);
	if (req->flags & REQ_F_BUFFER_SELECT) {
		if (msg->msg_iovlen == 0) {
			sr->len = 0;
		} else if (msg->msg_iovlen > 1) {
			return -EINVAL;
		} else {
			struct compat_iovec tmp_iov;

			if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov)))
				return -EFAULT;
			sr->len = tmp_iov.iov_len;
		}
	}
	return 0;
}

static int io_copy_msghdr_from_user(struct user_msghdr *msg,
				    struct user_msghdr __user *umsg)
{
	if (!user_access_begin(umsg, sizeof(*umsg)))
		return -EFAULT;
	unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end);
	unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end);
	unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end);
	unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end);
	unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end);
	unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end);
	user_access_end();
	return 0;
ua_end:
	user_access_end();
	return -EFAULT;
}

static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
			   struct user_msghdr *msg, int ddir,
			   struct sockaddr __user **save_addr)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct user_msghdr __user *umsg = sr->umsg;
	int ret;

	iomsg->msg.msg_name = &iomsg->addr;
	iomsg->msg.msg_iter.nr_segs = 0;

	if (io_is_compat(req->ctx)) {
		struct compat_msghdr cmsg;

		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ddir, save_addr);
		if (ret)
			return ret;

		memset(msg, 0, sizeof(*msg));
		msg->msg_namelen = cmsg.msg_namelen;
		msg->msg_controllen = cmsg.msg_controllen;
		msg->msg_iov = compat_ptr(cmsg.msg_iov);
		msg->msg_iovlen = cmsg.msg_iovlen;
		return 0;
	}

	ret = io_copy_msghdr_from_user(msg, umsg);
	if (unlikely(ret))
		return ret;

	msg->msg_flags = 0;
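	/*
	 * msg_flags is not copied from userspace by io_copy_msghdr_from_user()
	 * above, so it is cleared by hand before the header is handed on.
	 */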

	ret = __copy_msghdr(&iomsg->msg, msg, save_addr);
	if (ret)
		return ret;

	if (req->flags & REQ_F_BUFFER_SELECT) {
		if (msg->msg_iovlen == 0) {
			sr->len = 0;
		} else if (msg->msg_iovlen > 1) {
			return -EINVAL;
		} else {
			struct iovec __user *uiov = msg->msg_iov;
			struct iovec tmp_iov;

			if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov)))
				return -EFAULT;
			sr->len = tmp_iov.iov_len;
		}
	}
	return 0;
}

void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
{
	struct io_async_msghdr *io = req->async_data;

	io_netmsg_iovec_free(io);
}

static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	void __user *addr;
	u16 addr_len;
	int ret;

	sr->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));

	if (READ_ONCE(sqe->__pad3[0]))
		return -EINVAL;

	kmsg->msg.msg_name = NULL;
	kmsg->msg.msg_namelen = 0;
	kmsg->msg.msg_control = NULL;
	kmsg->msg.msg_controllen = 0;
	kmsg->msg.msg_ubuf = NULL;

	addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	addr_len = READ_ONCE(sqe->addr_len);
	if (addr) {
		ret = move_addr_to_kernel(addr, addr_len, &kmsg->addr);
		if (unlikely(ret < 0))
			return ret;
		kmsg->msg.msg_name = &kmsg->addr;
		kmsg->msg.msg_namelen = addr_len;
	}
	if (sr->flags & IORING_RECVSEND_FIXED_BUF)
		return 0;
	if (!io_do_buffer_select(req)) {
		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
				  &kmsg->msg.msg_iter);
		if (unlikely(ret < 0))
			return ret;
	}
	return 0;
}

static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct user_msghdr msg;
	int ret;

	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
	ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL);
	if (unlikely(ret))
		return ret;
	/* save msg_control as sys_sendmsg() overwrites it */
	sr->msg_control = kmsg->msg.msg_control_user;

	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
		kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen;
		return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov,
					 msg.msg_iovlen);
	}
	if (req->flags & REQ_F_BUFFER_SELECT)
		return 0;
	return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE);
}

#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE)

int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	sr->done_io = 0;
	sr->retry = false;
	sr->len = READ_ONCE(sqe->len);
	sr->flags = READ_ONCE(sqe->ioprio);
	if (sr->flags & ~SENDMSG_FLAGS)
		return -EINVAL;
	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
	if (sr->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	if (sr->flags & IORING_RECVSEND_BUNDLE) {
		if (req->opcode == IORING_OP_SENDMSG)
			return -EINVAL;
		if (!(req->flags & REQ_F_BUFFER_SELECT))
			return -EINVAL;
		sr->msg_flags |= MSG_WAITALL;
		sr->buf_group = req->buf_index;
		req->buf_list = NULL;
		req->flags |= REQ_F_MULTISHOT;
	}

	if (io_is_compat(req->ctx))
		sr->msg_flags |= MSG_CMSG_COMPAT;

	if (unlikely(!io_msg_alloc_async(req)))
		return -ENOMEM;
	if (req->opcode != IORING_OP_SENDMSG)
		return io_send_setup(req, sqe);
	if (unlikely(sqe->addr2 || sqe->file_index))
		return -EINVAL;
	return io_sendmsg_setup(req, sqe);
}

static void io_req_msg_cleanup(struct io_kiocb *req,
			       unsigned int issue_flags)
{
	io_netmsg_recycle(req, issue_flags);
}

/*
 * For bundle completions, we need to figure out how many segments we consumed.
 * A bundle could be using a single ITER_UBUF if that's all we mapped, or it
 * could be using an ITER_IOVEC. If the latter, then if we consumed all of
 * the segments, then it's a trivial question to answer. If we have residual
 * data in the iter, then loop the segments to figure out how much we
 * transferred.
 */
static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
{
	struct iovec *iov;
	int nbufs;

	/* no data is always zero segments, and a ubuf is always 1 segment */
	if (ret <= 0)
		return 0;
	if (iter_is_ubuf(&kmsg->msg.msg_iter))
		return 1;

	iov = kmsg->vec.iovec;
	if (!iov)
		iov = &kmsg->fast_iov;

	/* if all data was transferred, it's basic pointer math */
	if (!iov_iter_count(&kmsg->msg.msg_iter))
		return iter_iov(&kmsg->msg.msg_iter) - iov;

	/* short transfer, count segments */
	nbufs = 0;
	do {
		int this_len = min_t(int, iov[nbufs].iov_len, ret);

		nbufs++;
		ret -= this_len;
	} while (ret);

	return nbufs;
}

static inline bool io_send_finish(struct io_kiocb *req, int *ret,
				  struct io_async_msghdr *kmsg,
				  unsigned issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	bool bundle_finished = *ret <= 0;
	unsigned int cflags;

	if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
		cflags = io_put_kbuf(req, *ret, issue_flags);
		goto finish;
	}

	cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags);

	if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
		goto finish;

	/*
	 * Fill CQE for this receive and see if we should keep trying to
	 * receive from this socket.
	 */
	if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
		io_mshot_prep_retry(req, kmsg);
		return false;
	}

	/* Otherwise stop bundle and use the current result. */
finish:
	io_req_set_res(req, *ret, cflags);
	*ret = IOU_OK;
	return true;
}

int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	kmsg->msg.msg_control_user = sr->msg_control;

	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

	if (ret < min_ret) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;
		if (ret > 0 && io_net_retry(sock, flags)) {
			kmsg->msg.msg_controllen = 0;
			kmsg->msg.msg_control = NULL;
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	io_req_msg_cleanup(req, issue_flags);
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
				 struct io_async_msghdr *kmsg)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	int ret;
	struct buf_sel_arg arg = {
		.iovs = &kmsg->fast_iov,
		.max_len = min_not_zero(sr->len, INT_MAX),
		.nr_iovs = 1,
	};

	if (kmsg->vec.iovec) {
		arg.nr_iovs = kmsg->vec.nr;
		arg.iovs = kmsg->vec.iovec;
		arg.mode = KBUF_MODE_FREE;
	}

	if (!(sr->flags & IORING_RECVSEND_BUNDLE))
		arg.nr_iovs = 1;
	else
		arg.mode |= KBUF_MODE_EXPAND;

	ret = io_buffers_select(req, &arg, issue_flags);
	if (unlikely(ret < 0))
		return ret;

	if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
		kmsg->vec.nr = ret;
		kmsg->vec.iovec = arg.iovs;
		req->flags |= REQ_F_NEED_CLEANUP;
	}
	sr->len = arg.out_len;

	if (ret == 1) {
		sr->buf = arg.iovs[0].iov_base;
		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
				  &kmsg->msg.msg_iter);
		if (unlikely(ret))
			return ret;
	} else {
		iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE,
			      arg.iovs, ret, arg.out_len);
	}

	return 0;
}

int io_send(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;

retry_bundle:
	if (io_do_buffer_select(req)) {
		ret = io_send_select_buffer(req, issue_flags, kmsg);
		if (ret)
			return ret;
	}

	/*
	 * If MSG_WAITALL is set, or this is a bundle send, then we need
	 * the full amount. If just bundle is set, if we do a short send
	 * then we complete the bundle sequence rather than continue on.
	 */
	if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
	kmsg->msg.msg_flags = flags;
	ret = sock_sendmsg(sock, &kmsg->msg);
	if (ret < min_ret) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;

		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->len -= ret;
			sr->buf += ret;
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;

	if (!io_send_finish(req, &ret, kmsg, issue_flags))
		goto retry_bundle;

	io_req_msg_cleanup(req, issue_flags);
	return ret;
}

static int io_recvmsg_mshot_prep(struct io_kiocb *req,
				 struct io_async_msghdr *iomsg,
				 int namelen, size_t controllen)
{
	if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) ==
	    (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) {
		int hdr;

		if (unlikely(namelen < 0))
			return -EOVERFLOW;
		if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
				       namelen, &hdr))
			return -EOVERFLOW;
		if (check_add_overflow(hdr, controllen, &hdr))
			return -EOVERFLOW;

		iomsg->namelen = namelen;
		iomsg->controllen = controllen;
		return 0;
	}

	return 0;
}

static int io_recvmsg_copy_hdr(struct io_kiocb *req,
			       struct io_async_msghdr *iomsg)
{
	struct user_msghdr msg;
	int ret;

	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
	if (unlikely(ret))
		return ret;

	if (!(req->flags & REQ_F_BUFFER_SELECT)) {
		ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
					ITER_DEST);
		if (unlikely(ret))
			return ret;
	}
	return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
				     msg.msg_controllen);
}

static int io_recvmsg_prep_setup(struct io_kiocb *req)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg;
	int ret;

	kmsg = io_msg_alloc_async(req);
	if (unlikely(!kmsg))
		return -ENOMEM;

	if (req->opcode == IORING_OP_RECV) {
		kmsg->msg.msg_name = NULL;
		kmsg->msg.msg_namelen = 0;
		kmsg->msg.msg_inq = 0;
		kmsg->msg.msg_control = NULL;
		kmsg->msg.msg_get_inq = 1;
		kmsg->msg.msg_controllen = 0;
		kmsg->msg.msg_iocb = NULL;
		kmsg->msg.msg_ubuf = NULL;

		if (!io_do_buffer_select(req)) {
			ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
					  &kmsg->msg.msg_iter);
			if (unlikely(ret))
				return ret;
		}
		return 0;
	}

	return io_recvmsg_copy_hdr(req, kmsg);
}

#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \
			IORING_RECVSEND_BUNDLE)

int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	sr->done_io = 0;
	sr->retry = false;

	if (unlikely(sqe->file_index || sqe->addr2))
		return -EINVAL;

	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
	sr->len = READ_ONCE(sqe->len);
	sr->flags = READ_ONCE(sqe->ioprio);
	if (sr->flags & ~RECVMSG_FLAGS)
		return -EINVAL;
	sr->msg_flags = READ_ONCE(sqe->msg_flags);
	if (sr->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	if (sr->msg_flags & MSG_ERRQUEUE)
		req->flags |= REQ_F_CLEAR_POLLIN;
	if (req->flags & REQ_F_BUFFER_SELECT) {
		/*
		 * Store the buffer group for this multishot receive separately,
		 * as if we end up doing an io-wq based issue that selects a
		 * buffer, it has to be committed immediately and that will
		 * clear ->buf_list. This means we lose the link to the buffer
		 * list, and the eventual buffer put on completion then cannot
		 * restore it.
		 */
		sr->buf_group = req->buf_index;
		req->buf_list = NULL;
	}
	if (sr->flags & IORING_RECV_MULTISHOT) {
		if (!(req->flags & REQ_F_BUFFER_SELECT))
			return -EINVAL;
		if (sr->msg_flags & MSG_WAITALL)
			return -EINVAL;
		if (req->opcode == IORING_OP_RECV && sr->len)
			return -EINVAL;
		req->flags |= REQ_F_APOLL_MULTISHOT;
	}
	if (sr->flags & IORING_RECVSEND_BUNDLE) {
		if (req->opcode == IORING_OP_RECVMSG)
			return -EINVAL;
	}

	if (io_is_compat(req->ctx))
		sr->msg_flags |= MSG_CMSG_COMPAT;

	sr->nr_multishot_loops = 0;
	return io_recvmsg_prep_setup(req);
}

/* bits to clear in old and inherit in new cflags on bundle retry */
#define CQE_F_MASK (IORING_CQE_F_SOCK_NONEMPTY|IORING_CQE_F_MORE)

/*
 * Finishes io_recv and io_recvmsg.
 *
 * Returns true if it is actually finished, or false if it should run
 * again (for multishot).
 */
static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
				  struct io_async_msghdr *kmsg,
				  bool mshot_finished, unsigned issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	unsigned int cflags = 0;

	if (kmsg->msg.msg_inq > 0)
		cflags |= IORING_CQE_F_SOCK_NONEMPTY;

	if (sr->flags & IORING_RECVSEND_BUNDLE) {
		cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret),
				       issue_flags);
		if (sr->retry)
			cflags = req->cqe.flags | (cflags & CQE_F_MASK);
		/* bundle with no more immediate buffers, we're done */
		if (req->flags & REQ_F_BL_EMPTY)
			goto finish;
		/* if more is available, retry and append to this one */
		if (!sr->retry && kmsg->msg.msg_inq > 0 && *ret > 0) {
			req->cqe.flags = cflags & ~CQE_F_MASK;
			sr->len = kmsg->msg.msg_inq;
			sr->done_io += *ret;
			sr->retry = true;
			return false;
		}
	} else {
		cflags |= io_put_kbuf(req, *ret, issue_flags);
	}

	/*
	 * Fill CQE for this receive and see if we should keep trying to
	 * receive from this socket.
	 */
	if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
	    io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
		*ret = IOU_RETRY;
		io_mshot_prep_retry(req, kmsg);
		/* Known not-empty or unknown state, retry */
		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
				return false;
			/* mshot retries exceeded, force a requeue */
			sr->nr_multishot_loops = 0;
			if (issue_flags & IO_URING_F_MULTISHOT)
				*ret = IOU_REQUEUE;
		}
		return true;
	}

	/* Finish the request / stop multishot. */
finish:
	io_req_set_res(req, *ret, cflags);
	*ret = IOU_COMPLETE;
	io_req_msg_cleanup(req, issue_flags);
	return true;
}

static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
				     struct io_sr_msg *sr, void __user **buf,
				     size_t *len)
{
	unsigned long ubuf = (unsigned long) *buf;
	unsigned long hdr;

	hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
		kmsg->controllen;
	if (*len < hdr)
		return -EFAULT;

	if (kmsg->controllen) {
		unsigned long control = ubuf + hdr - kmsg->controllen;

		kmsg->msg.msg_control_user = (void __user *) control;
		kmsg->msg.msg_controllen = kmsg->controllen;
	}

	sr->buf = *buf; /* stash for later copy */
	*buf = (void __user *) (ubuf + hdr);
	kmsg->payloadlen = *len = *len - hdr;
	return 0;
}

struct io_recvmsg_multishot_hdr {
	struct io_uring_recvmsg_out msg;
	struct sockaddr_storage addr;
};

static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
				struct io_async_msghdr *kmsg,
				unsigned int flags, bool *finished)
{
	int err;
	int copy_len;
	struct io_recvmsg_multishot_hdr hdr;

	if (kmsg->namelen)
		kmsg->msg.msg_name = &hdr.addr;
	kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
	kmsg->msg.msg_namelen = 0;

	if (sock->file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;

	err = sock_recvmsg(sock, &kmsg->msg, flags);
	*finished = err <= 0;
	if (err < 0)
		return err;

	hdr.msg = (struct io_uring_recvmsg_out) {
		.controllen = kmsg->controllen - kmsg->msg.msg_controllen,
		.flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
	};

	hdr.msg.payloadlen = err;
	if (err > kmsg->payloadlen)
		err = kmsg->payloadlen;

	copy_len = sizeof(struct io_uring_recvmsg_out);
	if (kmsg->msg.msg_namelen > kmsg->namelen)
		copy_len += kmsg->namelen;
	else
		copy_len += kmsg->msg.msg_namelen;

	/*
	 * "fromlen shall refer to the value before truncation.."
	 *			1003.1g
	 */
	hdr.msg.namelen = kmsg->msg.msg_namelen;

	/* ensure that there is no gap between hdr and sockaddr_storage */
	BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
		     sizeof(struct io_uring_recvmsg_out));
	if (copy_to_user(io->buf, &hdr, copy_len)) {
		*finished = true;
		return -EFAULT;
	}

	return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
			kmsg->controllen + err;
}

int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	bool mshot_finished = true;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;

retry_multishot:
	if (io_do_buffer_select(req)) {
		void __user *buf;
		size_t len = sr->len;

		buf = io_buffer_select(req, &len, issue_flags);
		if (!buf)
			return -ENOBUFS;

		if (req->flags & REQ_F_APOLL_MULTISHOT) {
			ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len);
			if (ret) {
				io_kbuf_recycle(req, issue_flags);
				return ret;
			}
		}

		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len);
	}

	kmsg->msg.msg_get_inq = 1;
	kmsg->msg.msg_inq = -1;
	if (req->flags & REQ_F_APOLL_MULTISHOT) {
		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
					   &mshot_finished);
	} else {
		/* disable partial retry for recvmsg with cmsg attached */
		if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
			min_ret = iov_iter_count(&kmsg->msg.msg_iter);

		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
					 kmsg->uaddr, flags);
	}

	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock) {
			if (issue_flags & IO_URING_F_MULTISHOT)
				io_kbuf_recycle(req, issue_flags);

			return IOU_RETRY;
		}
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return IOU_RETRY;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
		req_set_fail(req);
	}

	if (ret > 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	else
		io_kbuf_recycle(req, issue_flags);

	if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags))
		goto retry_multishot;

	return ret;
}

static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
			      size_t *len, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	int ret;

	/*
	 * If the ring isn't locked, then don't use the peek interface
	 * to grab multiple buffers as we will lock/unlock between
	 * this selection and posting the buffers.
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED) &&
	    sr->flags & IORING_RECVSEND_BUNDLE) {
		struct buf_sel_arg arg = {
			.iovs = &kmsg->fast_iov,
			.nr_iovs = 1,
			.mode = KBUF_MODE_EXPAND,
		};

		if (kmsg->vec.iovec) {
			arg.nr_iovs = kmsg->vec.nr;
			arg.iovs = kmsg->vec.iovec;
			arg.mode |= KBUF_MODE_FREE;
		}

		if (kmsg->msg.msg_inq > 0)
			arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq);

		ret = io_buffers_peek(req, &arg);
		if (unlikely(ret < 0))
			return ret;

		/* special case 1 vec, can be a fast path */
		if (ret == 1) {
			sr->buf = arg.iovs[0].iov_base;
			sr->len = arg.iovs[0].iov_len;
			goto map_ubuf;
		}
		iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
			      arg.out_len);
		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
			kmsg->vec.nr = ret;
			kmsg->vec.iovec = arg.iovs;
			req->flags |= REQ_F_NEED_CLEANUP;
		}
	} else {
		void __user *buf;

		*len = sr->len;
		buf = io_buffer_select(req, len, issue_flags);
		if (!buf)
			return -ENOBUFS;
		sr->buf = buf;
		sr->len = *len;
map_ubuf:
		ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
				  &kmsg->msg.msg_iter);
		if (unlikely(ret))
			return ret;
	}

	return 0;
}

int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	size_t len = sr->len;
	bool mshot_finished;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	flags = sr->msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;

retry_multishot:
	if (io_do_buffer_select(req)) {
		ret = io_recv_buf_select(req, kmsg, &len, issue_flags);
		if (unlikely(ret)) {
			kmsg->msg.msg_inq = -1;
			goto out_free;
		}
		sr->buf = NULL;
	}

	kmsg->msg.msg_flags = 0;
	kmsg->msg.msg_inq = -1;

	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	ret = sock_recvmsg(sock, &kmsg->msg, flags);
	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock) {
			if (issue_flags & IO_URING_F_MULTISHOT)
				io_kbuf_recycle(req, issue_flags);

			return IOU_RETRY;
		}
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->len -= ret;
			sr->buf += ret;
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
		req_set_fail(req);
	}

	mshot_finished = ret <= 0;
	if (ret > 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	else
		io_kbuf_recycle(req, issue_flags);

	if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags))
		goto retry_multishot;

	return ret;
}

int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
	unsigned ifq_idx;

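	/* recvzc carries no address or fixed file slot; stray sqe fields are rejected */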
	if (unlikely(sqe->file_index || sqe->addr2 || sqe->addr ||
		     sqe->addr3))
		return -EINVAL;

	ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx);
	if (ifq_idx != 0)
		return -EINVAL;
	zc->ifq = req->ctx->ifq;
	if (!zc->ifq)
		return -EINVAL;
	zc->len = READ_ONCE(sqe->len);
	zc->flags = READ_ONCE(sqe->ioprio);
	zc->msg_flags = READ_ONCE(sqe->msg_flags);
	if (zc->msg_flags)
		return -EINVAL;
	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT))
		return -EINVAL;
	/* multishot required */
	if (!(zc->flags & IORING_RECV_MULTISHOT))
		return -EINVAL;
	/* All data completions are posted as aux CQEs. */
	req->flags |= REQ_F_APOLL_MULTISHOT;

	return 0;
}

int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
	struct socket *sock;
	unsigned int len;
	int ret;

	if (!(req->flags & REQ_F_POLLED) &&
	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	len = zc->len;
	ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT,
			   issue_flags, &zc->len);
	if (len && zc->len == 0) {
		io_req_set_res(req, 0, 0);

		return IOU_COMPLETE;
	}
	if (unlikely(ret <= 0) && ret != -EAGAIN) {
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		if (ret == IOU_REQUEUE)
			return IOU_REQUEUE;

		req_set_fail(req);
		io_req_set_res(req, ret, 0);
		return IOU_COMPLETE;
	}
	return IOU_RETRY;
}

void io_send_zc_cleanup(struct io_kiocb *req)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *io = req->async_data;

	if (req_has_async_data(req))
		io_netmsg_iovec_free(io);
	if (zc->notif) {
		io_notif_flush(zc->notif);
		zc->notif = NULL;
	}
}

#define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
#define IO_ZC_FLAGS_VALID  (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE)

int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_async_msghdr *iomsg;
	struct io_kiocb *notif;
	int ret;

	zc->done_io = 0;
	zc->retry = false;

	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
		return -EINVAL;
	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
	if (req->flags & REQ_F_CQE_SKIP)
		return -EINVAL;

	notif = zc->notif = io_alloc_notif(ctx);
	if (!notif)
		return -ENOMEM;
	notif->cqe.user_data = req->cqe.user_data;
	notif->cqe.res = 0;
	notif->cqe.flags = IORING_CQE_F_NOTIF;
	req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY;

	zc->flags = READ_ONCE(sqe->ioprio);
	if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
		if (zc->flags & ~IO_ZC_FLAGS_VALID)
			return -EINVAL;
		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
			struct io_notif_data *nd = io_notif_to_data(notif);

			nd->zc_report = true;
			nd->zc_used = false;
			nd->zc_copied = false;
		}
	}

	zc->len = READ_ONCE(sqe->len);
	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
	req->buf_index = READ_ONCE(sqe->buf_index);
	if (zc->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;

	if (io_is_compat(req->ctx))
		zc->msg_flags |= MSG_CMSG_COMPAT;

	iomsg = io_msg_alloc_async(req);
	if (unlikely(!iomsg))
		return -ENOMEM;

	if (req->opcode == IORING_OP_SEND_ZC) {
		if (zc->flags & IORING_RECVSEND_FIXED_BUF)
			req->flags |= REQ_F_IMPORT_BUFFER;
		ret = io_send_setup(req, sqe);
	} else {
		if (unlikely(sqe->addr2 || sqe->file_index))
			return -EINVAL;
		ret = io_sendmsg_setup(req, sqe);
	}
	if (unlikely(ret))
		return ret;

	if (!(zc->flags & IORING_RECVSEND_FIXED_BUF)) {
		iomsg->msg.sg_from_iter = io_sg_from_iter_iovec;
		return io_notif_account_mem(zc->notif, iomsg->msg.msg_iter.count);
	}
	iomsg->msg.sg_from_iter = io_sg_from_iter;
	return 0;
}

static int io_sg_from_iter_iovec(struct sk_buff *skb,
				 struct iov_iter *from, size_t length)
{
	skb_zcopy_downgrade_managed(skb);
	return zerocopy_fill_skb_from_iter(skb, from, length);
}

static int io_sg_from_iter(struct sk_buff *skb,
			   struct iov_iter *from, size_t length)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int frag = shinfo->nr_frags;
	int ret = 0;
	struct bvec_iter bi;
	ssize_t copied = 0;
	unsigned long truesize = 0;

	if (!frag)
		shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
	else if (unlikely(!skb_zcopy_managed(skb)))
		return zerocopy_fill_skb_from_iter(skb, from, length);

	bi.bi_size = min(from->count, length);
	bi.bi_bvec_done = from->iov_offset;
	bi.bi_idx = 0;

	while (bi.bi_size && frag < MAX_SKB_FRAGS) {
		struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);

		copied += v.bv_len;
		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
		__skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
					   v.bv_offset, v.bv_len);
		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
	}
	if (bi.bi_size)
		ret = -EMSGSIZE;

	shinfo->nr_frags = frag;
	from->bvec += bi.bi_idx;
	from->nr_segs -= bi.bi_idx;
	from->count -= copied;
	from->iov_offset = bi.bi_bvec_done;

	skb->data_len += copied;
	skb->len += copied;
	skb->truesize += truesize;
	return ret;
}

static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;

	WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF));

	sr->notif->buf_index = req->buf_index;
	return io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter,
				 (u64)(uintptr_t)sr->buf, sr->len,
				 ITER_SOURCE, issue_flags);
}

int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned msg_flags;
	int ret, min_ret = 0;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;
	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
		return -EOPNOTSUPP;

	if (!(req->flags & REQ_F_POLLED) &&
	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	if (req->flags & REQ_F_IMPORT_BUFFER) {
		req->flags &= ~REQ_F_IMPORT_BUFFER;
		ret = io_send_zc_import(req, issue_flags);
		if (unlikely(ret))
			return ret;
	}

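	/*
	 * As in the non-zerocopy send path: honour a nonblocking issue via
	 * MSG_DONTWAIT and, for MSG_WAITALL, require the full iter length
	 * before treating the send as complete.
	 */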
	msg_flags = zc->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		msg_flags |= MSG_DONTWAIT;
	if (msg_flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
	msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;

	kmsg->msg.msg_flags = msg_flags;
	kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
	ret = sock_sendmsg(sock, &kmsg->msg);

	if (unlikely(ret < min_ret)) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;

		if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) {
			zc->len -= ret;
			zc->buf += ret;
			zc->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}

	if (ret >= 0)
		ret += zc->done_io;
	else if (zc->done_io)
		ret = zc->done_io;

	/*
	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
	 * flushing notif to io_send_zc_cleanup()
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		io_notif_flush(zc->notif);
		zc->notif = NULL;
		io_req_msg_cleanup(req, 0);
	}
	io_req_set_res(req, ret, IORING_CQE_F_MORE);
	return IOU_OK;
}

int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;

	if (req->flags & REQ_F_IMPORT_BUFFER) {
		unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
		int ret;

		ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, req,
					&kmsg->vec, uvec_segs, issue_flags);
		if (unlikely(ret))
			return ret;
		req->flags &= ~REQ_F_IMPORT_BUFFER;
	}

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;
	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
		return -EOPNOTSUPP;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	kmsg->msg.msg_control_user = sr->msg_control;
	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

	if (unlikely(ret < min_ret)) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;

		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}

	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;

	/*
	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
	 * flushing notif to io_send_zc_cleanup()
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		io_notif_flush(sr->notif);
		sr->notif = NULL;
		io_req_msg_cleanup(req, 0);
	}
	io_req_set_res(req, ret, IORING_CQE_F_MORE);
	return IOU_OK;
}

void io_sendrecv_fail(struct io_kiocb *req)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	if (sr->done_io)
		req->cqe.res = sr->done_io;

	if ((req->flags & REQ_F_NEED_CLEANUP) &&
	    (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC))
		req->cqe.flags |= IORING_CQE_F_MORE;
}

#define ACCEPT_FLAGS (IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \
		      IORING_ACCEPT_POLL_FIRST)

int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);

	if (sqe->len || sqe->buf_index)
		return -EINVAL;

	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	accept->flags = READ_ONCE(sqe->accept_flags);
	accept->nofile = rlimit(RLIMIT_NOFILE);
	accept->iou_flags = READ_ONCE(sqe->ioprio);
	if (accept->iou_flags & ~ACCEPT_FLAGS)
		return -EINVAL;

	accept->file_slot = READ_ONCE(sqe->file_index);
	if (accept->file_slot) {
		if (accept->flags & SOCK_CLOEXEC)
			return -EINVAL;
		if (accept->iou_flags & IORING_ACCEPT_MULTISHOT &&
		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
			return -EINVAL;
	}
	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
	if (accept->iou_flags & IORING_ACCEPT_MULTISHOT)
		req->flags |= REQ_F_APOLL_MULTISHOT;
	if (accept->iou_flags & IORING_ACCEPT_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	return 0;
}

int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	bool fixed = !!accept->file_slot;
	struct proto_accept_arg arg = {
		.flags = force_nonblock ? O_NONBLOCK : 0,
	};
	struct file *file;
	unsigned cflags;
	int ret, fd;

	if (!(req->flags & REQ_F_POLLED) &&
	    accept->iou_flags & IORING_ACCEPT_POLL_FIRST)
		return -EAGAIN;

retry:
	if (!fixed) {
		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
		if (unlikely(fd < 0))
			return fd;
	}
	arg.err = 0;
	arg.is_empty = -1;
	file = do_accept(req->file, &arg, accept->addr, accept->addr_len,
			 accept->flags);
	if (IS_ERR(file)) {
		if (!fixed)
			put_unused_fd(fd);
		ret = PTR_ERR(file);
		if (ret == -EAGAIN && force_nonblock &&
		    !(accept->iou_flags & IORING_ACCEPT_DONTWAIT))
			return IOU_RETRY;

		if (ret == -ERESTARTSYS)
			ret = -EINTR;
	} else if (!fixed) {
		fd_install(fd, file);
		ret = fd;
	} else {
		ret = io_fixed_fd_install(req, issue_flags, file,
					  accept->file_slot);
	}

	cflags = 0;
	if (!arg.is_empty)
		cflags |= IORING_CQE_F_SOCK_NONEMPTY;

	if (ret >= 0 && (req->flags & REQ_F_APOLL_MULTISHOT) &&
	    io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1)
			goto retry;
		return IOU_RETRY;
	}

	io_req_set_res(req, ret, cflags);
	if (ret < 0)
		req_set_fail(req);
	return IOU_COMPLETE;
}

int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);

	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
		return -EINVAL;

	sock->domain = READ_ONCE(sqe->fd);
	sock->type = READ_ONCE(sqe->off);
	sock->protocol = READ_ONCE(sqe->len);
	sock->file_slot = READ_ONCE(sqe->file_index);
	sock->nofile = rlimit(RLIMIT_NOFILE);

	sock->flags = sock->type & ~SOCK_TYPE_MASK;
	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
		return -EINVAL;
	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	return 0;
}

int io_socket(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
	bool fixed = !!sock->file_slot;
	struct file *file;
	int ret, fd;

	if (!fixed) {
		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
		if (unlikely(fd < 0))
			return fd;
	}
	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
	if (IS_ERR(file)) {
		if (!fixed)
			put_unused_fd(fd);
		ret = PTR_ERR(file);
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if (!fixed) {
		fd_install(fd, file);
		ret = fd;
	} else {
		ret = io_fixed_fd_install(req, issue_flags, file,
					  sock->file_slot);
	}
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
	struct io_async_msghdr *io;

	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	conn->addr_len = READ_ONCE(sqe->addr2);
	conn->in_progress = conn->seen_econnaborted = false;

	io = io_msg_alloc_async(req);
	if (unlikely(!io))
		return -ENOMEM;

	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr);
}

int io_connect(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
	struct io_async_msghdr *io = req->async_data;
	unsigned file_flags;
	int ret;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

	if (unlikely(req->flags & REQ_F_FAIL)) {
		ret = -ECONNRESET;
		goto out;
	}

	file_flags = force_nonblock ? O_NONBLOCK : 0;

	ret = __sys_connect_file(req->file, &io->addr, connect->addr_len,
				 file_flags);
	if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
	    && force_nonblock) {
		if (ret == -EINPROGRESS) {
			connect->in_progress = true;
		} else if (ret == -ECONNABORTED) {
			if (connect->seen_econnaborted)
				goto out;
			connect->seen_econnaborted = true;
		}
		return -EAGAIN;
	}
	if (connect->in_progress) {
		/*
		 * At least bluetooth will return -EBADFD on a re-connect
		 * attempt, and it's (supposedly) also valid to get -EISCONN
		 * which means the previous result is good. For both of these,
		 * grab the sock_error() and use that for the completion.
		 */
		if (ret == -EBADFD || ret == -EISCONN)
			ret = sock_error(sock_from_file(req->file)->sk);
	}
	if (ret == -ERESTARTSYS)
		ret = -EINTR;
out:
	if (ret < 0)
		req_set_fail(req);
	io_req_msg_cleanup(req, issue_flags);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
	struct sockaddr __user *uaddr;
	struct io_async_msghdr *io;

	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	bind->addr_len = READ_ONCE(sqe->addr2);

	io = io_msg_alloc_async(req);
	if (unlikely(!io))
		return -ENOMEM;
	return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr);
}

int io_bind(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
	struct io_async_msghdr *io = req->async_data;
	struct socket *sock;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = __sys_bind_socket(sock, &io->addr, bind->addr_len);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return 0;
}

int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);

	if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2)
		return -EINVAL;

	listen->backlog = READ_ONCE(sqe->len);
	return 0;
}

int io_listen(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
	struct socket *sock;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = __sys_listen_socket(sock, listen->backlog);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return 0;
}

void io_netmsg_cache_free(const void *entry)
{
	struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;

	io_vec_free(&kmsg->vec);
	kfree(kmsg);
}
#endif